%%% BibTeX bibliography taco.bib

%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.88",
%%%     date            = "23 February 2024",
%%%     time            = "16:31:32 MST",
%%%     filename        = "taco.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "29872 40814 218627 2086875",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Architecture and Code
%%%                        Optimization; bibliography; TACO",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Architecture and Code
%%%                        Optimization (CODEN ????, ISSN 1544-3566
%%%                        (print), 1544-3973 (electronic)), covering
%%%                        all journal issues from 2004 -- date.
%%%
%%%                        At version 1.88, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2004 (  17)    2011 (  17)    2018 (  39)
%%%                             2005 (  17)    2012 (  61)    2019 (  63)
%%%                             2006 (  19)    2013 ( 103)    2020 (  47)
%%%                             2007 (  19)    2014 (  34)    2021 (  56)
%%%                             2008 (  21)    2015 (  66)    2022 (  61)
%%%                             2009 (  20)    2016 (  91)    2023 (  62)
%%%                             2010 (  21)    2017 (  55)    2024 (  20)
%%%
%%%                             Article:        909
%%%
%%%                             Total entries:  909
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://www.acm.org/pubs/taco.html
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/taco/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J924
%%%                            https://dl.acm.org/loi/taco
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================

%%% LaTeX code that BibTeX copies into the generated bibliography: it
%%% loads bibnames.sty and supplies a fallback definition of \TM (a
%%% superscript small-caps ``TM'').  The space after the file name ends
%%% the \input argument explicitly, and the \ifx guard leaves alone any
%%% \TM that the document has already defined.
@Preamble{"\input bibnames.sty " #
    "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% Contact details of the bibliography maintainer; entries reference
%%% this macro from their acknowledgement field.
@String{ack-nhfb = {Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|}}

%%% ====================================================================
%%% Journal abbreviations:

%%% Full journal name; entries reference this macro from their journal
%%% field.
@String{j-TACO                  = {ACM Transactions on Architecture and
                                  Code Optimization}}

%%% ====================================================================
%%% Bibliography entries:

%%% Article in TACO 1(1), March 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Calder:2004:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(1), March 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Zhang:2004:RIC,
  author =       "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir
                 and N. Vijaykrishnan and M. J. Irwin",
  title =        "Reducing instruction cache energy consumption using a
                 compiler-based strategy",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "3--33",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(1), March 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Isailovic:2004:DCQ,
  author =       "Nemanja Isailovic and Mark Whitney and Yatish Patel
                 and John Kubiatowicz and Dean Copsey and Frederic T.
                 Chong and Isaac L. Chuang and Mark Oskin",
  title =        "Datapath and control for quantum wires",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "34--61",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(1), March 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Sankaralingam:2004:TPA,
  author =       "Karthikeyan Sankaralingam and Ramadass Nagarajan and
                 Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya
                 Ranganathan and Doug Burger and Stephen W. Keckler and
                 Robert G. McDonald and Charles R. Moore",
  title =        "{TRIPS}: a polymorphous architecture for exploiting
                 {ILP}, {TLP}, and {DLP}",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "62--93",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(1), March 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Skadron:2004:TAM,
  author =       "Kevin Skadron and Mircea R. Stan and Karthik
                 Sankaranarayanan and Wei Huang and Sivakumar Velusamy
                 and David Tarjan",
  title =        "Temperature-aware microarchitecture: {Modeling} and
                 implementation",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "94--125",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(2), June 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Aleta:2004:RCC,
  author =       "Alex Alet{\`a} and Josep M. Codina and Antonio
                 Gonz{\'a}lez and David Kaeli",
  title =        "Removing communications in clustered
                 microarchitectures through instruction replication",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "127--151",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(2), June 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Bai:2004:LPO,
  author =       "Yu Bai and R. Iris Bahar",
  title =        "A low-power in-order\slash out-of-order issue queue",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "152--179",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(2), June 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Juang:2004:IBP,
  author =       "Philo Juang and Kevin Skadron and Margaret Martonosi
                 and Zhigang Hu and Douglas W. Clark and Philip W.
                 Diodato and Stefanos Kaxiras",
  title =        "Implementing branch-predictor decay using quasi-static
                 memory cells",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "180--219",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(2), June 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Santana:2004:LCF,
  author =       "Oliverio J. Santana and Alex Ramirez and Josep L.
                 Larriba-Pey and Mateo Valero",
  title =        "A low-complexity fetch architecture for
                 high-performance superscalar processors",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "220--245",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(3), September 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Lin:2004:CFS,
  author =       "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung
                 Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun
                 Chan",
  title =        "A compiler framework for speculative optimizations",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "247--271",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(3), September 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Fields:2004:ICS,
  author =       "Brian A. Fields and Rastislav Bodik and Mark D. Hill
                 and Chris J. Newburn",
  title =        "Interaction cost and shotgun profiling",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "272--304",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(3), September 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Sankaranarayanan:2004:PBA,
  author =       "Karthik Sankaranarayanan and Kevin Skadron",
  title =        "Profile-based adaptation for cache decay",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "305--322",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(3), September 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Xie:2004:IDV,
  author =       "Fen Xie and Margaret Martonosi and Sharad Malik",
  title =        "Intraprogram dynamic voltage scaling: {Bounding}
                 opportunities with analytic modeling",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "323--367",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(4), December 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Hartstein:2004:OPD,
  author =       "A. Hartstein and Thomas R. Puzak",
  title =        "The optimum pipeline depth considering both power and
                 performance",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "369--388",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(4), December 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Cristal:2004:TKI,
  author =       "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo
                 Valero and Jos{\'e} F. Mart{\'\i}nez",
  title =        "Toward kilo-instruction processors",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "389--417",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(4), December 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Akkary:2004:ARE,
  author =       "Haitham Akkary and Ravi Rajwar and Srikanth T.
                 Srinivasan",
  title =        "An analysis of a resource efficient checkpoint
                 architecture",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "418--444",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 1(4), December 2004.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Yang:2004:TML,
  author =       "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng
                 and Chien-Hao Lee",
  title =        "Tolerating memory latency through push prefetching for
                 pointer-intensive applications",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "445--475",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(1), March 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Calder:2005:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(1), March 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Zhou:2005:EFA,
  author =       "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu
                 and Josep Torrellas",
  title =        "Efficient and flexible architectural support for
                 dynamic monitoring",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "3--33",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(1), March 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Zhang:2005:WHC,
  author =       "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid
                 Najjar",
  title =        "A way-halting cache for low-energy high-performance
                 systems",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "34--54",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(1), March 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Abella:2005:ISP,
  author =       "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera
                 and Michael F. P. O'Boyle",
  title =        "{IATAC}: a smart predictor to turn-off {L2} cache
                 lines",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "55--77",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(1), March 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Haskins:2005:AWS,
  author =       "John W. {Haskins, Jr.} and Kevin Skadron",
  title =        "Accelerated warmup for sampled microarchitecture
                 simulation",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "78--108",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(2), June 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Li:2005:ABT,
  author =       "Tao Li and Ravi Bhargava and Lizy Kurian John",
  title =        "Adapting branch-target buffer to improve the target
                 predictability of {Java} code",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "109--130",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(2), June 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Zhang:2005:DIE,
  author =       "Lingli Zhang and Chandra Krintz",
  title =        "The design, implementation, and evaluation of adaptive
                 code unloading for resource-constrained devices",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "131--164",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(2), June 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Kulkarni:2005:FES,
  author =       "Prasad A. Kulkarni and Stephen R. Hines and David B.
                 Whalley and Jason D. Hiser and Jack W. Davidson and
                 Douglas L. Jones",
  title =        "Fast and efficient searches for effective
                 optimization-phase sequences",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "165--198",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(2), June 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.  The control space
%%% after ``vs.'' in the title keeps TeX from treating that period as
%%% the end of a sentence.
@Article{Salami:2005:DMI,
  author =       "Esther Salam{\'\i} and Mateo Valero",
  title =        "Dynamic memory interval test vs.\ interprocedural
                 pointer analysis in multimedia applications",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "199--219",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(3), September 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Meng:2005:ELL,
  author =       "Yan Meng and Timothy Sherwood and Ryan Kastner",
  title =        "Exploring the limits of leakage power reduction in
                 caches",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "221--246",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(3), September 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Garzaran:2005:TBS,
  author =       "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic
                 and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor
                 Vi{\~n}als and Lawrence Rauchwerger and Josep
                 Torrellas",
  title =        "Tradeoffs in buffering speculative memory state for
                 thread-level speculation in multiprocessors",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "247--279",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% Article in TACO 2(3), September 2005.  journal-URL is the ACM Digital
%%% Library list-of-issues page for the journal.
@Article{Tarjan:2005:MPG,
  author =       "David Tarjan and Kevin Skadron",
  title =        "Merging path and gshare indexing in perceptron branch
                 prediction",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "280--300",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Zhang:2005:WET,
  author          = {Xiangyu Zhang and
                     Rajiv Gupta},
  title           = {Whole execution traces and their applications},
  journal         = j-TACO,
  year            = {2005},
  month           = sep,
  volume          = {2},
  number          = {3},
  pages           = {301--334},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhao:2005:IWA,
  author          = {Wankang Zhao and
                     David Whalley and
                     Christopher Healy and
                     Frank Mueller},
  title           = {Improving {WCET} by applying a {WC}
                     code-positioning optimization},
  journal         = j-TACO,
  year            = {2005},
  month           = dec,
  volume          = {2},
  number          = {4},
  pages           = {335--365},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  keywords        = {WC (worst case); WCET (worst case execution time)},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Reis:2005:SCF,
  author          = {George A. Reis and
                     Jonathan Chang and
                     Neil Vachharajani and
                     Ram Rangan and
                     David I. August and
                     Shubhendu S. Mukherjee},
  title           = {Software-controlled fault tolerance},
  journal         = j-TACO,
  year            = {2005},
  month           = dec,
  volume          = {2},
  number          = {4},
  pages           = {366--396},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Li:2005:PPC,
  author          = {Jian Li and
                     Jos{\'e} F. Mart{\'\i}nez},
  title           = {Power-performance considerations of parallel
                     computing on chip multiprocessors},
  journal         = j-TACO,
  year            = {2005},
  month           = dec,
  volume          = {2},
  number          = {4},
  pages           = {397--422},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Sharma:2005:SPE,
  author          = {Saurabh Sharma and
                     Jesse G. Beu and
                     Thomas M. Conte},
  title           = {Spectral prefetcher: {An} effective mechanism for
                     {L2} cache prefetching},
  journal         = j-TACO,
  year            = {2005},
  month           = dec,
  volume          = {2},
  number          = {4},
  pages           = {423--450},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Calder:2006:I,
  author          = {Brad Calder and
                     Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  year            = {2006},
  month           = mar,
  volume          = {3},
  number          = {1},
  pages           = {1--2},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Tan:2006:BSS,
  author          = {Lin Tan and
                     Brett Brotherton and
                     Timothy Sherwood},
  title           = {Bit-split string-matching engines for
                     intrusion detection and prevention},
  journal         = j-TACO,
  year            = {2006},
  month           = mar,
  volume          = {3},
  number          = {1},
  pages           = {3--34},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Nagpurkar:2006:ERP,
  author          = {Priya Nagpurkar and
                     Hussam Mousa and
                     Chandra Krintz and
                     Timothy Sherwood},
  title           = {Efficient remote profiling for
                     resource-constrained devices},
  journal         = j-TACO,
  year            = {2006},
  month           = mar,
  volume          = {3},
  number          = {1},
  pages           = {35--66},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Lin:2006:RCG,
  author          = {Jin Lin and
                     Wei-Chung Hsu and
                     Pen-Chung Yew and
                     Roy Dz-Ching Ju and
                     Tin-Fook Ngai},
  title           = {Recovery code generation for general
                     speculative optimizations},
  journal         = j-TACO,
  year            = {2006},
  month           = mar,
  volume          = {3},
  number          = {1},
  pages           = {67--89},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Choi:2006:ORR,
  author          = {Yoonseo Choi and
                     Hwansoo Han},
  title           = {Optimal register reassignment for register
                     stack overflow minimization},
  journal         = j-TACO,
  year            = {2006},
  month           = mar,
  volume          = {3},
  number          = {1},
  pages           = {90--114},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Xue:2006:LOA,
  author          = {Jingling Xue and
                     Qiong Cai},
  title           = {A lifetime optimal algorithm for speculative {PRE}},
  journal         = j-TACO,
  year            = {2006},
  month           = jun,
  volume          = {3},
  number          = {2},
  pages           = {115--155},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Sharkey:2006:IPT,
  author          = {Joseph J. Sharkey and
                     Dmitry V. Ponomarev and
                     Kanad Ghose and
                     Oguz Ergin},
  title           = {Instruction packing: {Toward} fast and energy-efficient
                     instruction scheduling},
  journal         = j-TACO,
  year            = {2006},
  month           = jun,
  volume          = {3},
  number          = {2},
  pages           = {156--181},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Ceze:2006:CUC,
  author          = {Luis Ceze and
                     Karin Strauss and
                     James Tuck and
                     Josep Torrellas and
                     Jose Renau},
  title           = {{CAVA}: {Using} checkpoint-assisted value
                     prediction to hide {L2} misses},
  journal         = j-TACO,
  year            = {2006},
  month           = jun,
  volume          = {3},
  number          = {2},
  pages           = {182--208},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhang:2006:EAR,
  author          = {Lixin Zhang and
                     Mike Parker and
                     John Carter},
  title           = {Efficient address remapping in distributed
                     shared-memory systems},
  journal         = j-TACO,
  year            = {2006},
  month           = jun,
  volume          = {3},
  number          = {2},
  pages           = {209--229},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhao:2006:ATP,
  author          = {Min Zhao and
                     Bruce R. Childers and
                     Mary Lou Soffa},
  title           = {An approach toward profit-driven optimization},
  journal         = j-TACO,
  year            = {2006},
  month           = sep,
  volume          = {3},
  number          = {3},
  pages           = {231--262},
  doi             = {https://doi.org/10.1145/1162690.1162691},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Although optimizations have been applied for a number
                     of years to improve the performance of software,
                     problems with respect to the application of
                     optimizations have not been adequately addressed. For
                     example, in certain circumstances, optimizations may
                     degrade performance. However, there is no efficient way
                     to know when a degradation will occur. In this
                     research, we investigate the profitability of
                     optimizations, which is useful for determining the
                     benefit of applying optimizations. We develop a
                     framework that enables us to predict profitability
                     using analytic models. The profitability of an
                     optimization depends on code context, the particular
                     optimization, and machine resources. Thus, our
                     framework has analytic models for each of these
                     components. As part of the framework, there is also a
                     profitability engine that uses models to predict the
                     profit. In this paper, we target scalar optimizations
                     and, in particular, describe the models for partial
                     redundancy elimination (PRE), loop invariant code
                     motion (LICM), and value numbering (VN). We implemented
                     the framework for predicting the profitability of these
                     optimizations. Based on the predictions, we can
                     selectively apply profitable optimizations. We compared
                     the profit-driven approach with an approach that uses a
                     heuristic in deciding when optimizations should be
                     applied. Our experiments demonstrate that the
                     profitability of scalar optimizations can be accurately
                     predicted by using models. That is, without actually
                     applying a scalar optimization, we can determine if an
                     optimization is beneficial and should be applied.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Hazelwood:2006:MBC,
  author          = {Kim Hazelwood and
                     Michael D. Smith},
  title           = {Managing bounded code caches in dynamic
                     binary optimization systems},
  journal         = j-TACO,
  year            = {2006},
  month           = sep,
  volume          = {3},
  number          = {3},
  pages           = {263--294},
  doi             = {https://doi.org/10.1145/1162690.1162692},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Dynamic binary optimizers store altered copies of
                     original program instructions in software-managed code
                     caches in order to maximize reuse of transformed code.
                     Code caches store code blocks that may vary in size,
                     reference other code blocks, and carry a high
                     replacement overhead. These unique constraints reduce
                     the effectiveness of conventional cache management
                     policies. Our work directly addresses these unique
                     constraints and presents several contributions to the
                     code-cache management problem. First, we show that
                     evicting more than the minimum number of code blocks
                     from the code cache results in less run-time overhead
                     than the existing alternatives. Such granular evictions
                     reduce overall execution time, as the fixed costs of
                     invoking the eviction mechanism are amortized across
                     multiple cache insertions. Second, a study of the ideal
                     lifetimes of dynamically generated code blocks
                     illustrates the benefit of a replacement algorithm
                     based on a generational heuristic. We describe and
                     evaluate a generational approach to code cache
                     management that makes it easy to identify long-lived
                     code blocks and simultaneously avoid any fragmentation
                     because of the eviction of short-lived blocks. Finally,
                     we present results from an implementation of our
                     generational approach in the DynamoRIO framework and
                     illustrate that, as dynamic optimization systems become
                     more prevalent, effective code cache-management
                     policies will be essential for reliable, scalable
                     performance of modern applications.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Rochecouste:2006:CCE,
  author          = {Olivier Rochecouste and
                     Gilles Pokam and
                     Andr{\'e} Seznec},
  title           = {A case for a complexity-effective,
                     width-partitioned microarchitecture},
  journal         = j-TACO,
  year            = {2006},
  month           = sep,
  volume          = {3},
  number          = {3},
  pages           = {295--326},
  doi             = {https://doi.org/10.1145/1162690.1162693},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {The analysis of program executions reveals that most
                     integer and multimedia applications make heavy use of
                     narrow-width operations, i.e., instructions exclusively
                     using narrow-width operands and producing a
                     narrow-width result. Moreover, this usage is relatively
                     well distributed over the application. We observed this
                     program property on the MediaBench and SPEC2000
                     benchmarks with about 40\% of the instructions being
                     narrow-width operations. Current superscalar processors
                     use 64-bit datapaths to execute all the instructions of
                     the applications. In this paper, we suggest the use of
                     a width-partitioned microarchitecture (WPM) to master
                     the hardware complexity of a superscalar processor. For
                     a four-way issue machine, we split the processor in two
                     two-way clusters: the main cluster executing 64-bit
                     operations, load/store, and complex operations and a
                     narrow cluster executing the 16-bit operations. We
                     resort to partitioning to decouple the treatment of the
                     narrow-width operations from that of the other program
                     instructions. This provides the benefit of greatly
                     simplifying the design of the critical processor
                     components in each cluster (e.g., the register file and
                     the bypass network). The dynamic interleaving of the
                     two instruction types allows maintaining the workload
                     balanced among clusters. WPM also helps to reduce the
                     complexity of the interconnection fabric and of the
                     issue logic. In fact, since the 16-bit cluster can only
                     communicate narrow-width data, the datapath-width of
                     the interconnect fabric can be significantly reduced,
                     yielding a corresponding saving of the interconnect
                     power and area. We explore different possible
                     configurations of WPM, discussing the various
                     implementation tradeoffs. We also examine a speculative
                     steering heuristic to distribute the narrow-width
                     operations among clusters. A detailed analysis of the
                     complexity factors shows using WPM instead of a
                     classical 64-bit two-cluster microarchitecture can save
                     power and silicon area with a minimal impact on the
                     overall performance.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Zmily:2006:BAI,
  author          = {Ahmad Zmily and
                     Christos Kozyrakis},
  title           = {Block-aware instruction set architecture},
  journal         = j-TACO,
  year            = {2006},
  month           = sep,
  volume          = {3},
  number          = {3},
  pages           = {327--357},
  doi             = {https://doi.org/10.1145/1162690.1162694},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Instruction delivery is a critical component for
                     wide-issue, high-frequency processors since its
                     bandwidth and accuracy place an upper limit on
                     performance. The processor front-end accuracy and
                     bandwidth are limited by instruction-cache misses,
                     multicycle instruction-cache accesses, and target or
                     direction mispredictions for control-flow operations.
                     This paper presents a block-aware instruction set
                     (BLISS) that allows software to assist with front-end
                     challenges. BLISS defines basic block descriptors that
                     are stored separately from the actual instructions in a
                     program. We show that BLISS allows for a decoupled
                     front-end that tolerates instruction-cache latency,
                     facilitates instruction prefetching, and leads to
                     higher prediction accuracy.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Crandall:2006:MAS,
  author          = {Jedidiah R. Crandall and
                     S. Felix Wu and
                     Frederic T. Chong},
  title           = {{Minos}: {Architectural} support for
                     protecting control data},
  journal         = j-TACO,
  year            = {2006},
  month           = dec,
  volume          = {3},
  number          = {4},
  pages           = {359--389},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Marathe:2006:ACC,
  author          = {Jaydeep Marathe and
                     Frank Mueller and
                     Bronis R. de Supinski},
  title           = {Analysis of cache-coherence bottlenecks with
                     hybrid hardware\slash software techniques},
  journal         = j-TACO,
  year            = {2006},
  month           = dec,
  volume          = {3},
  number          = {4},
  pages           = {390--423},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ganusov:2006:FEP,
  author =       "Ilya Ganusov and Martin Burtscher",
  title =        "Future execution: {A} prefetching mechanism that uses
                 multiple cores to speed up single threads",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "424--449",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Co:2006:ETC,
  author          = {Michele Co and
                     Dee A. B. Weikle and
                     Kevin Skadron},
  title           = {Evaluating trace cache energy efficiency},
  journal         = j-TACO,
  year            = {2006},
  month           = dec,
  volume          = {3},
  number          = {4},
  pages           = {450--476},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Hu:2006:EMM,
  author          = {Shiwen Hu and
                     Madhavi Valluri and
                     Lizy Kurian John},
  title           = {Effective management of multiple configurable
                     units using dynamic optimization},
  journal         = j-TACO,
  year            = {2006},
  month           = dec,
  volume          = {3},
  number          = {4},
  pages           = {477--501},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Bentley:2006:IAB,
  author          = {Chris Bentley and
                     Scott A. Watterson and
                     David K. Lowenthal and
                     Barry Rountree},
  title           = {Implicit array bounds checking on
                     64-bit architectures},
  journal         = j-TACO,
  year            = {2006},
  month           = dec,
  volume          = {3},
  number          = {4},
  pages           = {502--527},
  doi             = {https://doi.org/10.1145/1187976.1187982},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Several programming languages guarantee that array
                     subscripts are checked to ensure they are within the
                     bounds of the array. While this guarantee improves the
                     correctness and security of array-based code, it adds
                     overhead to array references. This has been an obstacle
                     to using higher-level languages, such as Java, for
                     high-performance parallel computing, where the language
                     specification requires that all array accesses must be
                     checked to ensure they are within bounds. This is
                     because, in practice, array-bounds checking in
                     scientific applications may increase execution time by
                     more than a factor of 2. Previous research has explored
                     optimizations to statically eliminate bounds checks,
                     but the dynamic nature of many scientific codes makes
                     this difficult or impossible. Our approach is, instead,
                     to create a compiler and operating system
                     infrastructure that does not generate explicit bounds
                     checks. It instead places arrays inside of Index
                     Confinement Regions (ICRs), which are large, isolated,
                     mostly unmapped virtual memory regions. Any array
                     reference outside of its bounds will cause a protection
                     violation; this provides implicit bounds checking. Our
                     results show that when applying this infrastructure to
                     high-performance computing programs written in Java,
                     the overhead of bounds checking relative to a program
                     with no bounds checks is reduced from an average of
                     63\% to an average of 9\%.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Calder:2007:I,
  author          = {Brad Calder and
                     Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  year            = {2007},
  month           = mar,
  volume          = {4},
  number          = {1},
  pages           = {1:1--1:1},
  articleno       = {1},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code
                     Optimization (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Constantinides:2007:ARC,
  author =       {Kypros Constantinides and Stephen Plaza and Jason
                 Blome and Valeria Bertacco and Scott Mahlke and Todd
                 Austin and Bin Zhang and Michael Orshansky},
  title =        {Architecting a reliable {CMP} switch architecture},
  journal =      j-TACO,
  volume =       {4},
  number =       {1},
  pages =        {2:1--2:37},
  month =        mar,
  year =         {2007},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Apr 14 10:44:57 MDT 2007},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {2},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sasanka:2007:AES,
  author =       {Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
                 Yen-Kuang Chen and Eric Debes},
  title =        {{ALP}: {Efficient} support for all levels of
                 parallelism for complex media applications},
  journal =      j-TACO,
  volume =       {4},
  number =       {1},
  pages =        {3:1--3:30},
  month =        mar,
  year =         {2007},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Apr 14 10:44:57 MDT 2007},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {3},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Luo:2007:CNP,
  author =       {Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan},
  title =        {Conserving network processor power consumption by
                 exploiting traffic variability},
  journal =      j-TACO,
  volume =       {4},
  number =       {1},
  pages =        {4:1--4:26},
  month =        mar,
  year =         {2007},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Apr 14 10:44:57 MDT 2007},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {4},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Soteriou:2007:SDP,
  author =       {Vassos Soteriou and Noel Eisley and Li-Shiuan Peh},
  title =        {Software-directed power-aware interconnection
                 networks},
  journal =      j-TACO,
  volume =       {4},
  number =       {1},
  pages =        {5:1--5:40},
  month =        mar,
  year =         {2007},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Apr 14 10:44:57 MDT 2007},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {5},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Hwang:2007:SSA,
  author =       "Yuan-Shin Hwang and Jia-Jhe Li",
  title =        "Snug set-associative caches: {Reducing} leakage power
                 of instruction and data caches with no performance
                 penalties",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "6:1--6:28",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Rong:2007:SDS,
  author =       {Hongbo Rong and Zhizhong Tang and R. Govindarajan and
                 Alban Douillet and Guang R. Gao},
  title =        {Single-dimension software pipelining for
                 multidimensional loops},
  journal =      j-TACO,
  volume =       {4},
  number =       {1},
  pages =        {7:1--7:44},
  month =        mar,
  year =         {2007},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Apr 14 10:44:57 MDT 2007},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {7},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Bower:2007:ODH,
  author =       {Fred A. Bower and Daniel J. Sorin and Sule Ozev},
  title =        {Online diagnosis of hard faults in microprocessors},
  journal =      j-TACO,
  volume =       {4},
  number =       {2},
  pages =        {8:1--8:??},
  month =        jun,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1250727.1250728},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:40:54 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {We develop a microprocessor design that tolerates hard
                 faults, including fabrication defects and in-field
                 faults, by leveraging existing microprocessor
                 redundancy. To do this, we must: detect and correct
                 errors, diagnose hard faults at the field
                 deconfigurable unit (FDU) granularity, and deconfigure
                 FDUs with hard faults. In our reliable microprocessor
                 design, we use DIVA dynamic verification to detect and
                 correct errors. Our new scheme for diagnosing hard
                 faults tracks instructions' core structure occupancy
                 from decode until commit. If a DIVA checker detects an
                 error in an instruction, it increments a small
                 saturating error counter for every FDU used by that
                 instruction, including that DIVA checker. A hard fault
                 in an FDU quickly leads to an above-threshold error
                 counter for that FDU and thus diagnoses the fault. For
                 deconfiguration, we use previously developed schemes
                 for functional units and buffers and present a scheme
                 for deconfiguring DIVA checkers. Experimental results
                 show that our reliable microprocessor quickly and
                 accurately diagnoses each hard fault that is injected
                 and continues to function, albeit with somewhat
                 degraded performance.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {8},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {fine-grained diagnosis; hard fault tolerance;
                 processor microarchitecture},
}

@Article{Michaud:2007:STM,
  author =       {Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
                 and Yiannakis Sazeides and Theofanis Constantinou},
  title =        {A study of thread migration in temperature-constrained
                 multicores},
  journal =      j-TACO,
  volume =       {4},
  number =       {2},
  pages =        {9:1--9:??},
  month =        jun,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1250727.1250729},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:40:54 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Temperature has become an important constraint in
                 high-performance processors, especially multicores.
                 Thread migration will be essential to exploit the full
                 potential of future thermally constrained multicores.
                 We propose and study a thread migration method that
                 maximizes performance under a temperature constraint,
                 while minimizing the number of migrations and ensuring
                 fairness between threads. We show that thread migration
                 brings important performance gains and that it is most
                 effective during the first tens of seconds following a
                 decrease of the number of running threads.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {9},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {multicore processor; power density; temperature;
                 thermal management; thread migration},
}

@Article{Chen:2007:CRL,
  author =       {Yu Chen and Fuxin Zhang},
  title =        {Code reordering on limited branch offset},
  journal =      j-TACO,
  volume =       {4},
  number =       {2},
  pages =        {10:1--10:??},
  month =        jun,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1250727.1250730},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:40:54 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Since the 1980's code reordering has gained popularity
                 as an important way to improve the spatial locality of
                 programs. While the effect of the processor's
                 microarchitecture and memory hierarchy on this
                 optimization technique has been investigated, little
                 research has focused on the impact of the instruction
                 set. In this paper, we analyze the effect of limited
                 branch offset of the MIPS-like instruction set [Hwu et
                 al. 2004, 2005] on code reordering, explore two simple
                 methods to handle the exceeded branches, and propose
                 the bidirectional code layout (BCL) algorithm to reduce
                 the number of branches exceeding the offset limit. The
                 BCL algorithm sorts the chains according to the
                 position of related chains, avoids cache conflict
                 misses deliberately and lays out the code
                 bidirectionally. It strikes a balance among the
                 distance of related blocks, the instruction cache miss
                 rate, the memory size required, and the control flow
                 transfer. Experimental results show that BCL can
                 effectively reduce exceeded branches by 50.1\%, on
                 average, with up to 100\% for some programs. Except for
                 some programs with little spatial locality, the BCL
                 algorithm can achieve the performance, as the case with
                 no branch offset limitation.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {10},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {code reordering; Godson Processor; link-time
                 optimization},
}

@Article{Terechko:2007:ICC,
  author =       {A. S. Terechko and H. Corporaal},
  title =        {Inter-cluster communication in {VLIW} architectures},
  journal =      j-TACO,
  volume =       {4},
  number =       {2},
  pages =        {11:1--11:??},
  month =        jun,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1250727.1250731},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:40:54 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {The traditional VLIW (very long instruction word)
                 architecture with a single register file does not scale
                 up well to address growing performance demands on
                 embedded media processors. However, splitting a VLIW
                 processor in smaller clusters, which are comprised of
                 function units fully connected to local register files,
                 can significantly improve VLSI implementation
                 characteristics of the processor, such as speed, energy
                 consumption, and area. In our paper we reveal that
                 achieving the best characteristics of a clustered VLIW
                 requires a thorough selection of an Inter-cluster
                 Communication (ICC) model, which is the way clustering
                 is exposed in the Instruction Set Architecture. For our
                 study we, first, define a taxonomy of ICC models
                 including copy operations, dedicated issue slots,
                 extended operands, extended results, and multicast.
                 Evaluation of the execution time of the models requires
                 both the dynamic cycle count and clock period. We
                 developed an advanced instruction scheduler for all the
                 five ICC models in order to quantify the dynamic cycle
                 counts of our multimedia C benchmarks. To assess the
                 clock period of the ICC models we designed and laid out
                 VLIW datapaths using the RTL hardware descriptions
                 derived from a deeply pipelined commercial TriMedia
                 processor. In contrast to prior art, our research shows
                 that fully distributed register file architectures
                 (with eight clusters in our study) often underperform
                 compared to moderately clustered machines with two or
                 four clusters because of explosion of the cycle count
                 overhead in the former. Among the evaluated ICC models,
                 performance of the copy operation model, popular both
                 in academia and industry, is severely limited by the
                 copy operations hampering scheduling of regular
                 operations in high ILP (instruction-level parallelism)
                 code. The dedicated issue slots model combats this
                 limitation by dedicating extra VLIW issue slots purely
                 for ICC, reaching the highest 1.74 execution time
                 speedup relative to the unicluster. Furthermore, our
                 VLSI experiments show that the lowest area and energy
                 consumption of 42 and 57\% relative to the unicluster,
                 respectively, are achieved by the extended operands
                 model, which, nevertheless, provides higher performance
                 than the copy operation model.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {11},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {clock frequency; cluster assignment; instruction
                 scheduler; instruction-level parallelism; intercluster
                 communication; optimizing compiler; pipelining;
                 register allocation; VLIW},
}

@Article{Dou:2007:CCM,
  author =       {Jialin Dou and Marcelo Cintra},
  title =        {A compiler cost model for speculative
                 parallelization},
  journal =      j-TACO,
  volume =       {4},
  number =       {2},
  pages =        {12:1--12:??},
  month =        jun,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1250727.1250732},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:40:54 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Speculative parallelization is a technique that allows
                 code sections that cannot be fully analyzed by the
                 compiler to be aggressively executed in parallel.
                 However, while speculative parallelization can
                 potentially deliver significant speedups, several
                 overheads associated with this technique can limit
                 these speedups in practice. This paper proposes a novel
                 compiler static cost model of speculative multithreaded
                 execution that can be used to predict the resulting
                 performance. This model attempts to predict the
                 expected speedups, or slowdowns, of the candidate
                 speculative sections based on the estimation of the
                 combined runtime effects of various overheads, and
                 taking into account the scheduling restrictions of most
                 speculative execution environments. The model is based
                 on estimating the likely execution duration of threads
                 and considers all the possible permutations of these
                 threads. This model also produces a quantitative
                 estimate of the speedup, which is different from prior
                 heuristics that only qualitatively estimate the
                 benefits of speculative multithreaded execution. In
                 previous work, a limited version of the framework was
                 evaluated on a number of loops from a collection of
                 SPEC benchmarks that suffer mainly from load imbalance
                 and thread dispatch and commit overheads. In this work,
                 an extended framework is also evaluated on loops that
                 may suffer from data-dependence violations.
                 Experimental results show that prediction accuracy is
                 lower when loops with violations are included.
                 Nevertheless, accuracy is still very high for a static
                 model: the framework can identify, on average, 45\% of
                 the loops that cause slowdowns and, on average, 96\% of
                 the loops that lead to speedups; it predicts the
                 speedups or slowdowns with an error of less than 20\%
                 for an average of 28\% of the loops across the
                 benchmarks and with an error of less than 50\% for an
                 average of 80\% of the loops. Overall, the framework
                 often outperforms, by as much as 25\%, a naive approach
                 that attempts to speculatively parallelize all the
                 loops considered, and is able to curb the large
                 slowdowns caused in many cases by this naive
                 approach.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {12},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {speculative multithreading; speculative
                 parallelization; thread-level speculation},
}

@Article{Amme:2007:SBM,
  author =       {Wolfram Amme and Jeffery von Ronne and Michael Franz},
  title =        {{SSA}-based mobile code: {Implementation} and
                 empirical evaluation},
  journal =      j-TACO,
  volume =       {4},
  number =       {2},
  pages =        {13:1--13:??},
  month =        jun,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1250727.1250733},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:40:54 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Although one might expect transportation formats based
                 on static single-assignment form (SSA) to yield faster
                 just-in-time compilation times than those based on
                 stack-based virtual machines, this claim has not
                 previously been validated, in practice. We attempt to
                 quantify the effect of using an SSA-based mobile code
                 representation by integrating support for a verifiable
                 SSA-based IR into Jikes RVM. Performance results,
                 measured with various optimizations and on both the
                 IA32 and PowerPC, show improvements in both compilation
                 time and code quality.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {13},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {SafeTSA; static single-assignment form; virtual
                 machines},
}

@Article{Li:2007:CCE,
  author =       {Xiaodong Li and Ritu Gupta and Sarita V. Adve and
                 Yuanyuan Zhou},
  title =        {Cross-component energy management: {Joint} adaptation
                 of processor and memory},
  journal =      j-TACO,
  volume =       {4},
  number =       {3},
  pages =        {14:1--14:??},
  month =        sep,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1275937.1275938},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:41:20 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Researchers have proposed the use of adaptation to
                 reduce the energy consumption of different hardware
                 components, such as the processor, memory, disk, and
                 display for general-purpose applications. Previous
                 algorithms to control these adaptations, however, have
                 focused on a single component. This work takes the
                 first step toward developing algorithms that can
                 jointly control adaptations in multiple interacting
                 components for general-purpose applications, with the
                 goal of minimizing the total energy consumed within a
                 specified performance loss. Specifically, we develop a
                 joint-adaptation algorithm for processor and memory
                 adaptations. We identify two properties that enable
                 per-component algorithms to be easily used in a
                 cross-component context---the algorithms' performance
                 impact must be guaranteed and composable. We then
                 modify a current processor and a memory algorithm to
                 obey these properties. This allows the cross-component
                 problem to be reduced to determine an appropriate
                 (energy-optimal) allocation of the target performance
                 loss (slack) between the two components. We develop
                 such an optimal slack allocation algorithm that
                 exploits the above properties. The result is an
                 efficient cross-component adaptation framework that
                 minimizes the total energy of the processor and memory
                 without exceeding the target performance loss, while
                 substantially leveraging current per-component
                 algorithms. Our experiments show that joint processor
                 and memory adaptation provides significantly more
                 energy savings than adapting either component alone;
                 intelligent slack distribution is specifically
                 effective for highly compute- or memory-intensive
                 applications; and the performance slowdown never
                 exceeds the specification.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {14},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {adaptive systems; control algorithms; energy
                 management; low-power design; memory; performance
                 guarantee; processor},
}

@Article{Gabor:2007:FES,
  author =       {Ron Gabor and Shlomo Weiss and Avi Mendelson},
  title =        {Fairness enforcement in switch on event
                 multithreading},
  journal =      j-TACO,
  volume =       {4},
  number =       {3},
  pages =        {15:1--15:??},
  month =        sep,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1275937.1275939},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:41:20 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {The need to reduce power and complexity will increase
                 the interest in Switch On Event multithreading
                 (coarse-grained multithreading). Switch On Event
                 multithreading is a low-power and low-complexity
                 mechanism to improve processor throughput by switching
                 threads on execution stalls. Fairness may, however,
                 become a problem in a multithreaded processor. Unless
                 fairness is properly handled, some threads may starve
                 while others consume all of the processor cycles.
                 Heuristics that were devised in order to improve
                 fairness in simultaneous multithreading are not
                 applicable to Switch On Event multithreading. This
                 paper defines the fairness metric using the ratio of
                 the individual threads' speedups and shows how it can
                 be enforced in Switch On Event multithreading. Fairness
                 is controlled by forcing additional thread switch
                 points. These switch points are determined dynamically
                 by runtime estimation of the single threaded
                 performance of each of the individual threads. We
                 analyze the impact of the fairness enforcement
                 mechanism on aggregate IPC and weighted speedup. We
                 present simulation results of the performance of Switch
                 On Event multithreading. Switch On Event multithreading
                 achieves an average aggregate IPC increase of 26\% over
                 single thread and 12\% weighted speedup when no
                 fairness is enforced. In this case, a sixth of our runs
                 resulted in poor fairness in which one thread ran
                 extremely slowly (10 to 100 times slower than its
                 single-thread performance), while the other thread's
                 performance was hardly affected. By using the proposed
                 mechanism, we can guarantee fairness at different
                 levels of strictness and, in most cases, even improve
                 the weighted speedup.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {15},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {coarse-grained multithreading; fairness;
                 multithreading; performance; SOE; Switch on Event
                 multithreading; throughput; weighted speedup},
}

@Article{Andrade:2007:PAA,
  author =       {Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
                 Doallo},
  title =        {Precise automatable analytical modeling of the cache
                 behavior of codes with indirections},
  journal =      j-TACO,
  volume =       {4},
  number =       {3},
  pages =        {16:1--16:??},
  month =        sep,
  year =         {2007},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1275937.1275940},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jun 16 11:41:20 MDT 2008},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {The performance of memory hierarchies, in which caches
                 play an essential role, is critical in nowadays
                 general-purpose and embedded computing systems because
                 of the growing memory bottleneck problem.
                 Unfortunately, cache behavior is very unstable and
                 difficult to predict. This is particularly true in the
                 presence of irregular access patterns, which exhibit
                 little locality. Such patterns are very common, for
                 example, in applications in which pointers or
                 compressed sparse matrices give place to indirections.
                 Nevertheless, cache behavior in the presence of
                 irregular access patterns has not been widely studied.
                 In this paper we present an extension of a systematic
                 analytical modeling technique based on PMEs
                 (probabilistic miss equations), previously developed by
                 the authors, that allows the automated analysis of the
                 cache behavior for codes with irregular access patterns
                 resulting from indirections. The model generates very
                 accurate predictions despite the irregularities and has
                 very low computing requirements, being the first model
                 that gathers these desirable characteristics that can
                 automatically analyze this kind of codes. These
                 properties enable this model to help drive compiler
                 optimizations, as we show with an example.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {16},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords =     {analytical modeling; irregular access patterns; memory
                 hierarchy; performance prediction},
}

@Article{Venstermans:2007:JOH,
  author =       "Kris Venstermans and Lieven Eeckhout and Koen {De
                 Bosschere}",
  title =        "{Java} object header elimination for reduced memory
                 consumption in 64-bit virtual machines",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1275937.1275941",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory performance is an important design issue for
                 contemporary computer systems given the huge
                 processor/memory speed gap. This paper proposes a
                 space-efficient Java object model for reducing the
                 memory consumption of 64-bit Java virtual machines. We
                 completely eliminate the object header through typed
                 virtual addressing (TVA) or implicit typing. TVA
                 encodes the object type in the object's virtual address
                 by allocating all objects of a given type in a
                 contiguous memory segment. This allows for removing the
                 type information as well as the status field from the
                 object header. Whenever type and status information is
                 needed, masking is applied to the object's virtual
                 address for obtaining an offset into type and status
                 information structures. Unlike previous work on
                 implicit typing, we apply TVA to a selected number of
                 frequently allocated object types, hence, the name
                 selective TVA (STVA); this limits the amount of memory
                 fragmentation. In addition to applying STVA, we also
                 compress the type information block (TIB) pointers for
                 all objects that do not fall under TVA. We implement
                 the space-efficient Java object model in the 64-bit
                 version of the Jikes RVM on an AIX IBM platform and
                 compare its performance against the traditionally used
                 Java object model using a multitude of Java benchmarks.
                 We conclude that the space-efficient Java object model
                 reduces memory consumption by on average 15\% (and up
                 to 45\% for some benchmarks). About one-half the
                 reduction comes from TIB pointer compression; the other
                 one-half comes from STVA. In terms of performance, the
                 space-efficient object model generally does not affect
                 performance; however, for some benchmarks we observe
                 statistically significant performance speedups, up to
                 20\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "64-bit implementation; implicit typing; Java object
                 model; typed virtual addressing; virtual machine",
}

@Article{Xiao:2007:VIS,
  author =       "Shu Xiao and Edmund M.-K. Lai",
  title =        "{VLIW} instruction scheduling for minimal power
                 variation",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1275937.1275942",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The focus of this paper is on the minimization of the
                 variation in power consumed by a VLIW processor during
                 the execution of a target program through instruction
                 scheduling. The problem is formulated as a
                 mixed-integer program (MIP) and a problem-specific
                 branch-and-bound algorithm has been developed to solve
                 it more efficiently than generic MIP solvers.
                 Simulation results based on the TMS320C6711 VLIW
                 digital signal processor using benchmarks from
                 Mediabench and Trimaran showed that over 40\% average
                 reduction in power variation can be achieved without
                 sacrificing execution speed of these benchmarks.
                 Computational requirements and convergence rates of our
                 algorithm are also analyzed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "instruction scheduling; power variation reduction;
                 VLIW processors",
}

@Article{Tallam:2007:UCF,
  author =       "Sriraman Tallam and Rajiv Gupta",
  title =        "Unified control flow and data dependence traces",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1275937.1275943",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We describe the design, generation, and compression of
                 the extended whole program path (eWPP) representation
                 that not only captures the control flow history of a
                 program execution but also its data dependence history.
                 This representation is motivated by the observation
                 that, typically, a significant fraction of data
                 dependence history can be recovered from the control
                 flow trace. To capture the remainder of the data
                 dependence history, we introduce disambiguation checks
                 in the program whose control flow signatures capture
                 the results of the checks. The resulting extended
                 control flow trace enables the recovery of otherwise
                 irrecoverable data dependences. The code for the checks
                 is designed to minimize the increase in program
                 execution time and the extended control flow trace size
                 when compared to directly collecting control flow and
                 address traces. Our experiments show that compressed
                 eWPPs are only one-quarter of the size of combined
                 compressed control flow and address traces. However,
                 their collection incurs a $ 5 \times $ increase in runtime
                 overhead relative to the overhead required for directly
                 collecting the control flow and address traces,
                 respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "address trace; control flow trace; dynamic data
                 dependence trace; profiling",
}

@Article{Ipek:2008:EAD,
  author =       "Engin Ipek and Sally A. McKee and Karan Singh and Rich
                 Caruana and Bronis R. de Supinski and Martin Schulz",
  title =        "Efficient architectural design space exploration via
                 predictive modeling",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1328195.1328196",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Efficiently exploring exponential-size architectural
                 design spaces with many interacting parameters remains
                 an open problem: the sheer number of experiments
                 required renders detailed simulation intractable. We
                 attack this via an automated approach that builds
                 accurate predictive models. We simulate sampled points,
                 using results to teach our models the function
                 describing relationships among design parameters. The
                 models can be queried and are very fast, enabling
                 efficient design tradeoff discovery. We validate our
                 approach via two uniprocessor sensitivity studies,
                 predicting IPC with only 1--2\% error. In an
                 experimental study using the approach, training on 1\%
                 of a 250-K-point CMP design space allows our models to
                 predict performance with only 4--5\% error. Our
                 predictive modeling combines well with techniques that
                 reduce the time taken by each simulation experiment,
                 achieving net time savings of three-four orders of
                 magnitude.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "artificial neural networks; design space exploration;
                 performance prediction; sensitivity studies",
}

@Article{Shi:2008:VMS,
  author =       "Yunhe Shi and Kevin Casey and M. Anton Ertl and David
                 Gregg",
  title =        "Virtual machine showdown: {Stack} versus registers",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1328195.1328197",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Virtual machines (VMs) enable the distribution of
                 programs in an architecture-neutral format, which can
                 easily be interpreted or compiled. A long-running
                 question in the design of VMs is whether a stack
                 architecture or register architecture can be
                 implemented more efficiently with an interpreter. We
                 extend existing work on comparing virtual stack and
                 virtual register architectures in three ways. First,
                 our translation from stack to register code and
                 optimization are much more sophisticated. The result is
                 that we eliminate an average of more than 46\% of
                 executed VM instructions, with the bytecode size of the
                 register machine being only 26\% larger than that of
                 the corresponding stack one. Second, we present a fully
                 functional virtual-register implementation of the Java
                 virtual machine (JVM), which supports Intel, AMD64,
                 PowerPC and Alpha processors. This register VM supports
                 inline-threaded, direct-threaded, token-threaded, and
                 switch dispatch. Third, we present experimental results
                 on a range of additional optimizations such as register
                 allocation and elimination of redundant heap loads. On
                 the AMD64 architecture the register machine using
                 switch dispatch achieves an average speedup of 1.48
                 over the corresponding stack machine. Even using the
                 more efficient inline-threaded dispatch, the register
                 VM achieves a speedup of 1.15 over the equivalent
                 stack-based VM.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "interpreter; register architecture; stack
                 architecture; virtual machine",
}

@Article{Yan:2008:EVR,
  author =       "Jun Yan and Wei Zhang",
  title =        "Exploiting virtual registers to reduce pressure on
                 real registers",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1328195.1328198",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "It is well known that a large fraction of variables
                 are short-lived. This paper proposes a novel approach
                 to exploiting this fact to reduce the register pressure
                 for pipelined processors with data-forwarding network.
                 The idea is that the compiler can allocate virtual
                 registers (i.e., place holders to identify dependences
                 among instructions) to short-lived variables, which do
                 not need to be stored to physical storage locations. As
                 a result, real registers (i.e., physically existed
                 registers) can be reserved for long-lived variables for
                 mitigating the register pressure and decreasing the
                 register spills, leading to performance improvement. In
                 this paper, we develop the architectural and compiler
                 support for exploiting virtual registers for statically
                 scheduled processors. Our experimental results show
                 that virtual registers are very effective at reducing
                 the register spills, which, in many cases, can achieve
                 the performance close to the processor with twice
                 number of real registers. Our results also indicate
                 that, for some applications, using 24 virtual, in
                 addition to 8 real registers, can attain even higher
                 performance than that of 16 real without any virtual
                 registers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "data forwarding; register allocation; register file;
                 short-lived variables; virtual register",
}

@Article{Yu:2008:OCL,
  author =       "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang",
  title =        "Object co-location and memory reuse for {Java}
                 programs",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1328195.1328199",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We introduce a new memory management system, STEMA,
                 which can improve the execution time of Java programs.
                 STEMA detects prolific types on-the-fly and co-locates
                 their objects in a special memory space which supports
                 reuse of memory. We argue and show that memory reuse
                 and co-location of prolific objects can result in
                 improved cache locality, reduced memory fragmentation,
                 reduced GC time, and faster object allocation. We
                 evaluate STEMA using 16 benchmarks. Experimental
                 results show that STEMA performs 2.7\%, 4.0\%, and
                 8.2\% on average better than MarkSweep, CopyMS, and
                 SemiSpace.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "garbage collector; Java; memory allocator; memory
                 reuse; mutator; object co-location",
}

@Article{Zhang:2008:RCM,
  author =       "Chuanjun Zhang",
  title =        "Reducing cache misses through programmable decoders",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1328195.1328200",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Level-one caches normally reside on a processor's
                 critical path, which determines clock frequency.
                 Therefore, fast access to level-one cache is important.
                 Direct-mapped caches exhibit faster access time, but
                 poor hit rates, compared with same sized
                 set-associative caches because of nonuniform accesses
                 to the cache sets. The nonuniform accesses generate
                 more cache misses in some sets, while other sets are
                 underutilized. We propose to increase the decoder
                 length and, hence, reduce the accesses to heavily used
                 sets without dynamically detecting the cache set usage
                 information. We increase the access to the
                 underutilized cache sets by incorporating a replacement
                 policy into the cache design using programmable
                 decoders. On average, the proposed techniques achieve
                 as low a miss rate as a traditional 4-way cache on all
                 26 SPEC2K benchmarks for the instruction and data
                 caches, respectively. This translates into an average
                 IPC improvement of 21.5 and 42.4\% for SPEC2K integer
                 and floating-point benchmarks, respectively. The
                 B-Cache consumes 10.5\% more power per access, but
                 exhibits a 12\% total memory access-related energy
                 savings as a result of the miss rate reductions, and,
                 hence, the reduction to applications' execution time.
                 Compared with previous techniques that aim at reducing
                 the miss rate of direct-mapped caches, our technique
                 requires only one cycle to access all cache hits and
                 has the same access time of a direct-mapped cache.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache; dynamic optimization; low power",
}

@Article{Golander:2008:HMP,
  author =       "Amit Golander and Shlomo Weiss",
  title =        "Hiding the misprediction penalty of a
                 resource-efficient high-performance processor",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1328195.1328201",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Misprediction is a major obstacle for increasing
                 speculative out-of-order processors performance.
                 Performance degradation depends on both the number of
                 misprediction events and the recovery time associated
                 with each one of them. In recent years a few checkpoint
                 based microarchitectures have been proposed. In
                 comparison with ROB-based processors, checkpoint
                 processors are scalable and highly resource efficient.
                 Unfortunately, in these proposals the misprediction
                 recovery time is proportional to the instruction queue
                 size.\par

                 In this paper we analyze methods to reduce the
                 misprediction recovery time. We propose a new register
                 file management scheme and techniques to selectively
                 flush the instruction queue and the load store queue,
                 and to isolate deeply pipelined execution units. The
                 result is a novel checkpoint processor with Constant
                 misprediction RollBack time (CRB). We further present a
                 streamlined, cost-efficient solution, which saves
                 complexity at the price of slightly lower
                 performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "checkpoints; misprediction; out-of-order execution;
                 rollback; scalable architecture",
}

@Article{Calder:2008:E,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Editorial",
  journal =      j-TACO,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1369396.1369397",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:51 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mysore:2008:FIP,
  author =       "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
                 and Timothy Sherwood and Nisheeth Shrivastava and
                 Subhash Suri",
  title =        "Formulating and implementing profiling over adaptive
                 ranges",
  journal =      j-TACO,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1369396.1369398",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:51 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern computer systems are called on to deal with
                 billions of events every second, whether they are
                 executed instructions, accessed memory locations, or
                 forwarded packets. This presents a serious challenge to
                 those who seek to quantify, analyze, or optimize such
                 systems, because important trends and behaviors may
                 easily be lost in a sea of data. We present
                 range-adaptive profiling (RAP) as a new and
                 general-purpose profiling method capable of
                 hierarchically efficiently classifying streams of data
                 in hardware. Through the use of RAP, events in an input
                 stream are dynamically classified into increasingly
                 precise categories, based on the frequency with which
                 they occur. The more important a class, or range of
                 events, the more precisely it is quantified. Despite
                 the dynamic nature of our technique, we build upon
                 tight theoretic bounds covering both worst-case error,
                 as well as the required memory. In the limit, it is
                 known that error and the memory bounds can be
                 independent of the stream size and grow only linearly
                 with the level of precision desired. Significantly, we
                 expose the critical constants in these algorithms and
                 through careful engineering, algorithm redesign, and
                 use of heuristics, we show how a high-performance
                 profile system can be implemented for range-adaptive
                 profiling. RAP can be used on various profiles, such as
                 PCs, load values, and memory addresses, and has a broad
                 range of uses, from hot-region profiling to quantifying
                 cache miss value locality. We propose two methods of
                 implementation of RAP, one in software and the other
                 with specialized hardware, for which we also describe
                 our prototype FPGA implementation. We show that with
                 just 8KB of memory, range profiles can be gathered with
                 an average accuracy of 98\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "profiling hardware; range adaptive; value locality",
}

@Article{Zhai:2008:CHS,
  author =       "Antonia Zhai and J. Gregory Steffan and Christopher B.
                 Colohan and Todd C. Mowry",
  title =        "Compiler and hardware support for reducing the
                 synchronization of speculative threads",
  journal =      j-TACO,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1369396.1369399",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:51 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Thread-level speculation (TLS) allows us to
                 automatically parallelize general-purpose programs by
                 supporting parallel execution of threads that might not
                 actually be independent. In this article, we focus on
                 one important limitation of program performance under
                 TLS, which stalls as a result of synchronizing and
                 forwarding scalar values between speculative threads
                 that would otherwise cause frequent data dependences
                 and, hence, failed speculation. Using SPECint
                 benchmarks that have been automatically transformed by
                 our compiler to exploit TLS, we present, evaluate in
                 detail, and compare both compiler and hardware
                 techniques for improving the communication of scalar
                 values. We find that through our dataflow algorithms
                 for three increasingly aggressive instruction
                 scheduling techniques, the compiler can drastically
                 reduce the critical forwarding path introduced by the
                 synchronization and forwarding of scalar values. We
                 also show that hardware techniques for reducing
                 synchronization can be complementary to compiler
                 scheduling, but that the additional performance
                 benefits are minimal and are generally not worth the
                 cost.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "automatic parallelization; chip-multiprocessing;
                 instruction scheduling; thread-level speculation",
}

@Article{Winter:2008:ATN,
  author =       "Jonathan A. Winter and David H. Albonesi",
  title =        "Addressing thermal nonuniformity in {SMT} workloads",
  journal =      j-TACO,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1369396.1369400",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:51 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We explore DTM techniques within the context of
                 uniform and nonuniform SMT workloads. While DVS is
                 suitable for addressing workloads with uniformly high
                 temperatures, for nonuniform workloads, performance
                 loss occurs because of the slowdown of the cooler
                 thread. To address this, we propose and evaluate DTM
                 mechanisms that exploit the steering-based thread
                 management mechanisms inherent in a clustered SMT
                 architecture. We show that in contrast to DVS, which
                 operates globally, our techniques are more effective at
                 controlling temperature for nonuniform workloads.
                 Furthermore, we devise a DTM technique that combines
                 steering and DVS to achieve consistently good
                 performance across all workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "adaptive microarchitectures; clustered
                 microarchitectures; dynamic thermal management; dynamic
                 voltage scaling; simultaneous multithreading",
}

@article{Shahbahrami:2008:VES,
  author = {Asadollah Shahbahrami and Ben Juurlink and Stamatis
    Vassiliadis},
  title = {Versatility of extended subwords and the matrix
    register file},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {5:1--5:??},
  month = may,
  year = {2008},
  coden = {????},
  doi = {https://doi.org/10.1145/1369396.1369401},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Extended subwords and the matrix register file (MRF)
    are two micro architectural techniques that address
    some of the limitations of existing SIMD architectures.
    Extended subwords are wider than the data stored in
    memory. Specifically, for every byte of data stored in
    memory, there are four extra bits in the media register
    file. This avoids the need for data-type conversion
    instructions. The MRF is a register file organization
    that provides both conventional row-wise, as well as
    column-wise, access to the register file. In other
    words, it allows to view the register file as a matrix
    in which corresponding subwords in different registers
    corresponds to a column of the matrix. It was
    introduced to accelerate matrix transposition which is
    a very common operation in multimedia applications. In
    this paper, we show that the MRF is very versatile,
    since it can also be used for other permutations than
    matrix transposition. Specifically, it is shown how it
    can be used to provide efficient access to strided
    data, as is needed in, e.g., color space conversion.
    Furthermore, it is shown that special-purpose
    instructions (SPIs), such as the sum-of-absolute
    differences (SAD) instruction, have limited usefulness
    when extended subwords and a few general SIMD
    instructions that we propose are supported, for the
    following reasons. First, when extended subwords are
    supported, the SAD instruction provides only a
    relatively small performance improvement. Second, the
    SAD instruction processes 8-bit subwords only, which is
    not sufficient for quarter-pixel resolution nor for
    cost functions used in image and video retrieval.
    Results obtained by extending the SimpleScalar toolset
    show that the proposed techniques provide a speedup of
    up to 3.00 over the MMX architecture. The results also
    show that using, at most, 13 extra media registers
    yields an additional performance improvement ranging
    from 1.3 to 1.57.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {5},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {multimedia standards; SIMD architectures; SIMD
    programming},
}

@article{Guo:2008:EHC,
  author = {Zhi Guo and Walid Najjar and Betul Buyukkurt},
  title = {Efficient hardware code generation for {FPGAs}},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {6:1--6:??},
  month = may,
  year = {2008},
  coden = {????},
  doi = {https://doi.org/10.1145/1369396.1369402},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {The wider acceptance of FPGAs as a computing device
    requires a higher level of programming abstraction.
    ROCCC is an optimizing C to HDL compiler. We describe
    the code generation approach in ROCCC. The smart buffer
    is a component that reuses input data between adjacent
    iterations. It significantly improves the performance
    of the circuit and simplifies loop control. The
    ROCCC-generated datapath can execute one loop iteration
    per clock cycle when there is no loop dependency or
    there is only scalar recurrence variable dependency.
    ROCCC's approach to supporting while-loops operating on
    scalars makes the compiler able to move scalar
    iterative computation into hardware.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {6},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {data reuse; FPGA; high-level synthesis; reconfigurable
    computing; VHDL},
}

@article{Kotzmann:2008:DJH,
  author = {Thomas Kotzmann and Christian Wimmer and Hanspeter
    M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
    Russell and David Cox},
  title = {Design of the {Java HotSpot\TM} client compiler for
    {Java 6}},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {7:1--7:??},
  month = may,
  year = {2008},
  coden = {????},
  doi = {https://doi.org/10.1145/1369396.1370017},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
    ships with a redesigned version of the client
    just-in-time compiler that includes several research
    results of the last years. The client compiler is at
    the heart of the VM configuration used by default for
    interactive desktop applications. For such
    applications, low startup and pause times are more
    important than peak performance. This paper outlines
    the new architecture of the client compiler and shows
    how it interacts with the VM. It presents the
    intermediate representation that now uses static
    single-assignment (SSA) form and the linear scan
    algorithm for global register allocation. Efficient
    support for exception handling and deoptimization
    fulfills the demands that are imposed by the dynamic
    features of the Java programming language. The
    evaluation shows that the new client compiler generates
    better code in less time. The popular SPECjvm98
    benchmark suite is executed 45\% faster, while the
    compilation speed is also up to 40\% better. This
    indicates that a carefully selected set of global
    optimizations can also be integrated in just-in-time
    compilers that focus on compilation speed and not on
    peak performance. In addition, the paper presents the
    impact of several optimizations on execution and
    compilation speed. As the source code is freely
    available, the Java HotSpot{\TM} VM and the client
    compiler are the ideal basis for experiments with new
    feedback-directed optimizations in a production-level
    Java just-in-time compiler. The paper outlines research
    projects that add fast algorithms for escape analysis,
    automatic object inlining, and array bounds check
    elimination.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {7},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {compiler; deoptimization; intermediate representation;
    Java; just-in-time compilation; optimization; register
    allocation},
}

@article{Rangan:2008:PSD,
  author = {Ram Rangan and Neil Vachharajani and Guilherme Ottoni
    and David I. August},
  title = {Performance scalability of decoupled software
    pipelining},
  journal = j-TACO,
  volume = {5},
  number = {2},
  pages = {8:1--8:??},
  month = aug,
  year = {2008},
  coden = {????},
  doi = {https://doi.org/10.1145/1400112.1400113},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Thu Aug 28 13:25:00 MDT 2008},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Any successful solution to using multicore processors
    to scale general-purpose program performance will have
    to contend with rising intercore communication costs
    while exposing coarse-grained parallelism. Recently
    proposed pipelined multithreading (PMT) techniques have
    been demonstrated to have general-purpose applicability
    and are also able to effectively tolerate inter-core
    latencies through pipelined interthread communication.
    These desirable properties make PMT techniques strong
    candidates for program parallelization on current and
    future multicore processors and understanding their
    performance characteristics is critical to their
    deployment. To that end, this paper evaluates the
    performance scalability of a general-purpose PMT
    technique called decoupled software pipelining (DSWP)
    and presents a thorough analysis of the communication
    bottlenecks that must be overcome for optimal DSWP
    scalability.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {8},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {decoupled software pipelining; performance analysis},
}

@Article{Long:2008:TMM,
  author =       "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and
                 Rajarshi Mukherjee",
  title =        "Thermal monitoring mechanisms for chip
                 multiprocessors",
  journal =      j-TACO,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1400112.1400114",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 28 13:25:00 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With large-scale integration and increasing power
                 densities, thermal management has become an important
                 tool to maintain performance and reliability in modern
                 process technologies. In the core of dynamic thermal
                 management schemes lies accurate reading of on-die
                 temperatures. Therefore, careful planning and embedding
                 of thermal monitoring mechanisms into high-performance
                 systems becomes crucial. In this paper, we propose
                 three techniques to create sensor infrastructures for
                 monitoring the maximum temperature on a multicore
                 system. Initially, we extend a nonuniform sensor
                 placement methodology proposed in the literature to
                 handle chip multiprocessors (CMPs) and show its
                 limitations. We then analyze a grid-based approach
                 where the sensors are placed on a static grid covering
                 each core and show that the sensor readings can differ
                 from the actual maximum core temperature by as much as
                 12.6$^\circ$C when using 16 sensors per core. Also, as
                 large as 10.6\% of the thermal emergencies are not
                 captured using the same number of sensors. Based on
                 this observation, we first develop an interpolation
                 scheme, which estimates the maximum core temperature
                 through interpolation of the readings collected at the
                 static grid points. We show that the interpolation
                 scheme improves the measurement accuracy and emergency
                 coverage compared to grid-based placement when using
                 the same number of sensors. Second, we present a
                 dynamic scheme where only a subset of the sensor
                 readings is collected to predict the maximum
                 temperature of each core. Our results indicate that, we
                 can reduce the number of active sensors by as much as
                 50\%, while maintaining similar measurement accuracy
                 and emergency coverage compared to the case where the
                 entire sensor set on the grid is sampled at all
                 times.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "nonuniform and uniform sensor placement; thermal
                 sensor allocation",
}

@Article{Joshi:2008:DEP,
  author =       "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell,
                 Jr.} and Lizy K. John",
  title =        "Distilling the essence of proprietary workloads into
                 miniature benchmarks",
  journal =      j-TACO,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1400112.1400115",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 28 13:25:00 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Benchmarks set standards for innovation in computer
                 architecture research and industry product development.
                 Consequently, it is of paramount importance that these
                 workloads are representative of real-world
                 applications. However, composing such representative
                 workloads poses practical challenges to application
                 analysis teams and benchmark developers --- (1) real-world
                 workloads are intellectual property and vendors
                 hesitate to share these proprietary applications; and
                 (2) porting and reducing these applications to
                 benchmarks that can be simulated in a tractable amount
                 of time is a nontrivial task. In this paper, we address
                 this problem by proposing a technique that
                 automatically distills key inherent behavioral
                 attributes of a proprietary workload and captures them
                 into a miniature synthetic benchmark clone. The
                 advantage of the benchmark clone is that it hides the
                 functional meaning of the code but exhibits similar
                 performance characteristics as the target application.
                 Moreover, the dynamic instruction count of the
                 synthetic benchmark clone is substantially shorter than
                 the proprietary application, greatly reducing overall
                 simulation time --- for SPEC CPU, the simulation time
                 reduction is over five orders of magnitude compared to
                 entire benchmark execution. Using a set of benchmarks
                 representative of general-purpose, scientific, and
                 embedded applications, we demonstrate that the power
                 and performance characteristics of the synthetic
                 benchmark clone correlate well with those of the
                 original application across a wide range of
                 microarchitecture configurations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "benchmark cloning; benchmarks; workload
                 characterization",
}

@article{Catania:2008:RCM,
  author = {Vincenzo Catania and Maurizio Palesi and Davide
    Patti},
  title = {Reducing complexity of multiobjective design space
    exploration in {VLIW}-based embedded systems},
  journal = j-TACO,
  volume = {5},
  number = {2},
  pages = {11:1--11:??},
  month = aug,
  year = {2008},
  coden = {????},
  doi = {https://doi.org/10.1145/1400112.1400116},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Thu Aug 28 13:25:00 MDT 2008},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Architectures based on very-long instruction word
    (VLIW) have found fertile ground in multimedia
    electronic appliances thanks to their ability to
    exploit high degrees of instruction level parallelism
    (ILP) with a reasonable trade-off in complexity and
    silicon cost. Specialization of such architectures
    involves the configuration of both hardware-related
    aspects (e.g., register files, functional units, memory
    subsystem) and software-related issues (e.g., the
    compilation strategy). The complex interactions between
    the components of such systems will force a human
    designer to rely on judgment and experience in
    designing them, possibly eliminating interesting
    configurations, and making tuning of the system, for
    either power, energy, or performance, difficult. In
    this paper we propose tools and methodologies to
    efficiently cope with this complexity from a
    multiobjective perspective. We first analyze the impact
    of ILP-oriented code transformations using two
    alternative compilation profiles to quantitatively show
    the effect of such transformations on typical design
    objectives like performance, power dissipation, and
    energy consumption. Next, by means of statistical
    analysis, we collect useful data to predict the
    effectiveness of a given compilation profiles for a
    specific application. Information gathered from such
    analysis can be exploited to drastically reduce the
    computational effort needed to perform the design space
    exploration.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {11},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {design space exploration; energy; genetic algorithms;
    hyperblock formation; ILP; multiobjective optimization;
    performances; power; statistical analysis; VLIW
    architectures},
}

@Article{Leverich:2008:CEM,
  author =       "Jacob Leverich and Hideho Arakida and Alex
                 Solomatnikov and Amin Firoozshahian and Mark Horowitz
                 and Christos Kozyrakis",
  title =        "Comparative evaluation of memory models for chip
                 multiprocessors",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1455650.1455651",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "There are two competing models for the on-chip memory
                 in Chip Multiprocessor (CMP) systems: {\em
                 hardware-managed coherent caches\/} and {\em
                 software-managed streaming memory}. This paper performs
                 a direct comparison of the two models under the same
                 set of assumptions about technology, area, and
                 computational capabilities. The goal is to quantify how
                 and when they differ in terms of performance, energy
                 consumption, bandwidth requirements, and latency
                 tolerance for general-purpose CMPs. We demonstrate that
                 for data-parallel applications on systems with up to 16
                 cores, the cache-based and streaming models perform and
                 scale equally well. For certain applications with
                 little data reuse, streaming scales better due to
                 better bandwidth use and macroscopic software
                 prefetching. However, the introduction of techniques
                 such as hardware prefetching and nonallocating stores
                 to the cache-based model eliminates the streaming
                 advantage. Overall, our results indicate that there is
                 not sufficient advantage in building streaming memory
                 systems where all on-chip memory structures are
                 explicitly managed. On the other hand, we show that
                 streaming at the programming model level is
                 particularly beneficial, even with the cache-based
                 model, as it enhances locality and creates
                 opportunities for bandwidth optimizations. Moreover, we
                 observe that stream programming is actually easier with
                 the cache-based model because the hardware guarantees
                 correct, best-effort execution even when the programmer
                 cannot fully regularize an application's code.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache coherence; chip multiprocessors; locality
                 optimizations; parallel programming; streaming memory",
}

@Article{Sharkey:2008:RRP,
  author =       "Joseph J. Sharkey and Jason Loew and Dmitry V.
                 Ponomarev",
  title =        "Reducing register pressure in {SMT} processors through
                 {L2}-miss-driven early register release",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1455650.1455652",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The register file is one of the most critical datapath
                 components limiting the number of threads that can be
                 supported on a simultaneous multithreading (SMT)
                 processor. To allow the use of smaller register files
                 without degrading performance, techniques that maximize
                 the efficiency of using registers through aggressive
                 register allocation/deallocation can be considered. In
                 this article, we propose a novel technique to early
                 deallocate physical registers allocated to threads
                 which experience L2 cache misses. This is accomplished
                 by speculatively committing the load-independent
                 instructions and deallocating the registers
                 corresponding to the previous mappings of their
                 destinations, without waiting for the cache miss
                 request to be serviced. The early deallocated registers
                 are then made immediately available for allocation to
                 instructions within the same thread as well as within
                 other threads, thus improving the overall processor
                 throughput. On the average across the simulated mixes
                 of multiprogrammed SPEC 2000 workloads, our technique
                 results in 33\% improvement in throughput and 25\%
                 improvement in terms of harmonic mean of weighted IPCs
                 over the baseline SMT with the state-of-the-art DCRA
                 policy. This is achieved without creating checkpoints,
                 maintaining per-register counters of pending consumers,
                 performing tag rebroadcasts, register remappings,
                 and/or additional associative searches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "register file; simultaneous multithreading",
}

@Article{Mehrara:2008:ESP,
  author =       "Mojtaba Mehrara and Todd Austin",
  title =        "Exploiting selective placement for low-cost memory
                 protection",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1455650.1455653",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many embedded processing applications, such as those
                 found in the automotive or medical field, require
                 hardware designs that are at the same time low cost and
                 reliable. Traditionally, reliable memory systems have
                 been implemented using coded storage techniques, such
                 as ECC. While these designs can effectively detect and
                 correct memory faults such as transient errors and
                 single-bit defects, their use bears a significant cost
                 overhead. In this article, we propose a novel partial
                 memory protection scheme that provides high-coverage
                 fault protection for program code and data, but with
                 much lower cost than traditional approaches. Our
                 approach profiles program code and data usage to assess
                 which program elements are most critical to maintaining
                 program correctness. Critical code and variables are
                 then placed into a limited protected storage resources.
                 To ensure high coverage of program elements, our
                 placement technique considers all program components
                 simultaneously, including code, global variables, stack
                 frames, and heap variables. The fault coverage of our
                 approach is gauged using Monte Carlo fault-injection
                 experiments, which confirm that our technique provides
                 high levels of fault protection (99\% coverage) with
                 limited memory protection resources (36\% protected
                 area).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "fault-tolerant design; memory system design; partial
                 memory protection; selective placement; transient
                 faults",
}

@Article{Vandierendonck:2008:SRA,
  author =       "Hans Vandierendonck and Andr{\'e} Seznec",
  title =        "Speculative return address stack management
                 revisited",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1455650.1455654",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Branch prediction feeds a speculative execution
                 processor core with instructions. Branch mispredictions
                 are inevitable and have negative effects on performance
                 and energy consumption. With the advent of highly
                 accurate conditional branch predictors, nonconditional
                 branch instructions are gaining importance.\par

                 In this article, we address the prediction of procedure
                 returns. On modern processors, procedure returns are
                 predicted through a return address stack (RAS). The
                 overwhelming majority of the return mispredictions are
                 due to RAS overflows and/or overwriting the top entries
                 of the RAS on a mispredicted path. These sources of
                 misprediction were addressed by previously proposed
                 speculative return address stacks [Jourdan et al. 1996;
                 Skadron et al. 1998]. However, the remaining
                 misprediction rate of these RAS designs is still
                 significant when compared to state-of-the-art
                 conditional predictors.\par

                 We present two low-cost corruption detectors for RAS
                 predictors. They detect RAS overflows and wrong path
                 corruption with 100\% coverage. As a consequence, when
                 such a corruption is detected, another source can be
                 used for predicting the return. On processors featuring
                 a branch target buffer (BTB), this BTB can be used as a
                 free backup predictor for predicting returns when
                 corruption is detected.\par

                 Our experiments show that our proposal can be used to
                 improve the behavior of all previously proposed
                 speculative RASs. For instance, without any specific
                 management of the speculative states on the RAS, an
                 8-entry BTB-backed up RAS achieves the same performance
                 level as a state-of-the-art, but complex, 64-entry
                 self-checkpointing RAS [Jourdan et al. 1996].
                 Therefore, our proposal can be used either to improve
                 the performance of the processor or to reduce its
                 hardware complexity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "back-up predictor; corruption detection; return
                 address prediction",
}

@Article{Chhabra:2009:MSP,
  author =       "Siddhartha Chhabra and Brian Rogers and Yan Solihin
                 and Milos Prvulovic",
  title =        "Making secure processors {OS}- and
                 performance-friendly",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1498690.1498691",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In today's digital world, computer security issues
                 have become increasingly important. In particular,
                 researchers have proposed designs for secure processors
                 that utilize hardware-based memory encryption and
                 integrity verification to protect the privacy and
                 integrity of computation even from sophisticated
                 physical attacks. However, currently proposed schemes
                 remain hampered by problems that make them impractical
                 for use in today's computer systems: lack of virtual
                 memory and Inter-Process Communication support as well
                 as excessive storage and performance overheads. In this
                 article, we propose (1) address independent seed
                 encryption (AISE), a counter-mode-based memory
                 encryption scheme using a novel seed composition, and
                 (2) bonsai Merkle trees (BMT), a novel Merkle
                 tree-based memory integrity verification technique, to
                 eliminate these system and performance issues
                 associated with prior counter-mode memory encryption
                 and Merkle tree integrity verification schemes. We
                 present both a qualitative discussion and a
                 quantitative analysis to illustrate the advantages of
                 our techniques over previously proposed approaches in
                 terms of complexity, feasibility, performance, and
                 storage. Our results show that AISE+BMT reduces the
                 overhead of prior memory encryption and integrity
                 verification schemes from 12\% to 2\% on average for
                 single-threaded benchmarks on uniprocessor systems, and
                 from 15\% to 4\% for coscheduled benchmarks on
                 multicore systems while eliminating critical
                 system-level problems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "memory encryption; memory integrity verification;
                 Secure processor architectures; virtualization",
}

@Article{Jimenez:2009:GNB,
  author =       "Daniel A. Jim{\'e}nez",
  title =        "Generalizing neural branch prediction",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1498690.1498692",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Improved branch prediction accuracy is essential to
                 sustaining instruction throughput with today's deep
                 pipelines. Traditional branch predictors exploit
                 correlations between pattern history and branch outcome
                 to predict branches, but there is a stronger and more
                 natural correlation between path history and branch
                 outcome. We explore the potential for exploiting this
                 correlation. We introduce {\em piecewise linear branch
                 prediction}, an idealized branch predictor that
                 develops a set of linear functions, one for each
                 program path to the branch to be predicted, that
                 separate predicted taken from predicted not taken
                 branches. Taken together, all of these linear functions
                 form a piecewise linear decision surface. We present a
                 limit study of this predictor showing its potential to
                 greatly improve predictor accuracy.\par

                 We then introduce a practical implementable branch
                 predictor based on piecewise linear branch prediction.
                 In making our predictor practical, we show how a
                 parameterized version of it unifies the previously
                 distinct concepts of perceptron prediction and
                 path-based neural prediction. Our new branch predictor
                 has implementation costs comparable to current
                 prominent predictors in the literature while
                 significantly improving accuracy. For a deeply
                 pipelined simulated microarchitecture our predictor
                 with a 256-KB hardware budget improves the harmonic
                 mean normalized instructions-per-cycle rate by 8\% over
                 both the original path-based neural predictor and
                 2Bc-{\em gskew}. The average misprediction rate is
                 decreased by 16\% over the path-based neural predictor
                 and by 22\% over 2Bc-{\em gskew}.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Branch prediction; machine learning",
}

@Article{Jeon:2009:AAP,
  author =       "Jinseong Jeon and Keoncheol Shin and Hwansoo Han",
  title =        "Abstracting access patterns of dynamic memory using
                 regular expressions",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1498690.1498693",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Unless the speed gap between CPU and memory
                 disappears, efficient memory usage remains a decisive
                 factor for performance. To optimize data usage of
                 programs in the presence of the memory hierarchy, we
                 are particularly interested in two compiler techniques:
                 {\em pool allocation\/} and {\em field layout
                 restructuring}. Since foreseeing runtime behaviors of
                 programs at compile time is difficult, most of the
                 previous work relied on profiling. On the contrary, our
                 goal is to develop a fully automatic compiler that
                 statically transforms input codes to use memory
                 efficiently. Noticing that {\em regular expressions},
                 which denote repetition explicitly, are sufficient for
                 memory access patterns, we describe how to extract
                 memory access patterns as regular expressions in
                 detail. Based on static patterns presented in regular
                 expressions, we apply pool allocation to repeatedly
                 accessed structures and exploit field layout
                 restructuring according to field affinity relations of
                 chosen structures. To make a scalable framework, we
                 devise and apply new abstraction techniques, which
                 build and interpret access patterns for the whole
                 programs in a bottom-up fashion. We implement our
                 analyses and transformations with the CIL compiler. To
                 verify the effect and scalability of our scheme, we
                 examine 17 benchmarks including 2 SPECINT 2000
                 benchmarks whose source lines of code are larger than
                 10,000. Our experiments demonstrate that the static
                 layout transformations for dynamic memory can reduce
                 L1D cache misses by 16\% and execution times by 14\% on
                 average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Access patterns; field affinity; layout
                 transformation; pool allocation; regular expressions",
}

@Article{Shobaki:2009:OTS,
  author =       "Ghassan Shobaki and Kent Wilken and Mark Heffernan",
  title =        "Optimal trace scheduling using enumeration",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1498690.1498694",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents the first optimal algorithm for
                 trace scheduling. The trace is a global scheduling
                 region used by compilers to exploit instruction-level
                 parallelism across basic block boundaries. Several
                 heuristic techniques have been proposed for trace
                 scheduling, but the precision of these techniques has
                 not been studied relative to optimality. This article
                 describes a technique for finding provably optimal
                 trace schedules, where optimality is defined in terms
                 of a weighted sum of schedule lengths across all code
                 paths in a trace. The optimal algorithm uses
                 branch-and-bound enumeration to efficiently explore the
                 entire solution space. Experimental evaluation of the
                 algorithm shows that, with a time limit of 1 second per
                 problem, 91\% of the hard trace scheduling problems in
                 the SPEC CPU 2006 Integer Benchmarks are solved
                 optimally. For 58\% of these hard problems, the optimal
                 schedule is improved compared to that produced by a
                 heuristic scheduler with a geometric mean improvement
                 of 3.2\% in weighted schedule length and 18\% in
                 compensation code size.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "branch-and-bound enumeration; compiler optimizations;
                 global instruction scheduling; Instruction scheduling;
                 instruction-level parallelism; optimal instruction
                 scheduling; trace scheduling",
}

@Article{Kulkarni:2009:PEO,
  author =       "Prasad A. Kulkarni and David B. Whalley and Gary S.
                 Tyson and Jack W. Davidson",
  title =        "Practical exhaustive optimization phase order
                 exploration and evaluation",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1509864.1509865",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Choosing the most appropriate optimization phase
                 ordering has been a long-standing problem in compiler
                 optimizations. Exhaustive evaluation of all possible
                 orderings of optimization phases for each function is
                 generally dismissed as infeasible for
                 production-quality compilers targeting accepted
                 benchmarks. In this article, we show that it is
                 possible to exhaustively evaluate the optimization
                 phase order space for each function in a reasonable
                 amount of time for most of the functions in our
                 benchmark suite. To achieve this goal, we used various
                 techniques to significantly prune the optimization
                 phase order search space so that it can be
                 inexpensively enumerated in most cases and reduce the
                 number of program simulations required to evaluate
                 program performance for each distinct phase ordering.
                 The techniques described are applicable to other
                 compilers in which it is desirable to find the best
                 phase ordering for most functions in a reasonable
                 amount of time. We also describe some interesting
                 properties of the optimization phase order space, which
                 will prove useful for further studies of related
                 problems in compilers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "exhaustive search; iterative compilation; Phase
                 ordering",
}

@Article{Hohenauer:2009:SOF,
  author =       "Manuel Hohenauer and Felix Engel and Rainer Leupers
                 and Gerd Ascheid and Heinrich Meyr",
  title =        "A {SIMD} optimization framework for retargetable
                 compilers",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1509864.1509866",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Retargetable C compilers are currently widely used to
                 quickly obtain compiler support for new embedded
                 processors and to perform early processor architecture
                 exploration. A partially inherent problem of the
                 retargetable compilation approach, though, is the
                 limited code quality as compared to hand-written
                 compilers or assembly code due to the lack of dedicated
                 optimization techniques. This problem can be
                 circumvented by designing flexible, retargetable code
                 optimization techniques that apply to a certain range
                 of target architectures. This article focuses on target
                 machines with SIMD instruction support, a common
                 feature in embedded processors for multimedia
                 applications. However, SIMD optimization is known to be
                 a difficult task since SIMD architectures are largely
                 nonuniform, support only a limited set of data types
                 and impose several memory alignment constraints.
                 Additionally, such techniques require complicated loop
                 transformations, which are tailored to the SIMD
                 architecture in order to exhibit the necessary amount
                 of parallelism in the code. Thus, integrating the SIMD
                 optimization {\em and\/} the required loop
                 transformations together in a single retargeting
                 formalism is an ambitious challenge. In this article,
                 we present an efficient and quickly retargetable SIMD
                 code optimization framework that is integrated into an
                 industrial retargetable C compiler. Experimental
                 results for different processors demonstrate that the
                 proposed technique applies to real-life target machines
                 and that it produces code quality improvements close to
                 the theoretical limit.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "ASIP; retargetable compilers; SIMD; subword
                 parallelism; vectorization",
}

@Article{Eyerman:2009:MLP,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Memory-level parallelism aware fetch policies for
                 simultaneous multithreading processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1509864.1509867",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A thread executing on a simultaneous multithreading
                 (SMT) processor that experiences a long-latency load
                 will eventually stall while holding execution
                 resources. Existing long-latency load aware SMT fetch
                 policies limit the amount of resources allocated by a
                 stalled thread by identifying long-latency loads and
                 preventing the thread from fetching more instructions
                 --- and in some implementations, instructions beyond
                 the long-latency load are flushed to release allocated
                 resources.\par

                 This article proposes an SMT fetch policy that takes
                 into account the available memory-level parallelism
                 (MLP) in a thread. The key idea proposed in this
                 article is that in case of an isolated long-latency
                 load (i.e., there is no MLP), the thread should be
                 prevented from allocating additional resources.
                 However, in case multiple independent long-latency
                 loads overlap (i.e., there is MLP), the thread should
                 allocate as many resources as needed in order to fully
                 expose the available MLP. MLP-aware fetch policies
                 achieve better performance for MLP-intensive threads on
                 SMT processors, leading to higher overall system
                 throughput and shorter average turnaround time than
                 previously proposed fetch policies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Fetch Policy; Memory-Level Parallelism (MLP);
                 Simultaneous Multithreading (SMT)",
}

@Article{Strozek:2009:EAE,
  author =       "Lukasz Strozek and David Brooks",
  title =        "Energy- and area-efficient architectures through
                 application clustering and architectural
                 heterogeneity",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1509864.1509868",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Customizing architectures for particular applications
                 is a promising approach to yield highly
                 energy-efficient designs for embedded systems. This
                 work explores the benefits of architectural
                 customization for a class of embedded architectures
                 typically used in energy- and area-constrained
                 application domains, such as sensor nodes and
                 multimedia processing. We implement a process flow that
                 performs an automatic synthesis and evaluation of the
                 different architectures based on runtime profiles of
                 applications and determines an efficient architecture,
                 with consideration for both energy and area
                 constraints. An expressive architectural model, used by
                 our engine, is introduced that takes advantage of
                 efficient opcode allocation, several memory addressing
                 modes, and operand types. By profiling embedded
                 benchmarks from a variety of sensor and multimedia
                 applications, we show that the energy savings resulting
                 from various architectural optimizations relative to
                 the base architectures (e.g., MIPS and MSP430) are
                 significant and can reach 50\%, depending on the
                 application. We then identify the set of architectures
                 that achieves near-optimal savings for a group of
                 applications. Finally, we propose the use of
                 heterogeneous ISA processors implementing those
                 architectures as a solution to capitalize on energy
                 savings provided by application customization while
                 executing a range of applications efficiently.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Efficient custom architectures; heterogeneous ISA
                 processors",
}

@Article{Venkataramani:2009:MAM,
  author =       "Guru Venkataramani and Ioannis Doudalis and Yan
                 Solihin and Milos Prvulovic",
  title =        "{MemTracker}: {An} accelerator for memory debugging
                 and monitoring",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543753.1543754",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory bugs are a broad class of bugs that is becoming
                 increasingly common with increasing software
                 complexity, and many of these bugs are also security
                 vulnerabilities. Existing software and hardware
                 approaches for finding and identifying memory bugs have
                 a number of drawbacks including considerable
                 performance overheads, target only a specific type of
                 bug, implementation cost, and inefficient use of
                 computational resources.\par

                 This article describes MemTracker, a new hardware
                 support mechanism that can be configured to perform
                 different kinds of memory access monitoring tasks.
                 MemTracker associates each word of data in memory with
                 a few bits of state, and uses a programmable state
                 transition table to react to different events that can
                 affect this state. The number of state bits per word,
                 the events to which MemTracker reacts, and the
                 transition table are all fully programmable.
                 MemTracker's rich set of states, events, and
                 transitions can be used to implement different
                 monitoring and debugging checkers with minimal
                 performance overheads, even when frequent state updates
                 are needed. To evaluate MemTracker, we map three
                 different checkers onto it, as well as a checker that
                 combines all three. For the most demanding (combined)
                 checker with 8 bits state per memory word, we observe
                 performance overheads of only around 3\%, on average,
                 and 14.5\% worst-case across different benchmark
                 suites. Such low overheads allow continuous (always-on)
                 use of MemTracker-enabled checkers, even in production
                 runs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Accelerator; debugging; memory access monitoring",
}

@Article{Gabor:2009:SLA,
  author =       "Ron Gabor and Avi Mendelson and Shlomo Weiss",
  title =        "Service level agreement for multithreaded processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543753.1543755",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multithreading is widely used to increase processor
                 throughput. As the number of shared resources increases,
                 managing them while guaranteeing predicted performance
                 becomes a major problem. Attempts have been made in
                 previous work to ease this via different fairness
                 mechanisms. In this article, we present a new approach
                 to control the resource allocation and sharing via a
                 service level agreement (SLA)-based mechanism; that is,
                 via an agreement in which multithreaded processors
                 guarantee a minimal level of service to the running
                 threads. We introduce a new metric, {\em C\/}$_{SLA}$,
                 for conformance to SLA in multithreaded processors and
                 show that controlling resources using SLA allows
                 for higher gains than are achievable by previously
                 suggested fairness techniques. It also permits
                 improving one metric (e.g., power) while maintaining
                 SLA in another (e.g., performance). We compare SLA
                 enforcement to schemes based on other fairness metrics,
                 which are mostly targeted at equalizing execution
                 parameters. We show that using SLA rather than fairness
                 based algorithms provides a range of acceptable
                 execution points from which we can select the point
                 that best fits our optimization target, such as
                 maximizing the weighted speedup (sum of the speedups of
                 the individual threads) or reducing power. We
                 demonstrate the effectiveness of the new SLA approach
                 using switch-on-event (coarse-grained) multithreading.
                 Our weighted speedup improvement scheme successfully
                 enforces SLA while improving the weighted speedup by an
                 average of 10\% for unbalanced threads. This result is
                 significant when compared with performance losses that
                 may be incurred by fairness enforcement methods. When
                 optimizing for power reduction in unbalanced threads
                 SLA enforcement reduces the power by an average of
                 15\%. SLA may be complemented by other power reduction
                 methods to achieve further power savings {\em and\/}
                 maintain the same service level for the threads. We
                 also demonstrate differentiated SLA, where weighted
                 speedup is maximized while each thread may have a
                 different throughput constraint.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "fairness; performance; power; Service level agreement;
                 throughput",
}

@Article{Fung:2009:DWF,
  author =       "Wilson W. L. Fung and Ivan Sham and George Yuan and
                 Tor M. Aamodt",
  title =        "Dynamic warp formation: {Efficient MIMD} control flow
                 on {SIMD} graphics hardware",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543753.1543756",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent advances in graphics processing units (GPUs)
                 have resulted in massively parallel hardware that is
                 easily programmable and widely available in today's
                 desktop and notebook computer systems. GPUs typically
                 use single-instruction, multiple-data (SIMD) pipelines
                 to achieve high performance with minimal overhead for
                 control hardware. Scalar threads running the same
                 computing kernel are grouped together into SIMD
                 batches, sometimes referred to as warps. While SIMD is
                 ideally suited for simple programs, recent GPUs include
                 control flow instructions in the GPU instruction set
                 architecture and programs using these instructions may
                 experience reduced performance due to the way branch
                 execution is supported in hardware. One solution is to
                 add a stack to allow different SIMD processing elements
                 to execute distinct program paths after a branch
                 instruction. The occurrence of diverging branch
                 outcomes for different processing elements
                 significantly degrades performance using this approach.
                 In this article, we propose dynamic warp formation and
                 scheduling, a mechanism for more efficient SIMD branch
                 execution on GPUs. It dynamically regroups threads into
                 new warps on the fly following the occurrence of
                 diverging branch outcomes. We show that a realistic
                 hardware implementation of this mechanism improves
                 performance by 13\%, on average, with 256 threads per
                 core, 24\% with 512 threads, and 47\% with 768 threads
                 for an estimated area increase of 8\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "control flow; fine-grained multithreading; GPU; SIMD",
}

@Article{Koh:2009:TPV,
  author =       "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai
                 Li",
  title =        "Tolerating process variations in large,
                 set-associative caches: {The} buddy cache",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543753.1543757",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "One important trend in today's microprocessor
                 architectures is the increase in size of the processor
                 caches. These caches also tend to be set associative.
                 As technology scales, process variations are expected
                 to increase the fault rates of the SRAM cells that
                 compose such caches. As an important component of the
                 processor, the parametric yield of SRAM cells is
                 crucial to the overall performance and yield of the
                 microchip. In this article, we propose a
                 microarchitectural solution, called the buddy cache,
                 that permits large, set-associative caches to tolerate
                 faults in SRAM cells due to process variations. In
                 essence, instead of disabling a faulty cache block in a
                 set (as is the current practice), it is paired with
                 another faulty cache block in the same set --- the
                 buddy. Although both cache blocks are faulty, if the
                 faults of the two blocks do not overlap, then instead
                 of losing two blocks, buddying will yield a functional
                 block from the nonfaulty portions of the two blocks. We
                 found that with buddying, caches can better mitigate
                 the negative impacts of process variations on
                 performance and yield, gracefully downgrading
                 performance as opposed to catastrophic failure. We will
                 describe the details of the buddy cache and give
                 insights as to why it is both more performance and
                 yield resilient to faults.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "caches; fault recovery; memory structures; Processor
                 architectures",
}

@Article{Li:2009:CDS,
  author =       "Lian Li and Hui Feng and Jingling Xue",
  title =        "Compiler-directed scratchpad memory management via
                 graph coloring",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1582710.1582711",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Scratchpad memory (SPM), a fast on-chip SRAM managed
                 by software, is widely used in embedded systems. This
                 article introduces a general-purpose compiler approach,
                 called memory coloring, to assign static data
                 aggregates, such as arrays and structs, in a program to
                 an SPM. The novelty of this approach lies in
                 partitioning the SPM into a pseudo--register file (with
                 interchangeable and aliased registers), splitting the
                 live ranges of data aggregates to create potential data
                 transfer statements between SPM and off-chip memory,
                 and finally, adapting an existing graph coloring
                 algorithm for register allocation to assign the data
                 aggregates to the pseudo--register file. Our
                 experimental results using a set of 10 C benchmarks
                 from MediaBench and MiBench show that our methodology
                 is capable of managing SPMs efficiently and effectively
                 for large embedded applications. In addition, our SPM
                 allocator can obtain close to optimal solutions when
                 evaluated and compared against an existing
                 heuristics-based SPM allocator and an ILP-based SPM
                 allocator.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "graph coloring; live range splitting; memory
                 allocation; memory coloring; register coalescing;
                 Scratchpad memory; software-managed cache",
}

@Article{Golander:2009:CAR,
  author =       "Amit Golander and Shlomo Weiss",
  title =        "Checkpoint allocation and release",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1582710.1582712",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Out-of-order speculative processors need a bookkeeping
                 method to recover from incorrect speculation. In recent
                 years, several microarchitectures that employ
                 checkpoints have been proposed, either extending the
                 reorder buffer or entirely replacing it. This work
                 presents an in-depth study of checkpointing in
                 checkpoint-based microarchitectures, from the desired
                 content of a checkpoint, via implementation trade-offs,
                 and to checkpoint allocation and release policies. A
                 major contribution of the article is a novel adaptive
                 checkpoint allocation policy that outperforms known
                 policies. The adaptive policy controls checkpoint
                 allocation according to dynamic events, such as
                 second-level cache misses and rollback history. It
                 achieves 6.8\% and 2.2\% speedup for the integer and
                 floating point benchmarks, respectively, and does not
                 require a branch confidence estimator. The results show
                 that the proposed adaptive policy achieves most of the
                 potential of an oracle policy whose performance
                 improvement is 9.8\% and 3.9\% for the integer and
                 floating point benchmarks, respectively. We exploit
                 known techniques for saving leakage power by adapting
                 and applying them to checkpoint-based
                 microarchitectures. The proposed applications combine
                 to reduce the leakage power of the register file to
                 about one half of its original value.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Checkpoint; early register release; leakage;
                 misprediction; out-of-order execution; rollback",
}

@Article{Xu:2009:TXP,
  author =       "Weifeng Xu and Russell Tessier",
  title =        "{Tetris-XL}: a performance-driven spill reduction
                 technique for embedded {VLIW} processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1582710.1582713",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As technology has advanced, the application space of
                 Very Long Instruction Word (VLIW) processors has grown
                 to include a variety of embedded platforms. Due to cost
                 and power consumption constraints, many embedded VLIW
                 processors contain limited resources, including
                 registers. As a result, a VLIW compiler that maximizes
                 instruction level parallelism (ILP) without considering
                 register constraints may generate excessive register
                 spills, leading to reduced overall system performance.
                 To address this issue, this article presents a new
                 spill reduction technique that improves VLIW runtime
                 performance by reordering operations prior to register
                 allocation and instruction scheduling. Unlike earlier
                 algorithms, our approach explicitly considers both
                 register reduction and data dependency in performing
                 operation reordering. Data dependency control limits
                 unexpected schedule length increases during subsequent
                 instruction scheduling. Our technique has been
                 evaluated using Trimaran, an academic VLIW compiler,
                 and evaluated using a set of embedded systems
                 benchmarks. Experimental results show that, on average,
                 this technique improves VLIW performance by 10\% for
                 VLIW processors with 32 registers and 8 functional
                 units compared with previous spill reduction
                 techniques. Limited improvement is seen versus prior
                 approaches for VLIW processors with 64 registers and 8
                 functional units.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "instruction level parallelism; Register pressure; Very
                 Long Instruction Word (VLIW) processor",
}

@Article{Jones:2009:ELE,
  author =       "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
                 Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
  title =        "Exploring the limits of early register release:
                 {Exploiting} compiler analysis",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1582710.1582714",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Register pressure in modern superscalar processors can
                 be reduced by releasing registers early and by copying
                 their contents to cheap back-up storage. This article
                 quantifies the potential benefits of register occupancy
                 reduction and shows that existing hardware-based
                 schemes typically achieve only a small fraction of this
                 potential. This is because they are unable to
                 accurately determine the last use of a register and
                 must wait until the redefining instruction enters the
                 pipeline. On the other hand, compilers have a global
                 view of the program and, using simple dataflow
                 analysis, can determine the last use. This article
                 evaluates the extent to which compiler analysis can aid
                 early releasing, explores the design space, and
                 introduces commit and issue-based early releasing
                 schemes, quantifying their benefits. Using simple
                 compiler analysis and microarchitecture changes, we
                 achieve 70\% of the potential register file occupancy
                 reduction. By adding more hardware support, we can
                 increase this to 94\%. Our schemes are compared to
                 state-of-the-art approaches for varying register file
                 sizes and are shown to outperform these existing
                 techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "compiler; energy efficiency; Low-power design;
                 microarchitecture; register file",
}

@Article{Jones:2009:EER,
  author =       "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
                 Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
  title =        "Energy-efficient register caching with compiler
                 assistance",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2009:TUC,
  author =       "Weijia Li and Youtao Zhang and Jun Yang and Jiang
                 Zheng",
  title =        "Towards update-conscious compilation for
                 energy-efficient code dissemination in {WSNs}",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wegiel:2009:SRC,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The single-referent collector: {Optimizing} compaction
                 for the common case",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Subramaniam:2009:DOS,
  author =       "Samantika Subramaniam and Gabriel H. Loh",
  title =        "Design and optimization of the store vectors memory
                 dependence predictor",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2010:PAM,
  author =       "Xiaohang Wang and Mei Yang and Yingtao Jiang and Peng
                 Liu",
  title =        "A power-aware mapping approach to map {IP} cores onto
                 {NoCs} under bandwidth and latency constraints",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1736065.1736066",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article, we investigate the Intellectual
                 Property (IP) mapping problem that maps a given set of
                 IP cores onto the tiles of a mesh-based Network-on-Chip
                 (NoC) architecture such that the power consumption due
                 to intercore communications is minimized. This IP
                 mapping problem is considered under both bandwidth and
                 latency constraints as imposed by the applications and
                 the on-chip network infrastructure. By examining
                 various applications' communication characteristics
                 extracted from their respective communication trace
                 graphs, two distinguishable connectivity templates are
                 realized: the graphs with tightly coupled vertices and
                 those with distributed vertices. These two templates
                 are formally defined in this article, and different
                 mapping heuristics are subsequently developed to map
                 them. In general, tightly coupled vertices are mapped
                 onto tiles that are physically close to each other
                 while the distributed vertices are mapped following a
                 graph partition scheme. Experimental results on both
                 random and multimedia benchmarks have confirmed that
                 the proposed template-based mapping algorithm achieves
                 an average of 15\% power savings as compared with MOCA,
                 a fast greedy-based mapping algorithm. Compared with a
                 branch-and-bound--based mapping algorithm, which
                 produces near optimal results but incurs an extremely
                 high computation cost, the proposed algorithm, due to
                 its polynomial runtime complexity, can generate the
                 results of almost the same quality with much less CPU
                 time. As the on-chip network size increases, the
                 superiority of the proposed algorithm becomes more
                 evident.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "bandwidth and latency constraints; IP mapping; Low
                 power; network-on-chip (NoC)",
}

@Article{Chen:2010:HSF,
  author =       "Zhong-Ho Chen and Alvin W. Y. Su",
  title =        "A hardware\slash software framework for instruction
                 and data scratchpad memory allocation",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1736065.1736067",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Previous researches show that a scratchpad memory
                 device consumed less energy than a cache device with
                 the same capacity. In this article, we locate the
                 scratchpad memory (SPM) in the top level of the memory
                 hierarchy to reduce the energy consumption. To take the
                 advantage of a SPM, we address two issues of utilizing
                 a SPM. First, the program's locality should be
                 improved. The second issue is SPM management. To tackle
                 these two issues, we present a hardware/software
                 framework for dynamically allocating both instructions
                 and data in SPM. The software flow could be divided
                 into three phases: locality improving, locality
                 extraction, and runtime SPM management. Without
                 modifying the original compiler and the source code, we
                 improve the locality of a program. An optimization
                 algorithm is proposed to extract the SPM allocations.
                 At runtime, an SPM management program is employed. In
                 hardware, an address translation logic (ATL) is
                 proposed to reduce the overhead of SPM
                 management.\par

                 The results show that the proposed framework can reduce
                 energy delay product (EDP) by 63\%, on average, when
                 compared with the traditional cache architecture. The
                 reduction in EDP is contributed by properly allocating
                 both instructions and data in SPM. By allocating only
                 instructions in SPM, the EDPs are reduced by 45\%, on
                 average. By allocating only data in SPM, the EDPs are
                 reduced by 14\%, on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "allocation algorithm; Memory allocation; scratchpad
                 memory",
}

@Article{Woo:2010:CVI,
  author =       "Dong Hyuk Woo and Joshua B. Fryman and Allan D. Knies
                 and Hsien-Hsin S. Lee",
  title =        "{Chameleon}: {Virtualizing} idle acceleration cores of
                 a heterogeneous multicore processor for caching and
                 prefetching",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1736065.1736068",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Heterogeneous multicore processors have emerged as an
                 energy- and area-efficient architectural solution to
                 improving performance for domain-specific applications
                 such as those with a plethora of data-level
                 parallelism. These processors typically contain a large
                 number of small, compute-centric cores for acceleration
                 while keeping one or two high-performance ILP cores on
                 the die to guarantee single-thread performance.
                 Although a major portion of the transistors are
                 occupied by the acceleration cores, these resources
                 will sit idle when running unparallelized legacy codes
                 or the sequential part of an application. To address
                 this underutilization issue, in this article, we
                 introduce Chameleon, a flexible heterogeneous multicore
                 architecture to virtualize these resources for
                 enhancing memory performance when running sequential
                 programs. The Chameleon architecture can dynamically
                 virtualize the idle acceleration cores into a
                 last-level cache, a data prefetcher, or a hybrid
                 between these two techniques. In addition, Chameleon
                 can operate in an adaptive mode that dynamically
                 configures the acceleration cores between the hybrid
                 mode and the prefetch-only mode by monitoring the
                 effectiveness of the Chameleon cache mode. In our
                 evaluation with SPEC2006 benchmark suite, different
                 levels of performance improvements were achieved in
                 different modes for different applications. In the case
                 of the adaptive mode, Chameleon improves the
                 performance of SPECint06 and SPECfp06 by 31\% and 15\%,
                 on average. When considering only memory-intensive
                 applications, Chameleon improves the system performance
                 by 50\% and 26\% for SPECint06 and SPECfp06,
                 respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache; Heterogeneous multicore; idle core;
                 prefetching",
}

@Article{Sanchez:2010:ACI,
  author =       "Daniel Sanchez and George Michelogiannakis and
                 Christos Kozyrakis",
  title =        "An analysis of on-chip interconnection networks for
                 large-scale chip multiprocessors",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1736065.1736069",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the number of cores of chip multiprocessors
                 (CMPs) rapidly growing as technology scales down,
                 connecting the different components of a CMP in a
                 scalable and efficient way becomes increasingly
                 challenging. In this article, we explore the
                 architectural-level implications of interconnection
                 network design for CMPs with up to 128 fine-grain
                 multithreaded cores. We evaluate and compare different
                 network topologies using accurate simulation of the
                 full chip, including the memory hierarchy and
                 interconnect, and using a diverse set of scientific and
                 engineering workloads.\par

                 We find that the interconnect has a large impact on
                 performance, as it is responsible for 60\% to 75\% of
                 the miss latency. Latency, and not bandwidth, is the
                 primary performance constraint, since, even with many
                 threads per core and workloads with high miss rates,
                 networks with enough bandwidth can be efficiently
                 implemented for the system scales we consider. From the
                 topologies we study, the flattened butterfly
                 consistently outperforms the mesh and fat tree on all
                 workloads, leading to performance advantages of up to
                 22\%. We also show that considering interconnect and
                 memory hierarchy together when designing large-scale
                 CMPs is crucial, and neglecting either of the two can
                 lead to incorrect conclusions. Finally, the effect of
                 the interconnect on overall performance becomes more
                 important as the number of cores increases, making
                 interconnection choices especially critical when
                 scaling up.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "chip multiprocessors; hierarchical networks;
                 Networks-on-chip",
}

@Article{Zhou:2010:PAT,
  author =       "Xiuyi Zhou and Jun Yang and Marek Chrobak and Youtao
                 Zhang",
  title =        "Performance-aware thermal management via task
                 scheduling",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1736065.1736070",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High on-chip temperature impairs the processor's
                 reliability and reduces its lifetime. Hardware-level
                 dynamic thermal management (DTM) techniques can
                 effectively constrain the chip temperature, but
                 degrade the performance. We propose an OS-level
                 technique that performs thermal-aware job scheduling to
                 reduce DTMs. The algorithm is based on the observation
                 that hot and cool jobs executed in a different order
                 can make a difference in resulting temperature.
                 Real-system implementation in Linux shows that our
                 scheduler can remove 10.5\% to 73.6\% of the hardware
                 DTMs in a medium thermal environment. The CPU
                 throughput is improved by up to 7.6\% (4.1\%, on
                 average) in a severe thermal environment.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "task scheduling; Thermal management",
}

@Article{Raghavan:2010:TTP,
  author =       "Arun Raghavan and Colin Blundell and Milo M. K.
                 Martin",
  title =        "Token tenure and {PATCH}: a predictive\slash adaptive
                 token-counting hybrid",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839668",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Traditional coherence protocols present a set of
                 difficult trade-offs: the reliance of snoopy protocols
                 on broadcast and ordered interconnects limits their
                 scalability, while directory protocols incur a
                 performance penalty on sharing misses due to
                 indirection. This work introduces Patch
                 (Predictive/Adaptive Token-Counting Hybrid), a
                 coherence protocol that provides the scalability of
                 directory protocols while opportunistically sending
                 direct requests to reduce sharing latency. Patch
                 extends a standard directory protocol to track tokens
                 and use token-counting rules for enforcing coherence
                 permissions. Token counting allows Patch to support
                 direct requests on an unordered interconnect, while a
                 mechanism called {\em token tenure\/} provides
                 broadcast-free forward progress using the directory
                 protocol's per-block point of ordering at the home
                 along with either timeouts at requesters or explicit
                 race notification messages.\par

                 Patch makes three main contributions. First, Patch
                 introduces token tenure, which provides broadcast-free
                 forward progress for token-counting protocols. Second,
                 Patch deprioritizes best-effort direct requests to
                 match or exceed the performance of directory protocols
                 without restricting scalability. Finally, Patch
                 provides greater scalability than directory protocols
                 when using inexact encodings of sharers because only
                 processors holding tokens need to acknowledge requests.
                 Overall, Patch is a ``one-size-fits-all'' coherence
                 protocol that dynamically adapts to work well for small
                 systems, large systems, and anywhere in between.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "adaptive; bandwidth-efficiency; Cache coherence
                 protocol; predictive; token coherence",
}

@Article{Wimmer:2010:AFD,
  author =       "Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck",
  title =        "Automatic feedback-directed object fusing",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839669",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Object fusing is an optimization that embeds certain
                 referenced objects into their referencing object. The
                 order of objects on the heap is changed in such a way
                 that objects that are accessed together are placed next
                 to each other in memory. Their offset is then fixed,
                 that is, the objects are colocated, allowing field
                 loads to be replaced by address arithmetic. Array
                 fusing specifically optimizes arrays, which are
                 frequently used for the implementation of dynamic data
                 structures. Therefore, the length of arrays often
                 varies, and fields referencing such arrays have to be
                 changed. An efficient code pattern detects these
                 changes and allows the optimized access of such
                 fields.\par

                 We integrated these optimizations into Sun
                 Microsystems' Java HotSpot\TM{} VM. The analysis is
                 performed automatically at runtime, requires no actions
                 on the part of the programmer, and supports dynamic
                 class loading. To safely eliminate a field load, the
                 colocation of the object that holds the field and the
                 object that is referenced by the field must be
                 guaranteed. Two preconditions must be satisfied: The
                 objects must be allocated at the same time, and the
                 field must not be overwritten later. These
                 preconditions are checked by the just-in-time compiler
                 to avoid an interprocedural data flow analysis. The
                 garbage collector ensures that groups of colocated
                 objects are not split by copying groups as a whole. The
                 evaluation shows that the dynamic approach successfully
                 identifies and optimizes frequently accessed fields for
                 several benchmarks with a low compilation and analysis
                 overhead. It leads to a speedup of up to 76\% for
                 simple benchmarks and up to 6\% for complex
                 workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache performance; garbage collection; Java;
                 just-in-time compilation; object colocation; object
                 fusing; object inlining; optimization",
}

@Article{Lee:2010:AIC,
  author =       "Benjamin C. Lee and David Brooks",
  title =        "Applied inference: {Case} studies in
                 microarchitectural design",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839670",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We propose and apply a new simulation paradigm for
                 microarchitectural design evaluation and optimization.
                 This paradigm enables more comprehensive design studies
                 by combining spatial sampling and statistical
                 inference. Specifically, this paradigm (i) defines a
                 large, comprehensive design space, (ii) samples points
                 from the space for simulation, and (iii) constructs
                 regression models based on sparse simulations. This
                 approach greatly improves the computational efficiency
                 of microarchitectural simulation and enables new
                 capabilities in design space exploration.\par

                 We illustrate new capabilities in three case studies
                 for a large design space of approximately 260,000
                 points: (i) Pareto frontier, (ii) pipeline depth, and
                 (iii) multiprocessor heterogeneity analyses. In
                 particular, regression models are exhaustively
                 evaluated to identify Pareto optimal designs that
                 maximize performance for given power budgets. These
                 models enable pipeline depth studies in which all
                 parameters vary simultaneously with depth, thereby more
                 effectively revealing interactions with nondepth
                 parameters. Heterogeneity analysis combines
                 regression-based optimization with clustering
                 heuristics to identify efficient design compromises
                 between similar optimal architectures. These
                 compromises are potential core designs in a
                 heterogeneous multicore architecture. Increasing
                 heterogeneity can improve {\em bips\/}$^3$ / {\em w\/}
                 efficiency by as much as 2.4$ \times $, a theoretical
                 upper bound on heterogeneity benefits that neglects
                 contention between shared resources as well as design
                 complexity. Collectively these studies demonstrate
                 regression models' ability to expose trends and
                 identify optima in diverse design regions, motivating
                 the application of such models in statistical inference
                 for more effective use of modern simulator
                 infrastructure.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Microarchitecture; regression; simulation;
                 statistics",
}

@Article{Rakvic:2010:TMT,
  author =       "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G.
                 Magklis and P. Chaparro and A. Gonz{\'a}lez",
  title =        "Thread-management techniques to maximize efficiency in
                 multicore and simultaneous multithreaded
                 microprocessors",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839671",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We provide an analysis of thread-management techniques
                 that increase performance or reduce energy in multicore
                 and Simultaneous Multithreaded (SMT) cores. Thread
                 delaying reduces energy consumption by running the core
                 containing the critical thread at maximum frequency
                 while scaling down the frequency and voltage of the
                 cores containing noncritical threads. In this article,
                 we provide an insightful breakdown of thread delaying
                 on a simulated multi-core microprocessor. Thread
                 balancing improves overall performance by giving higher
                 priority to the critical thread in the issue queue of
                 an SMT core. We provide a detailed breakdown of
                 performance results for thread-balancing, identifying
                 performance benefits and limitations. For those
                 benchmarks where a performance benefit is not possible,
                 we introduce a novel thread-balancing mechanism on an
                 SMT core that can reduce energy consumption. We have
                 performed a detailed study on an Intel microprocessor
                 simulator running parallel applications. Thread
                 delaying can reduce energy consumption by 4\% to 44\%
                 with negligible performance loss. Thread balancing can
                 increase performance by 20\% or can reduce energy
                 consumption by 23\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "critical threads; energy-aware; low-power; Meeting
                 point thread characterization; microarchitecture;
                 multi-threaded application; thread balancing; thread
                 delaying",
}

@Article{Pao:2010:MEP,
  author =       "Derek Pao and Wei Lin and Bin Liu",
  title =        "A memory-efficient pipelined implementation of the
                 {Aho--Corasick} string-matching algorithm",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839672",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With rapid advancement in Internet technology and
                 usages, some emerging applications in data
                 communications and network security require matching of
                 huge volume of data against large signature sets with
                 thousands of strings in real time. In this article, we
                 present a memory-efficient hardware implementation of
                 the well-known Aho--Corasick (AC) string-matching
                 algorithm using a pipelining approach called P-AC. An
                 attractive feature of the AC algorithm is that it can
                 solve the string-matching problem in time linearly
                 proportional to the length of the input stream, and the
                 computation time is independent of the number of
                 strings in the signature set. A major disadvantage of
                 the AC algorithm is the high memory cost required to
                 store the transition rules of the underlying
                 deterministic finite automaton. By incorporating
                 pipelined processing, the state graph is reduced to a
                 character trie that only contains forward edges.
                 Together with an intelligent implementation of look-up
                 tables, the memory cost of P-AC is only about 18 bits
                 per character for a signature set containing 6,166
                 strings extracted from Snort. The control structure of
                 P-AC is simple and elegant. The cost of the control
                 logic is very low. With the availability of dual-port
                 memories in FPGA devices, we can double the system
                 throughput by duplicating the control logic such that
                 the system can process two data streams concurrently.
                 Since our method is memory-based, incremental changes
                 to the signature set can be accommodated by updating
                 the look-up tables without reconfiguring the FPGA
                 circuitry.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "deterministic and nondeterministic finite automaton;
                 intrusion detection system; pipelined processing;
                 String-matching",
}

@Article{Yang:2010:ERS,
  author =       "Xuejun Yang and Ying Zhang and Xicheng Lu and Jingling
                 Xue and Ian Rogers and Gen Li and Guibin Wang and
                 Xudong Fang",
  title =        "Exploiting the reuse supplied by loop-dependent stream
                 references for stream processors",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839673",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory accesses limit the performance of stream
                 processors. By exploiting the reuse of data held in the
                 Stream Register File (SRF), an on-chip, software
                 controlled storage, the number of memory accesses can
                 be reduced. In current stream compilers, reuse
                 exploitation is only attempted for simple stream
                 references, those whose start and end are known.
                 Compiler analysis, from outside of stream processors,
                 does not directly enable the consideration of other
                 more complex stream references. In this article, we
                 propose a transformation to automatically optimize
                 stream programs to exploit the reuse supplied by
                 loop-dependent stream references. The transformation is
                 based on three results: lemmas identifying the reuse
                 supplied by stream references, a new abstract
                 representation called the Stream Reuse Graph (SRG)
                 depicting the identified reuse, and the optimization of
                 the SRG for our transformation. Both the reuse between
                 the whole sequences accessed by stream references and
                 between partial sequences is exploited in the article.
                 In particular, partial reuse and its treatment are
                 quite new and have never, to the best of our knowledge,
                 appeared in scalar and vector processing. At the same
                 time, reusing streams increases the pressure on the
                 SRF, and this presents a problem of which reuse should
                 be exploited within limited SRF capacity. We extend our
                 analysis to achieve this objective. Finally, we
                 implement our techniques based on the StreamC/KernelC
                 compiler that has been optimized with the best existing
                 compilation techniques for stream processors.
                 Experimental results show a resultant speed-up of 1.14
                 to 2.54 times using a range of benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "stream processor; Stream programming model; stream
                 register file; stream reuse; streamc",
}

@Article{Reddi:2010:EVE,
  author =       "Vijay Janapa Reddi and Simone Campanoni and Meeta S.
                 Gupta and Michael D. Smith and Gu-Yeon Wei and David
                 Brooks and Kim Hazelwood",
  title =        "Eliminating voltage emergencies via software-guided
                 code transformations",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839674",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In recent years, circuit reliability in modern
                 high-performance processors has become increasingly
                 important. Shrinking feature sizes and diminishing
                 supply voltages have made circuits more sensitive to
                 microprocessor supply voltage fluctuations. These
                 fluctuations result from the natural variation of
                 processor activity as workloads execute, but when left
                 unattended, these voltage fluctuations can lead to
                 timing violations or even transistor lifetime issues.
                 In this article, we present a hardware--software
                 collaborative approach to mitigate voltage
                 fluctuations. A checkpoint-recovery mechanism rectifies
                 errors when voltage violates maximum tolerance
                 settings, while a runtime software layer reschedules
                 the program's instruction stream to prevent recurring
                 violations at the same program location. The runtime
                 layer, combined with the proposed code-rescheduling
                 algorithm, removes 60\% of all violations with minimal
                 overhead, thereby significantly improving overall
                 performance. Our solution is a radical departure from
                 the ongoing industry-standard approach to circumvent
                 the issue altogether by optimizing for the worst-case
                 voltage flux, which compromises power and performance
                 efficiency severely, especially looking ahead to future
                 technology generations. Existing conservative
                 approaches will have severe implications on the ability
                 to deliver efficient microprocessors. The proposed
                 technique reassembles a traditional reliability problem
                 as a runtime performance optimization problem, thus
                 allowing us to design processors for typical case
                 operation by building intelligent algorithms that can
                 prevent recurring violations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "dI/dt; inductive noise; voltage emergencies; Voltage
                 noise",
}

@Article{Zhao:2010:PPP,
  author =       "Qin Zhao and Ioana Cutcutache and Weng-Fai Wong",
  title =        "{PiPA}: {Pipelined} profiling and analysis on
                 multicore systems",
  journal =      j-TACO,
  volume =       "7",
  number =       "3",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1880037.1880038",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Profiling and online analysis are important tasks in
                 program understanding and feedback-directed
                 optimization. However, fine-grained profiling and
                 online analysis tend to seriously slow down the
                 application. To cope with the slowdown, one may have to
                 terminate the process early or resort to sampling. The
                 former tends to distort the result because of warm-up
                 effects. The latter runs the risk of missing important
                 effects because sampling was turned off during the time
                 that these effects appeared. A promising approach is to
                 make use of the parallel processing capabilities of the
                 now ubiquitous multicore processors to speed up the
                 profiling and analysis process.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Guo:2010:QSS,
  author =       "Fei Guo and Yan Solihin and Li Zhao and Ravishankar
                 Iyer",
  title =        "Quality of service shared cache management in chip
                 multiprocessor architecture",
  journal =      j-TACO,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1880037.1880039",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The trends in enterprise IT toward service-oriented
                 computing, server consolidation, and virtual computing
                 point to a future in which workloads are becoming
                 increasingly diverse in terms of performance,
                 reliability, and availability requirements. It can be
                 expected that more and more applications with diverse
                 requirements will run on a Chip Multi-Processor (CMP)
                 and share platform resources such as the lowest level
                 cache and off-chip bandwidth. In this environment, it
                 is desirable to have microarchitecture and software
                 support that can provide a guarantee of a certain level
                 of performance, which we refer to as performance
                 Quality of Service. In this article, we investigated a
                 framework that would be needed to manage the shared
                 cache resource for fully providing QoS in a CMP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wu:2010:DEH,
  author =       "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan
                 Speight and Ram Rajamony and Yuan Xie",
  title =        "Design exploration of hybrid caches with disparate
                 memory technologies",
  journal =      j-TACO,
  volume =       "7",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1880037.1880040",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Traditional multilevel SRAM-based cache hierarchies,
                 especially in the context of chip multiprocessors
                 (CMPs), present many challenges in area requirements,
                 core-to-cache balance, power consumption, and design
                 complexity. New advancements in technology enable
                 caches to be built from other technologies, such as
                 Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and
                 Phase-change RAM (PRAM), in both 2D chips or 3D stacked
                 chips. Caches fabricated in these technologies offer
                 dramatically different power-performance
                 characteristics when compared with SRAM-based caches,
                 particularly in the areas of access latency, cell
                 density, and overall power consumption. In this
                 article, we propose to take advantage of the best
                 characteristics that each technology has to offer
                 through the use of Hybrid Cache Architecture (HCA)
                 designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kourtis:2010:ECO,
  author =       "Kornilios Kourtis and Georgios Goumas and Nectarios
                 Koziris",
  title =        "Exploiting compression opportunities to improve
                 {SpMxV} performance on shared memory systems",
  journal =      j-TACO,
  volume =       "7",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1880037.1880041",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The Sparse Matrix-Vector Multiplication (SpMxV) kernel
                 exhibits poor scaling on shared memory systems, due to
                 the streaming nature of its data access pattern. To
                 decrease memory contention and improve kernel
                 performance we propose two compression schemes: CSR-DU,
                 that targets the reduction of the matrix structural
                 data by applying coarse-grained delta-encoding, and
                 CSR-VI, that targets the reduction of the values using
                 indirect indexing, applicable to matrices with a small
                 number of unique values.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Buyukkurt:2010:IHL,
  author =       "Betul Buyukkurt and John Cortes and Jason Villarreal
                 and Walid A. Najjar",
  title =        "Impact of high-level transformations within the
                 {ROCCC} framework",
  journal =      j-TACO,
  volume =       "7",
  number =       "4",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1880043.1880044",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hwang:2010:DCR,
  author          = {Yuan-Shin Hwang and Tzong-Yen Lin and Rong-Guey
                     Chang},
  title           = {{DisIRer}: {Converting} a retargetable compiler into a
                     multiplatform binary translator},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {18:1--18:??},
  month           = dec,
  year            = {2010},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1880043.1880045},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Boyer:2010:FBP,
  author          = {Michael Boyer and David Tarjan and Kevin Skadron},
  title           = {Federation: {Boosting} per-thread performance of
                     throughput-oriented manycore architectures},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {19:1--19:??},
  month           = dec,
  year            = {2010},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1880043.1880046},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Fursin:2010:COP,
  author          = {Grigori Fursin and Olivier Temam},
  title           = {Collective optimization: a practical collaborative
                     approach},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {20:1--20:??},
  month           = dec,
  year            = {2010},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1880043.1880047},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Liu:2010:UBI,
  author          = {Fang Liu and Yan Solihin},
  title           = {Understanding the behavior and implications of context
                     switch misses},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {21:1--21:??},
  month           = dec,
  year            = {2010},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1880043.1880048},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Eyerman:2011:FGD,
  author          = {Stijn Eyerman and Lieven Eeckhout},
  title           = {Fine-grained {DVFS} using on-chip regulators},
  journal         = j-TACO,
  volume          = {8},
  number          = {1},
  pages           = {1:1--1:??},
  month           = apr,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1952998.1952999},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Wed Apr 27 07:54:03 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Limit studies on Dynamic Voltage and Frequency Scaling
                     (DVFS) provide apparently contradictory conclusions. On
                     the one hand early limit studies report that DVFS is
                     effective at large timescales (on the order of
                     million(s) of cycles) with large scaling overheads (on
                     the order of tens of microseconds), and they conclude
                     that there is no need for small overhead DVFS at small
                     timescales. Recent work on the other hand --- motivated
                     by the surge of on-chip voltage regulator research ---
                     explores the potential of fine-grained DVFS and reports
                     substantial energy savings at timescales of hundreds of
                     cycles (while assuming no scaling overhead). This
                     article unifies these apparently contradictory
                     conclusions through a DVFS limit study that
                     simultaneously explores timescale and scaling speed.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Cher:2011:EEC,
  author          = {Chen-Yong Cher and Eren Kursun},
  title           = {Exploring the effects of on-chip thermal variation on
                     high-performance multicore architectures},
  journal         = j-TACO,
  volume          = {8},
  number          = {1},
  pages           = {2:1--2:??},
  month           = apr,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1952998.1953000},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Wed Apr 27 07:54:03 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Inherent temperature variation among cores in a
                     multicore architecture can be caused by a number of
                     factors including process variation, cooling and
                     packaging imperfections, and even placement of the chip
                     in the module. Current dynamic thermal management
                     techniques assume identical heating profiles for
                     homogeneous multicore architectures. Our experimental
                     results indicate that inherent thermal variation is
                     very common in existing multicores. While most
                     multicore chips accommodate multiple thermal sensors,
                     the dynamic power/thermal management schemes are
                     oblivious of the inherent heating tendencies. Hence, in
                     the case of variation, the chip faces repetitive
                     hotspots running on such cores. In this article, we
                     propose a technique that leverages the on-chip sensor
                     infrastructure as well as the capabilities of
                     power/thermal management to effectively reduce the
                     heating and minimize local hotspots.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wu:2011:ATR,
  author =       "Carole-Jean Wu and Margaret Martonosi",
  title =        "Adaptive timekeeping replacement: {Fine-grained}
                 capacity management for shared {CMP} caches",
  journal =      j-TACO,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1952998.1953001",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Apr 27 07:54:03 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In chip multiprocessors (CMPs), several
                 high-performance cores typically compete for capacity
                 in a shared last-level cache. This causes degraded and
                 unpredictable memory performance for multiprogrammed
                 and parallel workloads. In response, recent schemes
                 apportion cache bandwidth and capacity in ways that
                 offer better aggregate performance for the workloads.
                 These schemes, however, focus primarily on relatively
                 coarse-grained capacity management without concern for
                 operating system process priority levels. In this work,
                 we explore capacity management approaches that are both
                 temporally and spatially more fine-grained than prior
                 work. We also consider operating system priority levels
                 as part of capacity management. We propose a capacity
                 management mechanism based on timekeeping techniques
                 that track the time interval since the last access to
                 cached data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Vespa:2011:DFA,
  author          = {Lucas Vespa and Ning Weng},
  title           = {Deterministic finite automata characterization and
                     optimization for scalable pattern matching},
  journal         = j-TACO,
  volume          = {8},
  number          = {1},
  pages           = {4:1--4:??},
  month           = apr,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1952998.1953002},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Wed Apr 27 07:54:03 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Memory-based Deterministic Finite Automata (DFA) are
                     ideal for pattern matching in network intrusion
                     detection systems due to their deterministic
                     performance and ease of update of new patterns, however
                     severe DFA memory requirements make it impractical to
                     implement thousands of patterns. This article aims to
                     understand the basic relationship between DFA
                     characteristics and memory requirements, and to design
                     a practical memory-based pattern matching engine. We
                     present a methodology that consists of theoretical DFA
                     characterization, encoding optimization, and
                     implementation architecture. Results show the validity
                     of the characterization metrics, effectiveness of the
                     encoding techniques, and efficiency of the memory-based
                     pattern engines.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Bhattacharjee:2011:PLC,
  author =       "Abhishek Bhattacharjee and Gilberto Contreras and
                 Margaret Martonosi",
  title =        "Parallelization libraries: {Characterizing} and
                 reducing overheads",
  journal =      j-TACO,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1952998.1953003",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Apr 27 07:54:03 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Creating efficient, scalable dynamic parallel runtime
                 systems for chip multiprocessors (CMPs) requires
                 understanding the overheads that manifest at high core
                 counts and small task sizes. In this article, we assess
                 these overheads on Intel's Threading Building Blocks
                 (TBB) and OpenMP. First, we use real hardware and
                 simulations to detail various scheduler and
                 synchronization overheads. We find that these can
                 amount to 47\% of TBB benchmark runtime and 80\% of
                 OpenMP benchmark runtime. Second, we propose load
                 balancing techniques such as occupancy-based and
                 criticality-guided task stealing, to boost performance.
                 Overall, our study provides valuable insights for
                 creating robust, scalable runtime libraries.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dong:2011:HCU,
  author          = {Xiangyu Dong and Yuan Xie and Naveen Muralimanohar and
                     Norman P. Jouppi},
  title           = {Hybrid checkpointing using emerging nonvolatile
                     memories for future exascale systems},
  journal         = j-TACO,
  volume          = {8},
  number          = {2},
  pages           = {6:1--6:??},
  month           = jul,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1970386.1970387},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Jun 17 18:32:40 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {The scalability of future Massively Parallel
                     Processing (MPP) systems is being severely challenged
                     by high failure rates. Current centralized Hard Disk
                     Drive (HDD) checkpointing results in overhead of 25\%
                     or more at petascale. Since systems become more
                     vulnerable as the node count keeps increasing, novel
                     techniques that enable fast and frequent checkpointing
                     are critical to the future exascale system
                     implementation. In this work, we first introduce one of
                     the emerging nonvolatile memory technologies,
                     Phase-Change Random Access Memory (PCRAM), as a proper
                     candidate of the fast checkpointing device. After a
                     thorough analysis of MPP systems, failure rates and
                     failure sources, we propose a PCRAM-based hybrid
                     local/global checkpointing mechanism which not only
                     provides a faster checkpoint storage, but also boosts
                     the effectiveness of other orthogonal techniques such
                     as incremental checkpointing and background
                     checkpointing.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Li:2011:EEM,
  author          = {Jianjun Li and Chenggang Wu and Wei-Chung Hsu},
  title           = {Efficient and effective misaligned data access
                     handling in a dynamic binary translation system},
  journal         = j-TACO,
  volume          = {8},
  number          = {2},
  pages           = {7:1--7:??},
  month           = jul,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1970386.1970388},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Jun 17 18:32:40 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Binary Translation (BT) has been commonly used to
                     migrate application software across Instruction Set
                     Architectures (ISAs). Some architectures, such as X86,
                     allow Misaligned Data Accesses (MDAs), while most
                     modern architectures require natural data alignments.
                     In a binary translation system, where the source ISA
                     allows MDA and the target ISA does not, memory
                     operations must be carefully translated. Naive
                     translation may cause frequent misaligned data access
                     traps to occur at runtime on the target machine and
                     severely slow down the migrated application. This
                     article evaluates different approaches in handling MDA
                     in a binary translation system including how to
                     identify MDA candidates and how to translate such
                     memory instructions.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Venkataramani:2011:DDS,
  author =       "Guru Venkataramani and Christopher J. Hughes and
                 Sanjeev Kumar and Milos Prvulovic",
  title =        "{DeFT}: {Design} space exploration for on-the-fly
                 detection of coherence misses",
  journal =      j-TACO,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1970386.1970389",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jun 17 18:32:40 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "While multicore processors promise large performance
                 benefits for parallel applications, writing these
                 applications is notoriously difficult. Tuning a
                 parallel application to achieve good performance, also
                 known as performance debugging, is often more
                 challenging than debugging the application for
                 correctness. Parallel programs have many
                 performance-related issues that are not seen in
                 sequential programs. An increase in cache misses is one
                 of the biggest challenges that programmers face. To
                 minimize these misses, programmers must not only
                 identify the source of the extra misses, but also
                 perform the tricky task of determining if the misses
                 are caused by interthread communication (i.e.,
                 coherence misses) and if so, whether they are caused by
                 true or false sharing (since the solutions for these
                 two are quite different).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hiser:2011:EIB,
  author          = {Jason D. Hiser and Daniel W. Williams and Wei Hu and
                     Jack W. Davidson and Jason Mars and Bruce R. Childers},
  title           = {Evaluating indirect branch handling mechanisms in
                     software dynamic translation systems},
  journal         = j-TACO,
  volume          = {8},
  number          = {2},
  pages           = {9:1--9:??},
  month           = jul,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1970386.1970390},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Jun 17 18:32:40 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Software Dynamic Translation (SDT) is used for
                     instrumentation, optimization, security, and many other
                     uses. A major source of SDT overhead is the execution
                     of code to translate an indirect branch's target
                     address into the translated destination block's
                     address. This article discusses sources of Indirect
                     Branch (IB) overhead in SDT systems and evaluates
                     techniques for overhead reduction. Measurements using
                     SPEC CPU2000 show that the appropriate choice and
                     configuration of IB translation mechanisms can
                     significantly reduce the overhead. Further,
                     cross-architecture evaluation of these mechanisms
                     reveals that the most efficient implementation and
                     configuration can be highly dependent on the
                     architecture implementation.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Chen:2011:HAM,
  author =       "Xi E. Chen and Tor M. Aamodt",
  title =        "Hybrid analytical modeling of pending cache hits, data
                 prefetching, and {MSHRs}",
  journal =      j-TACO,
  volume =       "8",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2019608.2019609",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 22 09:15:12 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes techniques to predict the
                 performance impact of pending cache hits, hardware
                 prefetching, and miss status holding register resources
                 on superscalar microprocessors using hybrid analytical
                 models. The proposed models focus on timeliness of
                 pending hits and prefetches and account for a limited
                 number of MSHRs. They improve modeling accuracy of
                 pending hits by 3.9$\times$ and when modeling data
                 prefetching, a limited number of MSHRs, or both, these
                 techniques result in average errors of 9.5\% to 17.8\%.
                 The impact of non-uniform DRAM memory latency is shown
                 to be approximated well by using a moving average of
                 memory access latency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kleanthous:2011:CMD,
  author          = {Marios Kleanthous and Yiannakis Sazeides},
  title           = {{CATCH}: a mechanism for dynamically detecting
                     cache-content-duplication in instruction caches},
  journal         = j-TACO,
  volume          = {8},
  number          = {3},
  pages           = {11:1--11:??},
  month           = oct,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2019608.2019610},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Cache-content-duplication (CCD) occurs when there is a
                     miss for a block in a cache and the entire content of
                     the missed block is already in the cache in a block
                     with a different tag. Caches aware of
                     content-duplication can have lower miss penalty by
                     fetching, on a miss to a duplicate block, directly from
                     the cache instead of accessing lower in the memory
                     hierarchy, and can have lower miss rates by allowing
                     only blocks with unique content to enter a cache. This
                     work examines the potential of CCD for instruction
                     caches. We show that CCD is a frequent phenomenon and
                     that an idealized duplication-detection mechanism for
                     instruction caches has the potential to increase
                     performance of an out-of-order processor, with a 16KB,
                     8-way, 8 instructions per block instruction cache,
                     often by more than 10\% and up to 36\%.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Vandierendonck:2011:MSR,
  author          = {Hans Vandierendonck and Andr{\'e} Seznec},
  title           = {Managing {SMT} resource usage through speculative
                     instruction window weighting},
  journal         = j-TACO,
  volume          = {8},
  number          = {3},
  pages           = {12:1--12:??},
  month           = oct,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2019608.2019611},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Simultaneous multithreading processors dynamically
                     share processor resources between multiple threads. In
                     general, shared SMT resources may be managed
                     explicitly, for instance, by dynamically setting queue
                     occupation bounds for each thread as in the DCRA and
                     Hill-Climbing policies. Alternatively, resources may be
                     managed implicitly; that is, resource usage is
                     controlled by placing the desired instruction mix in
                     the resources. In this case, the main resource
                     management tool is the instruction fetch policy which
                     must predict the behavior of each thread (branch
                     mispredictions, long-latency loads, etc.) as it fetches
                     instructions. In this article, we present the use of
                     Speculative Instruction Window Weighting (SIWW) to
                     bridge the gap between implicit and explicit SMT fetch
                     policies.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wang:2011:PGS,
  author          = {Po-Han Wang and Chia-Lin Yang and Yen-Ming Chen and
                     Yu-Jung Cheng},
  title           = {Power gating strategies on {GPUs}},
  journal         = j-TACO,
  volume          = {8},
  number          = {3},
  pages           = {13:1--13:??},
  month           = oct,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2019608.2019612},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {As technology continues to shrink, reducing leakage is
                     critical to achieving energy efficiency. Previous
                     studies on low-power GPUs (Graphics Processing Units)
                     focused on techniques for dynamic power reduction, such
                     as DVFS (Dynamic Voltage and Frequency Scaling) and
                     clock gating. In this paper, we explore the potential
                     of adopting architecture-level power gating techniques
                     for leakage reduction on GPUs. We propose three
                     strategies for applying power gating on different
                     modules in GPUs. The Predictive Shader Shutdown
                     technique exploits workload variation across frames to
                     eliminate leakage in shader clusters. Deferred Geometry
                     Pipeline seeks to minimize leakage in fixed-function
                     geometry units by utilizing an imbalance between
                     geometry and fragment computation across batches.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Feng:2011:DAD,
  author          = {Min Feng and Chen Tian and Changhui Lin and Rajiv
                     Gupta},
  title           = {Dynamic access distance driven cache replacement},
  journal         = j-TACO,
  volume          = {8},
  number          = {3},
  pages           = {14:1--14:??},
  month           = oct,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2019608.2019613},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {In this article, we propose a new cache replacement
                     policy that makes the replacement decision based on the
                     reuse information of the cache lines and the requested
                     data. We present the architectural support and evaluate
                     the performance of our approach using SPEC benchmarks.
                     We also develop two reuse information predictors: a
                     profile-based static predictor and a runtime predictor.
                     The applicability of each predictor is discussed in
                     this paper. We further extend our reuse information
                     predictors so that the cache can adaptively choose
                     between the reuse information based replacement policy
                     and an approximation of LRU policy. According to the
                     experimental results, our adaptive reuse information
                     based replacement policy performs either better than or
                     close to the LRU policy.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Samih:2011:EPP,
  author =       "Ahmad Samih and Yan Solihin and Anil Krishna",
  title =        "Evaluating placement policies for managing capacity
                 sharing in {CMP} architectures with private caches",
  journal =      j-TACO,
  volume =       "8",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2019608.2019614",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 22 09:15:12 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Chip Multiprocessors (CMP) with distributed L2 caches
                 suffer from a cache fragmentation problem; some caches
                 may be overutilized while others may be underutilized.
                 To avoid such fragmentation, researchers have proposed
                 capacity sharing mechanisms where applications that
                 need additional cache space can place their victim
                 blocks in remote caches. However, we found that only
                 allowing victim blocks to be placed on remote caches
                 tends to cause a high number of remote cache hits
                 relative to local cache hits. In this article, we show
                 that many of the remote cache hits can be converted
                 into local cache hits if we allow newly fetched blocks
                 to be selectively placed directly in a remote cache,
                 rather than in the local cache.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yeh:2011:MPP,
  author =       "Chang-Ching Yeh and Kuei-Chung Chang and Tien-Fu Chen
                 and Chingwei Yeh",
  title =        "Maintaining performance on power gating of
                 microprocessor functional units by using a predictive
                 pre-wakeup strategy",
  journal =      j-TACO,
  volume =       "8",
  number =       "3",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2019608.2019615",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 22 09:15:12 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Power gating is an effective technique for reducing
                 leakage power in deep submicron CMOS technology.
                 Microarchitectural techniques for power gating of
                 functional units have been developed by detecting
                 suitable idle regions and turning them off to reduce
                 leakage energy consumption; however, wakeup of
                 functional units is needed when instructions are ready
                 for execution such that wakeup overhead is naturally
                 incurred. This study presents time-based power gating
                 with reference pre-wakeup (PGRP), a novel predictive
                 strategy that detects suitable idle periods for power
                 gating and then enables pre-wakeup of needed functional
                 units for avoiding wakeup overhead. The key insight is
                 that most wakeups are repeated due to program
                 locality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2011:DDE,
  author =       "Hyunjin Lee and Sangyeun Cho and Bruce R. Childers",
  title =        "{DEFCAM}: a design and evaluation framework for
                 defect-tolerant cache memories",
  journal =      j-TACO,
  volume =       "8",
  number =       "3",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2019608.2019616",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 22 09:15:12 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Advances in deep submicron technology call for a
                 careful review of existing cache designs and design
                 practices in terms of yield, area, and performance.
                 This article presents a Design and Evaluation Framework
                 for defect-tolerant Cache Memories (DEFCAM), which
                 enables processor architects to consider yield, area,
                 and performance together in a unified framework. Since
                 there is a complex, changing trade-off among these
                 metrics depending on the technology, the cache
                 organization, and the yield enhancement scheme
                 employed, such a design flow is invaluable to processor
                 architects when they assess a design and explore the
                 design space quickly at an early stage. We develop a
                 complete framework supporting the proposed DEFCAM
                 design flow, from injecting defects into a wafer to
                 evaluating program performance of individual processors
                 on the wafer.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Stenstrom:2012:ISI,
  author =       "Per Stenstr{\"o}m and Koen {De Bosschere}",
  title =        "Introduction to the special issue on high-performance
                 and embedded architectures and compilers",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "18:1--18:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086697",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Albericio:2012:ALC,
  author =       "Jorge Albericio and Rub{\'e}n Gran and Pablo
                 Ib{\'a}{\~n}ez and V{\'\i}ctor Vi{\~n}als and Jos{\'e}
                 Mar{\'\i}a Llaber{\'\i}a",
  title =        "{ABS}: a low-cost adaptive controller for prefetching
                 in a banked shared last-level cache",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "19:1--19:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086698",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Hardware data prefetch is a very well known technique
                 for hiding memory latencies. However, in a multicore
                 system fitted with a shared Last-Level Cache (LLC),
                 prefetch induced by a core consumes common resources
                 such as shared cache space and main memory bandwidth.
                 This may degrade the performance of other cores and
                 even the overall system performance unless the prefetch
                 aggressiveness of each core is controlled from a system
                 standpoint. On the other hand, LLCs in commercial chip
                 multiprocessors are more and more frequently organized
                 in independent banks. In this contribution, we target
                 for the first time prefetch in a banked LLC
                 organization and propose ABS, a low-cost controller
                 with a hill-climbing approach that runs stand-alone at
                 each LLC bank without requiring inter-bank
                 communication.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bayrak:2012:AII,
  author =       "Ali Galip Bayrak and Nikola Velickovic and Paolo Ienne
                 and Wayne Burleson",
  title =        "An architecture-independent instruction shuffler to
                 protect against side-channel attacks",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "20:1--20:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086699",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Embedded cryptographic systems, such as smart cards,
                 require secure implementations that are robust to a
                 variety of low-level attacks. Side-Channel Attacks
                 (SCA) exploit the information such as power
                 consumption, electromagnetic radiation and acoustic
                 leaking through the device to uncover the secret
                 information. Attackers can mount successful attacks
                 with very modest resources in a short time period.
                 Therefore, many methods have been proposed to increase
                 the security against SCA. Randomizing the execution
                 order of the instructions that are independent, i.e.,
                 random shuffling, is one of the most popular among
                 them. Implementing instruction shuffling in software is
                 either implementation specific or has a significant
                 performance or code size overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Demme:2012:AGC,
  author =       "John Demme and Simha Sethumadhavan",
  title =        "Approximate graph clustering for program
                 characterization",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "21:1--21:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086700",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "An important aspect of system optimization research is
                 the discovery of program traits or behaviors. In this
                 paper, we present an automated method of program
                 characterization which is able to examine and cluster
                 program graphs, i.e., dynamic data graphs or control
                 flow graphs. Our novel approximate graph clustering
                 technology allows users to find groups of program
                 fragments which contain similar code idioms or patterns
                 in data reuse, control flow, and context. Patterns of
                 this nature have several potential applications
                 including development of new static or dynamic
                 optimizations to be implemented in software or in
                 hardware. For the SPEC CPU 2006 suite of benchmarks,
                 our results show that approximate graph clustering is
                 effective at grouping behaviorally similar functions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Pricopi:2012:BPH,
  author =       "Mihai Pricopi and Tulika Mitra",
  title =        "{Bahurupi}: a polymorphic heterogeneous multi-core
                 architecture",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "22:1--22:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086701",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Computing systems have made an irreversible transition
                 towards parallel architectures with the emergence of
                 multi-cores. Moreover, power and thermal limits in
                 embedded systems mandate the deployment of many simpler
                 cores rather than a few complex cores on chip. Consumer
                 electronic devices, on the other hand, need to support
                 an ever-changing set of diverse applications with
                 varying performance demands. While some applications
                 can benefit from thread-level parallelism offered by
                 multi-core solutions, there still exist a large number
                 of applications with substantial amount of sequential
                 code. The sequential programs suffer from limited
                 exploitation of instruction-level parallelism in simple
                 cores. We propose a reconfigurable multi-core
                 architecture, called Bahurupi, that can successfully
                 reconcile the conflicting demands of instruction-level
                 and thread-level parallelism.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cleemput:2012:CMT,
  author =       "Jeroen V. Cleemput and Bart Coppens and Bjorn {De
                 Sutter}",
  title =        "Compiler mitigations for time attacks on modern x86
                 processors",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "23:1--23:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086702",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This paper studies and evaluates the extent to which
                 automated compiler techniques can defend against
                 timing-based side channel attacks on modern x86
                 processors. We study how modern x86 processors can leak
                 timing information through side channels that relate to
                 data flow. We study the efficiency, effectiveness,
                 portability, predictability and sensitivity of several
                 mitigating code transformations that eliminate or
                 minimize key-dependent execution time variations.
                 Furthermore, we discuss the extent to which compiler
                 backends are a suitable tool to provide automated
                 support for the proposed mitigations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mccandless:2012:CTI,
  author =       "Jason McCandless and David Gregg",
  title =        "Compiler techniques to improve dynamic branch
                 prediction for indirect jump and call instructions",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086703",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Indirect jump instructions are used to implement
                 multiway branch statements and virtual function calls
                 in object-oriented languages. Branch behavior can have
                 significant impact on program performance, but
                 fortunately hardware predictors can alleviate much of
                 the risk. Modern processors include indirect branch
                 predictors which use part of the target address to
                 update a global history. We present a code generation
                 technique to maximize the branch history information
                 available to the predictor. We implement our
                 optimization as an assembly language transformation,
                 and evaluate it for SPEC benchmarks and interpreters
                 using simulated and real hardware, showing indirect
                 branch misprediction decreases.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Garcia-Guirado:2012:DDA,
  author =       "Antonio Garc{\'\i}a-Guirado and Ricardo
                 Fern{\'a}ndez-Pascual and Alberto Ros and Jos{\'e} M.
                 Garc{\'\i}a",
  title =        "{DAPSCO}: Distance-aware partially shared cache
                 organization",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "25:1--25:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086704",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many-core tiled CMP proposals often assume a partially
                 shared last level cache (LLC) since this provides a
                 good compromise between access latency and cache
                 utilization. In this paper, we propose a novel way to
                 map memory addresses to LLC banks that takes into
                 account the average distance between the banks and the
                 tiles that access them. Contrary to traditional
                 approaches, our mapping does not group the tiles in
                 clusters within which all the cores access the same
                 bank for the same addresses. Instead, two neighboring
                 cores access different sets of banks minimizing the
                 average distance travelled by the cache requests.
                 Results for a 64-core CMP show that our proposal
                 improves both execution time and the energy consumed by
                 the network by 13\% when compared to a traditional
                 mapping.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2012:FSS,
  author =       "Zhenjiang Wang and Chenggang Wu and Pen-Chung Yew and
                 Jianjun Li and Di Xu",
  title =        "On-the-fly structure splitting for heap objects",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "26:1--26:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086705",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the advent of multicore systems, the gap between
                 processor speed and memory latency has grown worse
                 because of their complex interconnect. Sophisticated
                 techniques are needed more than ever to improve an
                 application's spatial and temporal locality. This paper
                 describes an optimization that aims to improve heap
                 data layout by structure-splitting. It also provides
                 runtime address checking by piggybacking on the
                 existing page protection mechanism to guarantee the
                 correctness of such optimization that has eluded many
                 previous attempts due to safety concerns. The technique
                 can be applied to both sequential and parallel programs
                 at either compile time or runtime. However, we focus
                 primarily on sequential programs (i.e., single-threaded
                 programs) at runtime in this paper.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Das:2012:ELC,
  author =       "Dibyendu Das and B. Dupont {De Dinechin} and
                 Ramakrishna Upadrasta",
  title =        "Efficient liveness computation using merge sets and
                 {DJ}-graphs",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086706",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this work we devise an efficient algorithm that
                 computes the liveness information of program variables.
                 The algorithm employs SSA form and DJ-graphs as
                 representation to build Merge sets. The Merge set of
                 node n, M(n) is based on the structure of the Control
                 Flow Graph (CFG) and consists of all nodes where a
                 $\phi$-function needs to be placed, if a definition of
                 a variable appears in n. The merge sets of a CFG can be
                 computed using DJ-graphs without prior knowledge of how
                 the variables are used and defined. Later, we can
                 answer the liveness query (as a part of other
                 optimization or analysis phase) by utilizing the
                 knowledge of the use/def of variables, the dominator
                 tree and the pre-computed merge sets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Patsilaras:2012:EEM,
  author =       "George Patsilaras and Niket K. Choudhary and James
                 Tuck",
  title =        "Efficiently exploiting memory level parallelism on
                 asymmetric coupled cores in the dark silicon era",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "28:1--28:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086707",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Extracting high memory-level parallelism (MLP) is
                 essential for speeding up single-threaded applications
                 which are memory bound. At the same time, the projected
                 amount of dark silicon (the fraction of the chip
                 powered off) on a chip is growing. Hence, Asymmetric
                 Multicore Processors (AMP) offer a unique opportunity
                 to integrate many types of cores, each powered at
                 different times, in order to optimize for different
                 regions of execution. In this work, we quantify the
                 potential for exploiting core customization to speedup
                 programs during regions of high MLP. Based on a careful
                 design space exploration, we discover that an AMP that
                 includes a narrow and fast specialized core has the
                 potential to efficiently exploit MLP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Malits:2012:ELG,
  author =       "Roman Malits and Evgeny Bolotin and Avinoam Kolodny
                 and Avi Mendelson",
  title =        "Exploring the limits of {GPGPU} scheduling in control
                 flow bound applications",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086708",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GPGPUs are optimized for graphics, for that reason the
                 hardware is optimized for massively data parallel
                 applications characterized by predictable memory access
                 patterns and little control flow. For such
                 applications, e.g., matrix multiplication, GPGPU based
                 system can achieve very high performance. However, many
                 general purpose data parallel applications are
                 characterized as having intensive control flow and
                 unpredictable memory access patterns. Optimizing the
                 code in such problems for current hardware is often
                 ineffective and even impractical since it exhibits low
                 hardware utilization leading to relatively low
                 performance. This work tracks the root causes of
                 execution inefficacies when running control flow
                 intensive CUDA applications on NVIDIA GPGPU hardware.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Orosa:2012:FIF,
  author =       "Lois Orosa and Elisardo Antelo and Javier D.
                 Bruguera",
  title =        "{FlexSig}: {Implementing} flexible hardware
                 signatures",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "30:1--30:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086709",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the advent of chip multiprocessors, new
                 techniques have been developed to make parallel
                 programming easier and more reliable. New parallel
                 programming paradigms and new methods of making the
                 execution of programs more efficient and more reliable
                 have been developed. Usually, these improvements
                 require hardware support to avoid a system slowdown.
                 Signatures based on Bloom filters are widely used as
                 hardware support for parallel programming in chip
                 multiprocessors. Signatures are used in Transactional
                 Memory, thread-level speculation, parallel debugging,
                 deterministic replay and other tools and applications.
                 The main limitation of hardware signatures is the lack
                 of flexibility: if signatures are designed with a given
                 configuration, tailored to the requirements of a
                 specific tool or application, it is likely that they do
                 not fit well for other different requirements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Titos-Gil:2012:HTM,
  author =       {Ruben Titos-Gil and Manuel E. Acacio and Jose M.
                 Garcia and Tim Harris and Adrian Cristal and Osman
                 Unsal and Ibrahim Hur and Mateo Valero},
  title =        {Hardware transactional memory with software-defined
                 conflicts},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {31:1--31:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086710},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In this paper we investigate the benefits of turning
                 the concept of transactional conflict from its
                 traditionally fixed definition into a variable one that
                 can be dynamically controlled in software. We propose
                 the extension of the atomic language construct with an
                 attribute that specifies the definition of conflict, so
                 that programmers can write code which adjusts what
                 kinds of conflicts are to be detected, relaxing or
                 tightening the conditions according to the forms of
                 interference that can be tolerated by a particular
                 algorithm. Using this performance-motivated construct,
                 specific conflict information can be associated with
                 portions of code, as each transaction is provided with
                 a local definition that applies while it executes.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {31},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Kim:2012:IPN,
  author =       {Yongjoo Kim and Jongeun Lee and Toan X. Mai and
                 Yunheung Paek},
  title =        {Improving performance of nested loops on
                 reconfigurable array processors},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {32:1--32:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086711},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Pipelining algorithms are typically concerned with
                 improving only the steady-state performance, or the
                 kernel time. The pipeline setup time happens only once
                 and therefore can be negligible compared to the kernel
                 time. However, for Coarse-Grained Reconfigurable
                 Architectures (CGRAs) used as a coprocessor to a main
                 processor, pipeline setup can take much longer due to
                 the communication delay between the two processors, and
                 can become significant if it is repeated in an outer
                 loop of a loop nest. In this paper we evaluate the
                 overhead of such non-kernel execution times when
                 mapping nested loops for CGRAs, and propose a novel
                 architecture-compiler cooperative scheme to reduce the
                 overhead, while also minimizing the number of extra
                 configurations required.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {32},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Purnaprajna:2012:MWI,
  author =       {Madhura Purnaprajna and Paolo Ienne},
  title =        {Making wide-issue {VLIW} processors viable on
                 {FPGAs}},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {33:1--33:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086712},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Soft and highly-customized processors are emerging as
                 a common way to efficiently control large amount of
                 computing resources available on FPGAs. Yet, some
                 processor architectures of choice for DSP and media
                 applications, such as wide-issue VLIW processors,
                 remain impractical: the multi-ported register file
                 makes a very inefficient use of the resources in the
                 FPGA fabric. This paper proposes modifications to
                 existing FPGAs to make soft-VLIW processor viable. We
                 introduce an embedded multi-ported RAM that can be
                 customized to match the issue-width of VLIW processors.
                 To ascertain the benefits of this approach, we map an
                 extensible VLIW processor onto a standard FPGA from
                 Xilinx.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {33},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Radojkovic:2012:EIS,
  author =       {Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud
                 Grasset and Eduardo Qui{\~n}ones and Sami Yehia and
                 Francisco J. Cazorla},
  title =        {On the evaluation of the impact of shared resources in
                 multithreaded {COTS} processors in time-critical
                 environments},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {34:1--34:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086713},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Commercial Off-The-Shelf (COTS) processors are now
                 commonly used in real-time embedded systems. The
                 characteristics of these processors fulfill system
                 requirements in terms of time-to-market, low cost, and
                 high performance-per-watt ratio. However, multithreaded
                 (MT) processors are still not widely used in real-time
                 systems because the timing analysis is too complex. In
                 MT processors, simultaneously-running tasks share and
                 compete for processor resources, so the timing analysis
                 has to estimate the possible impact that the inter-task
                 interferences have on the execution time of the
                 applications. In this paper, we propose a method that
                 quantifies the slowdown that simultaneously-running
                 tasks may experience due to collision in shared
                 processor resources.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {34},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Domnitser:2012:NMC,
  author =       {Leonid Domnitser and Aamer Jaleel and Jason Loew and
                 Nael Abu-Ghazaleh and Dmitry Ponomarev},
  title =        {Non-monopolizable caches: Low-complexity mitigation of
                 cache side channel attacks},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {35:1--35:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086714},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {We propose a flexibly-partitioned cache design that
                 either drastically weakens or completely eliminates
                 cache-based side channel attacks. The proposed
                 Non-Monopolizable (NoMo) cache dynamically reserves
                 cache lines for active threads and prevents other
                 co-executing threads from evicting reserved lines.
                 Unreserved lines remain available for dynamic sharing
                 among threads. NoMo requires only simple modifications
                 to the cache replacement logic, making it
                 straightforward to adopt. It requires no software
                 support enabling it to automatically protect
                 pre-existing binaries. NoMo results in performance
                 degradation of about 1\% on average. We demonstrate
                 that NoMo can provide strong security guarantees for
                 the AES and Blowfish encryption algorithms.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {35},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Rico:2012:SLS,
  author =       {Alejandro Rico and Felipe Cabarcas and Carlos
                 Villavieja and Milan Pavlovic and Augusto Vega and Yoav
                 Etsion and Alex Ramirez and Mateo Valero},
  title =        {On the simulation of large-scale architectures using
                 multiple application abstraction levels},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {36:1--36:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086715},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Simulation is a key tool for computer architecture
                 research. In particular, cycle-accurate simulators are
                 extremely important for microarchitecture exploration
                 and detailed design decisions, but they are slow and,
                 so, not suitable for simulating large-scale
                 architectures, nor are they meant for this. Moreover,
                 microarchitecture design decisions are irrelevant, or
                 even misleading, for early processor design stages and
                 high-level explorations. This allows one to raise the
                 abstraction level of the simulated architecture, and
                 also the application abstraction level, as it does not
                 necessarily have to be represented as an instruction
                 stream. In this paper we introduce a definition of
                 different application abstraction levels, and how these
                 are employed in TaskSim, a multi-core architecture
                 simulator, to provide several architecture modeling
                 abstractions, and simulate large-scale architectures
                 with hundreds of cores.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {36},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Saidi:2012:OED,
  author =       {Selma Saidi and Pranav Tendulkar and Thierry Lepley
                 and Oded Maler},
  title =        {Optimizing explicit data transfers for data parallel
                 applications on the {Cell} architecture},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {37:1--37:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086716},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In this paper we investigate a general approach to
                 automate some deployment decisions for a certain class
                 of applications on multi-core computers. We consider
                 data-parallelizable programs that use the well-known
                 double buffering technique to bring the data from the
                 off-chip slow memory to the local memory of the cores
                 via a DMA (direct memory access) mechanism. Based on
                 the computation time and size of elementary data items
                 as well as DMA characteristics, we derive optimal and
                 near optimal values for the number of blocks that
                 should be clustered in a single DMA command. We then
                 extend the results to the case where a computation for
                 one data item needs some data in its neighborhood.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {37},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Feng:2012:PPL,
  author =       {Min Feng and Changhui Lin and Rajiv Gupta},
  title =        {{PLDS}: Partitioning linked data structures for
                 parallelism},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {38:1--38:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086717},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Recently, parallelization of computations in the
                 presence of dynamic data structures has shown promising
                 potential. In this paper, we present PLDS, a system for
                 easily expressing and efficiently exploiting
                 parallelism in computations that are based on dynamic
                 linked data structures. PLDS improves the execution
                 efficiency by providing support for data partitioning
                 and then distributing computation across threads based
                 on the partitioning. Such computations often require
                 the use of speculation to exploit dynamic parallelism.
                 PLDS supports a conditional speculation mechanism that
                 reduces the cost of speculation. PLDS can be employed
                 in the context of different forms of parallelism, which
                 to cover a wide range of parallel applications.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {38},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Pradelle:2012:PPB,
  author =       {Benoit Pradelle and Alain Ketterlin and Philippe
                 Clauss},
  title =        {Polyhedral parallelization of binary code},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {39:1--39:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086718},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Many automatic software parallelization systems have
                 been proposed in the past decades, but most of them are
                 dedicated to source-to-source transformations. This
                 paper shows that parallelizing executable programs is
                 feasible, even if they require complex transformations,
                 and in effect decouples parallelization from
                 compilation, for example, for closed-source or legacy
                 software, where binary code is the only available
                 representation. We propose an automatic parallelizer,
                 which is able to perform advanced parallelization on
                 binary code. It first parses the binary code and
                 extracts high-level information. From this information,
                 a C program is generated. This program captures only a
                 subset of the program semantics, namely, loops and
                 memory accesses.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {39},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Dong:2012:RAE,
  author =       "Yaozu Dong and Yu Chen and Zhenhao Pan and Jinquan Dai
                 and Yunhong Jiang",
  title =        "{ReNIC}: Architectural extension to {SR-IOV} {I/O}
                 virtualization for efficient replication",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "40:1--40:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086719",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Virtualization is gaining popularity in cloud
                 computing and has become the key enabling technology in
                 cloud infrastructure. By replicating the virtual server
                 state to multiple independent platforms, virtualization
                 improves the reliability and availability of cloud
                 systems. Unfortunately, existing Virtual Machine (VM)
                 replication solutions were designed only for software
                 virtualized I/O, which suffers from large performance
                 and scalability overheads. Although hardware-assisted
                 I/O virtualization (such as SR-IOV) can achieve close
                 to native performance and very good scalability, they
                 cannot be properly replicated across different physical
                 machines due to architectural limitations (such as lack
                 of efficient device state read/write, buffering
                 outbound packets, etc.).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Bruintjes:2012:SLA,
  author =       {Tom M. Bruintjes and Karel H. G. Walters and Sabih H.
                 Gerez and Bert Molenkamp and Gerard J. M. Smit},
  title =        {{Sabrewing}: a lightweight architecture for combined
                 floating-point and integer arithmetic},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {41:1--41:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086720},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In spite of the fact that floating-point arithmetic is
                 costly in terms of silicon area, the joint design of
                 hardware for floating-point and integer arithmetic is
                 seldom considered. While components like multipliers
                 and adders can potentially be shared, floating-point
                 and integer units in contemporary processors are
                 practically disjoint. This work presents a new
                 architecture which tightly integrates floating-point
                 and integer arithmetic in a single datapath. It is
                 mainly intended for use in low-power embedded digital
                 signal processors and therefore the following design
                 constraints were important: limited use of pipelining
                 for the convenience of the compiler; maintaining
                 compatibility with existing technology; minimal area
                 and power consumption for applicability in embedded
                 systems.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {41},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Kicherer:2012:SPA,
  author =       {Mario Kicherer and Fabian Nowak and Rainer Buchty and
                 Wolfgang Karl},
  title =        {Seamlessly portable applications: Managing the
                 diversity of modern heterogeneous systems},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {42:1--42:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086721},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Nowadays, many possible configurations of
                 heterogeneous systems exist, posing several new
                 challenges to application development: different types
                 of processing units usually require individual
                 programming models with dedicated runtime systems and
                 accompanying libraries. If these are absent on an
                 end-user system, e.g. because the respective hardware
                 is not present, an application linked against these
                 will break. This handicaps portability of applications
                 being developed on one system and executed on other,
                 differently configured heterogeneous systems. Moreover,
                 the individual profit of different processing units is
                 normally not known in advance. In this work, we propose
                 a technique to effectively decouple applications from
                 their accelerator-specific parts, respectively code.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {42},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Premillieu:2012:SSR,
  author =       {Nathanael Premillieu and Andre Seznec},
  title =        {{SYRANT}: {SYmmetric Resource Allocation on Not-taken
                 and Taken} paths},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {43:1--43:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086722},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In the multicore era, achieving ultimate single
                 process performance is still an issue e.g. for single
                 process workload or for sequential sections in parallel
                 applications. Unfortunately, despite tremendous
                 research effort on branch prediction, substantial
                 performance potential is still wasted due to branch
                 mispredictions. On a branch misprediction resolution,
                 instruction treatment on the wrong path is essentially
                 thrown away. However, in most cases after a conditional
                 branch, the taken and the not-taken paths of execution
                 merge after a few instructions. Instructions that
                 follow the reconvergence point are executed whatever
                 the branch outcome is. We present SYRANT (SYmmetric
                 Resource Allocation on Not-taken and Taken paths), a
                 new technique for exploiting control independence.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {43},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Hasenplaugh:2012:GBC,
  author =       {William Hasenplaugh and Pritpal S. Ahuja and Aamer
                 Jaleel and Simon {Steely, Jr.} and Joel Emer},
  title =        {The gradient-based cache partitioning algorithm},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {44:1--44:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086723},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {This paper addresses the problem of partitioning a
                 cache between multiple concurrent threads and in the
                 presence of hardware prefetching. Cache replacement
                 designed to preserve temporal locality (e.g., LRU) will
                 allocate cache resources proportional to the miss-rate
                 of each competing thread irrespective of whether the
                 cache space will be utilized [Qureshi and Patt 2006].
                 This is clearly suboptimal as applications vary
                 dramatically in their use of recently accessed data. We
                 address this problem by partitioning a shared cache
                 such that a global goodness metric is optimized. This
                 paper introduces the Gradient-based Cache Partitioning
                 Algorithm (GPA), whose variants optimize either
                 hitrate, total instructions per cycle (IPC) or a
                 weighted IPC metric designed to enforce Quality of
                 Service (QoS) [Iyer 2004].},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {44},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Lira:2012:MPA,
  author =       {Javier Lira and Timothy M. Jones and Carlos Molina and
                 Antonio Gonz{\'a}lez},
  title =        {The migration prefetcher: Anticipating data promotion
                 in dynamic {NUCA} caches},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {45:1--45:??},
  month =        jan,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2086696.2086724},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {The exponential increase in multicore processor (CMP)
                 cache sizes accompanied by growing on-chip wire delays
                 make it difficult to implement traditional caches with
                 a single, uniform access latency. Non-Uniform Cache
                 Architecture (NUCA) designs have been proposed to
                 address this problem. A NUCA divides the whole cache
                 memory into smaller banks and allows banks nearer a
                 processor core to have lower access latencies than
                 those further away, thus mitigating the effects of the
                 cache's internal wires. Determining the best placement
                 for data in the NUCA cache at any particular moment
                 during program execution is crucial for exploiting the
                 benefits that this architecture provides.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Archit. Code Optim.},
  articleno =    {45},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% The capitalized name at the start of the title is brace-protected so
%%% that bibliography styles which downcase titles keep it intact.
@Article{Pusukuri:2012:TTD,
  author =       "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
                 Bhuyan",
  title =        "{Thread Tranquilizer}: Dynamically reducing
                 performance variation",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086725",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To realize the performance potential of multicore
                 systems, we must effectively manage the interactions
                 between memory reference behavior and the operating
                 system policies for thread scheduling and migration
                 decisions. We observe that these interactions lead to
                 significant variations in the performance of a given
                 application, from one execution to the next, even when
                 the program input remains unchanged and no other
                 applications are being run on the system. Our
                 experiments with multithreaded programs, including the
                 TATP database application, SPECjbb2005, and a subset of
                 PARSEC and SPEC OMP programs, on a 24-core Dell
                 PowerEdge R905 server running OpenSolaris confirms the
                 above observation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhang:2012:TPB,
  author =       "Dongsong Zhang and Deke Guo and Fangyuan Chen and Fei
                 Wu and Tong Wu and Ting Cao and Shiyao Jin",
  title =        "{TL}-plane-based multi-core energy-efficient real-time
                 scheduling algorithm for sporadic tasks",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "47:1--47:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086726",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As the energy consumption of multi-core systems
                 becomes increasingly prominent, it's a challenge to
                 design an energy-efficient real-time scheduling
                 algorithm in multi-core systems for reducing the system
                 energy consumption while guaranteeing the feasibility
                 of real-time tasks. In this paper, we focus on
                 multi-core processors, with the global Dynamic Voltage
                 Frequency Scaling (DVFS) and Dynamic Power Management
                 (DPM) technologies. In this setting, we propose an
                 energy-efficient real-time scheduling algorithm, the
                 Time Local remaining execution plane based Dynamic
                 Voltage Frequency Scaling (TL-DVFS). TL-DVFS utilizes
                 the concept of Time Local remaining execution (TL)
                 plane to dynamically scale the voltage and frequency of
                 a processor at the initial time of each TL plane as
                 well as at the release time of a sporadic task in each
                 TL plane.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lyons:2012:ASS,
  author =       "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
                 and David Brooks",
  title =        "The accelerator store: a shared memory framework for
                 accelerator-based systems",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "48:1--48:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086727",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This paper presents the many-accelerator architecture,
                 a design approach combining the scalability of
                 homogeneous multi-core architectures and
                 system-on-chip's high performance and power-efficient
                 hardware accelerators. In preparation for systems
                 containing tens or hundreds of accelerators, we
                 characterize a diverse pool of accelerators and find
                 each contains significant amounts of SRAM memory (up to
                 90\% of their area). We take advantage of this
                 discovery and introduce the accelerator store, a
                 scalable architectural component to minimize
                 accelerator area by sharing its memories between
                 accelerators. We evaluate the accelerator store for two
                 applications and find significant system area
                 reductions (30\%) in exchange for small overheads (2\%
                 performance, 0\%--8\% energy).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Orozco:2012:THT,
  author =       "Daniel Orozco and Elkin Garcia and Rishi Khan and
                 Kelly Livingston and Guang R. Gao",
  title =        "Toward high-throughput algorithms on many-core
                 architectures",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "49:1--49:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086728",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Advanced many-core CPU chips already have a few
                 hundreds of processing cores (e.g., 160 cores in an IBM
                 Cyclops-64 chip) and more and more processing cores
                 become available as computer architecture progresses.
                 The underlying runtime systems of such architectures
                 need to efficiently serve hundreds of processors at the
                 same time, requiring all basic data structures within
                 the runtime to maintain unprecedented throughput. In
                 this paper, we analyze the throughput requirements that
                 must be met by algorithms in runtime systems to be able
                 to handle hundreds of simultaneous operations in real
                 time. We reach a surprising conclusion: Many
                 traditional algorithm techniques are poorly suited for
                 highly parallel computing environments because of their
                 low throughput.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Stock:2012:UML,
  author =       "Kevin Stock and Louis-No{\"e}l Pouchet and P.
                 Sadayappan",
  title =        "Using machine learning to improve automatic
                 vectorization",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "50:1--50:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086729",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Automatic vectorization is critical to enhancing
                 performance of compute-intensive programs on modern
                 processors. However, there is much room for improvement
                 over the auto-vectorization capabilities of current
                 production compilers through careful vector-code
                 synthesis that utilizes a variety of loop
                 transformations (e.g., unroll-and-jam, interchange,
                 etc.). As the set of transformations considered is
                 increased, the selection of the most effective
                 combination of transformations becomes a significant
                 challenge: Currently used cost models in vectorizing
                 compilers are often unable to identify the best
                 choices. In this paper, we address this problem using
                 machine learning models to predict the performance of
                 SIMD codes. In contrast to existing approaches that
                 have used high-level features of the program, we
                 develop machine learning models based on features
                 extracted from the generated assembly code.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Therdsteerasukdi:2012:URI,
  author =       "Kanit Therdsteerasukdi and Gyungsu Byun and Jason Cong
                 and M. Frank Chang and Glenn Reinman",
  title =        "Utilizing {RF-I} and intelligent scheduling for better
                 throughput\slash watt in a mobile {GPU} memory system",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "51:1--51:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086730",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Smartphones and tablets are becoming more and more
                 powerful, replacing desktops and laptops as the users'
                 main computing system. As these systems support higher
                 and higher resolutions with more complex 3D graphics, a
                 high-throughput and low-power memory system is
                 essential for the mobile GPU. In this article, we
                 propose to improve throughput/watt in a mobile GPU
                 memory system by using intelligent scheduling to reduce
                 power and multi-band radio frequency interconnect
                 (MRF-I) to offset any throughput degradation caused by
                 our intelligent scheduling.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ryckbosch:2012:VSM,
  author =       "Frederick Ryckbosch and Stijn Polfliet and Lieven
                 Eeckhout",
  title =        "{VSim}: Simulating multi-server setups at near native
                 hardware speed",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086731",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Simulating contemporary computer systems is a
                 challenging endeavor, especially when it comes to
                 simulating high-end setups involving multiple servers.
                 The simulation environment needs to run complete
                 software stacks, including operating systems,
                 middleware, and application software, and it needs to
                 simulate network and disk activity next to CPU
                 performance. In addition, it needs the ability to scale
                 out to a large number of server nodes while attaining
                 good accuracy and reasonable simulation speeds. This
                 paper presents VSim, a novel simulation methodology for
                 multi-server systems. VSim leverages virtualization
                 technology for simulating a target system on a host
                 system. VSim controls CPU, network and disk performance
                 on the host, and it gives the illusion to the software
                 stack to run on a target system through time
                 dilation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhou:2012:WAP,
  author =       "Miao Zhou and Yu Du and Bruce Childers and Rami Melhem
                 and Daniel Moss{\'e}",
  title =        "Writeback-aware partitioning and replacement for
                 last-level caches in phase change main memory systems",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086732",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Phase-Change Memory (PCM) has emerged as a promising
                 low-power main memory candidate to replace DRAM. The
                 main problems of PCM are that writes are much slower
                 and more power hungry than reads, write bandwidth is
                 much lower than read bandwidth, and limited write
                 endurance. Adding an extra layer of cache, which is
                 logically the last-level cache (LLC), can mitigate the
                 drawbacks of PCM. However, writebacks from the LLC
                 might (a) overwhelm the limited PCM write bandwidth and
                 stall the application, (b) shorten lifetime, and (c)
                 increase energy consumption. Cache partitioning and
                 replacement schemes are important to achieve high
                 throughput for multi-core systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2012:TMA,
  author =       "Qingping Wang and Sameer Kulkarni and John Cavazos and
                 Michael Spear",
  title =        "A transactional memory with automatic performance
                 tuning",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086733",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A significant obstacle to the acceptance of
                 transactional memory (TM) in real-world parallel
                 programs is the abundance of substantially different TM
                 algorithms. Each TM algorithm appears well-suited to
                 certain workload characteristics, but the best choice
                 of algorithm is sensitive to program inputs, available
                 cores, and program phases. Furthermore, operating
                 system and hardware characteristics can affect which
                 algorithm is best, with tradeoffs changing across
                 iterations of a single ISA. This paper introduces
                 methods for constructing policies to dynamically select
                 the most appropriate TM algorithm based on static and
                 dynamic information. We leverage intraprocedural static
                 analysis to create a static profile of the
                 application.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bogdanski:2012:SFC,
  author =       "Bartosz Bogdanski and Sven-Arne Reinemo and Frank Olaf
                 Sem-Jacobsen and Ernst Gunnar Gran",
  title =        "{sFtree}: a fully connected and deadlock-free
                 switch-to-switch routing algorithm for fat-trees",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086734",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Existing fat-tree routing algorithms fully exploit the
                 path diversity of a fat-tree topology in the context of
                 compute node traffic, but they lack support for
                 deadlock-free and fully connected switch-to-switch
                 communication. Such support is crucial for efficient
                 system management, for example, in InfiniBand (IB)
                 systems. With the general increase in system management
                 capabilities found in modern InfiniBand switches, the
                 lack of deadlock-free switch-to-switch communication is
                 a problem for fat-tree-based IB installations because
                 management traffic might cause routing deadlocks that
                 bring the whole system down. This lack of deadlock-free
                 communication affects all system management and
                 diagnostic tools using LID routing. In this paper, we
                 propose the sFtree routing algorithm that guarantees
                 deadlock-free and fully connected switch-to-switch
                 communication in fat-trees while maintaining the
                 properties of the current fat-tree algorithm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ghandour:2012:LSB,
  author =       "Walid J. Ghandour and Haitham Akkary and Wes Masri",
  title =        "Leveraging Strength-Based Dynamic Information Flow
                 Analysis to Enhance Data Value Prediction",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133383",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Value prediction is a technique to increase
                 parallelism by attempting to overcome serialization
                 constraints caused by true data dependences. By
                 predicting the outcome of an instruction before it
                 executes, value prediction allows data dependent
                 instructions to issue and execute speculatively, hence
                 increasing parallelism when the prediction is correct.
                 In case of a misprediction, the execution is redone
                 with the corrected value. If the benefit from increased
                 parallelism outweighs the misprediction recovery
                 penalty, overall performance could be improved.
                 Enhancing performance with value prediction therefore
                 requires highly accurate prediction methods. Most
                 existing general value prediction techniques are local,
                 that is, future outputs of an instruction are predicted
                 based on outputs from previous executions of the same
                 instruction. In this article, we investigate leveraging
                 strength-based dynamic information flow analysis to
                 enhance data value prediction. We use dynamic
                 information flow analysis (DIFA) to determine when a
                 specific value predictor can perform well and even
                 outperform other predictors. We apply information
                 theory to mathematically prove the validity and
                 benefits of correlating value predictors. We also
                 introduce the concept of the linear value predictors, a
                 new technique that predicts a new value from another
                 one using a linear relation. We finally present a
                 variant of stride predictor that we call update stride.
                 We then conduct an empirical analysis using Pin, a
                 dynamic binary instrumentation tool, and DynFlow, a
                 dynamic information flow analysis tool, that we apply
                 to programs from the SPECjvm2008 and Siemens
                 benchmarks. Our empirical measurements support our
                 mathematical theory and allow us to make important
                 observations on the relation between predictability of
                 data values and information flow. Our analysis and
                 empirical results show that the values of a set of
                 selected variables can be predicted with a very high
                 accuracy, up to 100\%. Such prediction is based on the
                 previous history and/or the values of one or more other
                 source variables that have strong information flow into
                 the predicted variable. Using our selection criteria,
                 we show that a DIFA-directed predictor outperforms
                 hardware value prediction for all subject programs, and
                 sometimes by a significant margin. This was observed
                 even when using an ideal tagged hardware value
                 prediction table that does not suffer from aliasing or
                 capacity misses.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2012:WPW,
  author =       "Jaekyu Lee and Hyesoon Kim and Richard Vuduc",
  title =        "When Prefetching Works, When It Doesn't, and Why",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133384",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In emerging and future high-end processor systems,
                 tolerating increasing cache miss latency and properly
                 managing memory bandwidth will be critical to achieving
                 high performance. Prefetching, in both hardware and
                 software, is among our most important available
                 techniques for doing so; yet, we claim that prefetching
                 is perhaps also the least well-understood. Thus, the
                 goal of this study is to develop a novel, foundational
                 understanding of both the benefits and limitations of
                 hardware and software prefetching. Our study includes:
                 source code-level analysis, to help in understanding
                 the practical strengths and weaknesses of compiler- and
                 software-based prefetching; a study of the synergistic
                 and antagonistic effects between software and hardware
                 prefetching; and an evaluation of hardware prefetching
                 training policies in the presence of software
                 prefetching requests. We use both simulation and
                 measurement on real systems. We find, for instance,
                 that although there are many opportunities for
                 compilers to prefetch much more aggressively than they
                 currently do, there is also a tangible risk of
                 interference with training existing hardware
                 prefetching mechanisms. Taken together, our
                 observations suggest new research directions for
                 cooperative hardware/software prefetching.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mazloom:2012:DTI,
  author =       "Bita Mazloom and Shashidhar Mysore and Mohit Tiwari
                 and Banit Agrawal and Tim Sherwood",
  title =        "Dataflow Tomography: Information Flow Tracking For
                 Understanding and Visualizing Full Systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133385",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "It is not uncommon for modern systems to be composed
                 of a variety of interacting services, running across
                 multiple machines in such a way that most developers do
                 not really understand the whole system. As abstraction
                 is layered atop abstraction, developers gain the
                 ability to compose systems of extraordinary complexity
                 with relative ease. However, many software properties,
                 especially those that cut across abstraction layers,
                 become very difficult to understand in such
                 compositions. The communication patterns involved, the
                 privacy of critical data, and the provenance of
                 information, can be difficult to find and understand,
                 even with access to all of the source code. The goal of
                 Dataflow Tomography is to use the inherent information
                 flow of such systems to help visualize the interactions
                 between complex and interwoven components across
                 multiple layers of abstraction. In the same way that
                 the injection of short-lived radioactive isotopes help
                 doctors trace problems in the cardiovascular system,
                 the use of ``data tagging'' can help developers slice
                 through the extraneous layers of software and pin-point
                 those portions of the system interacting with the data
                 of interest. To demonstrate the feasibility of this
                 approach we have developed a prototype system in which
                 tags are tracked both through the machine and in
                 between machines over the network, and from which novel
                 visualizations of the whole system can be derived. We
                 describe the system-level challenges in creating a
                 working system tomography tool and we qualitatively
                 evaluate our system by examining several example real
                 world scenarios.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ahn:2012:ISE,
  author =       "Jung Ho Ahn and Norman P. Jouppi and Christos
                 Kozyrakis and Jacob Leverich and Robert S. Schreiber",
  title =        "Improving System Energy Efficiency with Memory Rank
                 Subsetting",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133386",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "VLSI process technology scaling has enabled dramatic
                 improvements in the capacity and peak bandwidth of DRAM
                 devices. However, current standard DDRx DIMM memory
                 interfaces are not well tailored to achieve high energy
                 efficiency and performance in modern
                 chip-multiprocessor-based computer systems. Their
                 suboptimal performance and energy inefficiency can have
                 a significant impact on system-wide efficiency since
                 much of the system power dissipation is due to memory
                 power. New memory interfaces, better suited for future
                 many-core systems, are needed. In response, there are
                 recent proposals to enhance the energy efficiency of
                 main-memory systems by dividing a memory rank into
                 subsets, and making a subset rather than a whole rank
                 serve a memory request. We holistically assess the
                 effectiveness of rank subsetting from system-wide
                 performance, energy-efficiency, and reliability
                 perspectives. We identify the impact of rank subsetting
                 on memory power and processor performance analytically,
                 compare two promising rank-subsetting proposals,
                 Multicore DIMM and mini-rank, and verify our analysis
                 by simulating a chip-multiprocessor system using
                 multithreaded and consolidated workloads. We extend the
                 design of Multicore DIMM for high-reliability systems
                 and show that compared with conventional chipkill
                 approaches, rank subsetting can lead to much higher
                 system-level energy efficiency and performance at the
                 cost of additional DRAM devices. This holistic
                 assessment shows that rank subsetting offers compelling
                 alternatives to existing processor-memory interfaces
                 for future DDR systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yang:2012:CGC,
  author =       "Xuejun Yang and Li Wang and Jingling Xue and Qingbo
                 Wu",
  title =        "Comparability Graph Coloring for Optimizing
                 Utilization of Software-Managed Stream Register Files
                 for Stream Processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133387",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The stream processors represent a promising
                 alternative to traditional cache-based general-purpose
                 processors in achieving high performance in stream
                 applications (media and some scientific applications).
                 In a stream programming model for stream processors, an
                 application is decomposed into a sequence of kernels
                 operating on streams of data. During the execution of a
                 kernel on a stream processor, all streams accessed must
                 be communicated through a nonbypassing software-managed
                 on-chip memory, the SRF (Stream Register File).
                 Optimizing utilization of the scarce on-chip memory is
                 crucial for good performance. The key insight is that
                 the interference graphs (IGs) formed by the streams in
                 stream applications tend to be comparability graphs or
                 decomposable into a set of comparability graphs. We
                 present a compiler algorithm for finding optimal or
                 near-optimal colorings, that is, SRF allocations in
                 stream IGs, by computing a maximum spanning forest of
                 the sub-IG formed by long live ranges, if necessary.
                 Our experimental results validate the optimality and
                 near-optimality of our algorithm by comparing it with
                 an ILP solver, and show that our algorithm yields
                 improved SRF utilization over the First-Fit bin-packing
                 algorithm, the best in the literature.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Majumdar:2012:MPE,
  author =       "Abhinandan Majumdar and Srihari Cadambi and Michela
                 Becchi and Srimat T. Chakradhar and Hans Peter Graf",
  title =        "A Massively Parallel, Energy Efficient Programmable
                 Accelerator for Learning and Classification",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133388",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Applications that use learning and classification
                 algorithms operate on large amounts of unstructured
                 data, and have stringent performance constraints. For
                 such applications, the performance of general purpose
                 processors scales poorly with data size because of
                 their limited support for fine-grained parallelism and
                 absence of software-managed caches. The large
                 intermediate data in these applications also limits
                 achievable performance on many-core processors such as
                 GPUs. To accelerate such learning applications, we
                 present a programmable accelerator that can execute
                 multiple learning and classification algorithms. To
                 architect such an accelerator, we profile five
                 representative workloads, and find that their
                 computationally intensive portions can be formulated as
                 matrix or vector operations generating large amounts of
                 intermediate data, which are then reduced by a
                 secondary operation such as array ranking, finding
                 max/min and aggregation. Our proposed accelerator,
                 called MAPLE, has hundreds of simple processing
                 elements (PEs) laid out in a two-dimensional grid, with
                 two key features. First, it uses dynamic in-memory
                 processing where on-chip memory blocks perform the
                 secondary reduction operations. Second, MAPLE uses
                 banked off-chip memory, and organizes its PEs into
                 independent groups each with its own off-chip memory
                 bank. These two features allow MAPLE to scale its
                 performance with data size. We also present an Atom
                 based energy-efficient heterogeneous system with MAPLE
                 as the accelerator that satisfies the application's
                 performance requirements at a lower system power. This
                 article describes the MAPLE architecture, explores its
                 design space with a simulator, illustrates how to
                 automatically map application kernels to the hardware,
                 and presents its performance improvement and energy
                 benefits over classic server-based implementations. We
                 implement a 512-PE FPGA prototype of MAPLE and find
                 that it is 1.5--10x faster than a 2.5 GHz quad-core Xeon
                 processor despite running at a modest 125 MHz clock
                 rate. With MAPLE connected to a 1.6 GHz dual-core Atom,
                 we show an energy improvement of 38--84\% over the Xeon
                 server coupled to a 1.3 GHz 240 core Tesla GPU.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Eyerman:2012:PMJ,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Probabilistic modeling for job symbiosis scheduling on
                 {SMT} processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207223",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Symbiotic job scheduling improves simultaneous
                 multithreading (SMT) processor performance by
                 coscheduling jobs that have ``compatible'' demands on
                 the processor's shared resources. Existing approaches
                 however require a sampling phase, evaluate a limited
                 number of possible coschedules, use heuristics to gauge
                 symbiosis, are rigid in their optimization target, and
                 do not preserve system-level priorities/shares. This
                 article proposes probabilistic job symbiosis modeling,
                 which predicts whether jobs will create positive or
                 negative symbiosis when coscheduled without requiring
                 the coschedule to be evaluated. The model, which uses
                 per-thread cycle stacks computed through a previously
                 proposed cycle accounting architecture, is simple
                 enough to be used in system software. Probabilistic job
                 symbiosis modeling provides six key innovations over
                 prior work in symbiotic job scheduling: (i) it does not
                 require a sampling phase, (ii) it readjusts the job
                 coschedule continuously, (iii) it evaluates a large
                 number of possible coschedules at very low overhead,
                 (iv) it is not driven by heuristics, (v) it can
                 optimize a performance target of interest (e.g., system
                 throughput or job turnaround time), and (vi) it
                 preserves system-level priorities/shares. These
                 innovations make symbiotic job scheduling both
                 practical and effective. Our experimental evaluation,
                 which assumes a realistic scenario in which jobs come
                 and go, reports an average 16\% (and up to 35\%)
                 reduction in job turnaround time compared to the
                 previously proposed SOS (sample, optimize, symbios)
                 approach for a two-thread SMT processor, and an average
                 19\% (and up to 45\%) reduction in job turnaround time
                 for a four-thread SMT processor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Seghir:2012:IAT,
  author =       "Rachid Seghir and Vincent Loechner and Beno{\^\i}t
                 Meister",
  title =        "Integer affine transformations of parametric
                 {$Z$}-polytopes and applications to loop nest
                 optimization",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207224",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The polyhedral model is a well-known compiler
                 optimization framework for the analysis and
                 transformation of affine loop nests. We present a new
                 method to solve a difficult geometric operation that is
                 raised by this model: the integer affine transformation
                 of parametric $Z$-polytopes. The result of such a
                 transformation is given by a worst-case exponential
                 union of $Z$-polytopes. We also propose a polynomial
                 algorithm (for fixed dimension), to count points in
                 arbitrary unions of a fixed number of parametric
                 $Z$-polytopes. We implemented these algorithms and
                 compared them to other existing algorithms, for a set
                 of applications to loop nest analysis and
                 optimization.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yang:2012:UOC,
  author =       "Yi Yang and Ping Xiang and Jingfei Kong and Mike
                 Mantor and Huiyang Zhou",
  title =        "A unified optimizing compiler framework for different
                 {GPGPU} architectures",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207225",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents a novel optimizing compiler for
                 general purpose computation on graphics processing
                 units (GPGPU). It addresses two major challenges of
                 developing high performance GPGPU programs: effective
                 utilization of GPU memory hierarchy and judicious
                 management of parallelism. The input to our compiler is
                 a na{\"\i}ve GPU kernel function, which is functionally
                 correct but without any consideration for performance
                 optimization. The compiler generates two kernels, one
                 optimized for global memories and the other for texture
                 memories. The proposed compilation process is effective
                 for both AMD/ATI and NVIDIA GPUs. The experiments show
                 that our optimized code achieves very high performance,
                 either superior or very close to highly fine-tuned
                 libraries.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jang:2012:ACO,
  author =       "Choonki Jang and Jaejin Lee and Bernhard Egger and
                 Soojung Ryu",
  title =        "Automatic code overlay generation and partially
                 redundant code fetch elimination",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207226",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "There is an increasing interest in explicitly managed
                 memory hierarchies, where a hierarchy of distinct
                 memories is exposed to the programmer and managed
                 explicitly in software. These hierarchies can be found
                 in typical embedded systems and an emerging class of
                 multicore architectures. To run an application that
                 requires more code memory than the available
                 higher-level memory, typically an overlay structure is
                 needed. The overlay structure is generated manually by
                 the programmer or automatically by a specialized
                 linker. Manual code overlaying requires the programmer
                 to deeply understand the program structure for maximum
                 memory savings as well as minimum performance
                 degradation. Although the linker can automatically
                 generate the code overlay structure, its memory savings
                 are limited and it even brings significant performance
                 degradation because traditional techniques do not
                 consider the program context. In this article, we
                 propose an automatic code overlay generation technique
                 that overcomes the limitations of traditional automatic
                 code overlaying techniques. We are dealing with a
                 system context that imposes two distinct constraints:
                 (1) no hardware support for address translation and (2)
                 a spatially and temporally coarse grained faulting
                 mechanism at the function level. Our approach addresses
                 those two constraints as efficiently as possible. Our
                 technique statically computes the Worst-Case Number of
                 Conflict misses (WCNC) between two different code
                 segments using path expressions. Then, it constructs a
                 static temporal relationship graph with the WCNCs and
                 emits an overlay structure for a given higher-level
                 memory size. We also propose an inter-procedural
                 partial redundancy elimination technique that minimizes
                 redundant code copying caused by the generated overlay
                 structure. Experimental results show that our approach
                 is promising.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Abbasi:2012:TSW,
  author =       "Zahra Abbasi and Georgios Varsamopoulos and Sandeep K.
                 S. Gupta",
  title =        "{TACOMA}: Server and workload management in {Internet}
                 data centers considering cooling-computing power
                 trade-off and energy proportionality",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207227",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A two-tier Internet data center management scheme,
                 TACOMA, with thermal-aware server provisioning (TASP)
                 in one tier, and thermal-aware workload distribution
                 (TAWD) in the other is proposed. TASP and TAWD
                 coordinate to maximize the energy savings by leveraging
                 the workload dynamics, at coarse and fine time scale,
                 respectively. TACOMA is aware of the QoS constraints,
                 the energy proportionality of servers, and the
                 potential trade-off between cooling and computing
                 power. The obtained energy savings are a combination of
                 suspending idle servers, using servers at their peak
                 efficiency, and avoiding heat recirculation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lankes:2012:BSP,
  author =       "Andreas Lankes and Thomas Wild and Stefan Wallentowitz
                 and Andreas Herkersdorf",
  title =        "Benefits of selective packet discard in
                 networks-on-chip",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207228",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Today, Network on Chip concepts principally assume
                 inherent lossless operation. Considering that future
                 nanometer CMOS technologies will witness increased
                 sensitivity to all forms of manufacturing and
                 environmental variations (e.g., IR drop, soft errors
                 due to radiation, transient temperature induced timing
                 problems, device aging), efforts to cope with data
                 corruption or packet loss will be unavoidable. Possible
                 counter measures against packet loss are the extension
                 of flits with ECC or the introduction of error
                 detection with retransmission. We propose to make use
                 of the perceived deficiency of packet loss as a
                 feature. By selectively discarding stuck packets in the
                 NoC, a proven practice in computer networks, all types
                 of deadlocks can be resolved. This is especially
                 advantageous for solving the problem of
                 message-dependent deadlocks, which otherwise leads to
                 high costs either in terms of throughput or chip area.
                 Strict ordering, the most popular approach to this
                 problem, results in a significant buffer overhead and a
                 more complex router architecture. In addition, we will
                 show that eliminating local network congestions by
                 selectively discarding individual packets also can
                 improve the effective throughput of the network. The
                 end-to-end retransmission mechanism required for the
                 reliable communication, then also provides lossless
                 communication for the cores.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Luo:2012:DDS,
  author =       "Yangchun Luo and Antonia Zhai",
  title =        "Dynamically dispatching speculative threads to improve
                 sequential execution",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355586",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Efficiently utilizing multicore processors to improve
                 their performance potentials demands extracting
                 thread-level parallelism from the applications. Various
                 novel and sophisticated execution models have been
                 proposed to extract thread-level parallelism from
                 sequential programs. One such execution model,
                 Thread-Level Speculation (TLS), allows potentially
                 dependent threads to execute speculatively in parallel.
                 However, TLS execution is inherently unpredictable, and
                 consequently incorrect speculation could degrade
                 performance for the multicore systems. Existing
                 approaches have focused on using the compilers to
                 select sequential program regions to apply TLS. Our
                 research shows that even the state-of-the-art compiler
                 makes suboptimal decisions, due to the unpredictability
                 of TLS execution. Thus, we propose to dynamically
                 optimize TLS performance. This article describes the
                 design, implementation, and evaluation of a runtime
                 thread dispatching mechanism that adjusts the behaviors
                 of speculative threads based on their efficiency. In
                 the proposed system, speculative threads are monitored
                 by hardware-based performance counters and their
                 performance impact is evaluated with a novel
                 methodology that takes into account various unique TLS
                 characteristics. Thread dispatching policies are
                 devised to adjust the behaviors of speculative threads
                 accordingly. With the help of the runtime evaluation,
                 where and how to create speculative threads is better
                 determined. Evaluated with all the SPEC CPU2000
                 benchmark programs written in C, the dynamic
                 dispatching system outperforms the state-of-the-art
                 compiler-based thread management techniques by 9.4\% on
                 average. Comparing to sequential execution, we achieve
                 1.37X performance improvement on a four-core CMP-based
                 system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cui:2012:EPO,
  author =       "Huimin Cui and Jingling Xue and Lei Wang and Yang Yang
                 and Xiaobing Feng and Dongrui Fan",
  title =        "Extendable pattern-oriented optimization directives",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355587",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Algorithm-specific, that is, semantic-specific
                 optimizations have been observed to bring significant
                 performance gains, especially for a diverse set of
                 multi/many-core architectures. However, current
                 programming models and compiler technologies for the
                 state-of-the-art architectures do not exploit well
                 these performance opportunities. In this article, we
                 propose a pattern-making methodology that enables
                 algorithm-specific optimizations to be encapsulated
                 into ``optimization patterns''. Such optimization
                 patterns are expressed in terms of preprocessor
                 directives so that simple annotations can result in
                 significant performance improvements. To validate this
                 new methodology, a framework, named EPOD, is developed
                 to map these directives into the underlying
                 optimization schemes for a particular architecture. It
                 is difficult to create an exact performance model to
                 determine an optimal or near-optimal optimization
                 scheme (including which optimizations to apply and in
                 which order) for a specific application, due to the
                 complexity of applications and architectures. However,
                 it is tractable to build individual optimization
                 components and let compiler developers synthesize an
                 optimization scheme from these components. Therefore,
                 our EPOD framework provides an Optimization Programming
                 Interface (OPI) for compiler developers to define new
                 optimization schemes. Thus, new patterns can be
                 integrated into EPOD in a flexible manner. We have
                 identified and implemented a number of optimization
                 patterns for three representative computer platforms.
                 Our experimental results show that a pattern-guided
                 compiler can outperform the state-of-the-art compilers
                 and even achieve performance as competitive as
                 hand-tuned code. Therefore, such a pattern-making
                 methodology represents an encouraging direction for
                 domain experts' experience and knowledge to be
                 integrated into general-purpose compilers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lewis:2012:REC,
  author =       "Adam Wade Lewis and Nian-Feng Tzeng and Soumik Ghosh",
  title =        "Runtime energy consumption estimation for server
                 workloads based on chaotic time-series approximation",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355588",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes a runtime model that relates
                 server energy consumption to its overall thermal
                 envelope, using hardware performance counters and
                 experimental measurements. While previous studies have
                 attempted system-wide modeling of server power
                 consumption through subsystem models, our approach is
                 different in that it links system energy input to
                 subsystem energy consumption based on a small set of
                 tightly correlated parameters. The proposed model takes
                 into account processor power, bus activities, and
                 system ambient temperature for real-time prediction on
                 the power consumption of long running jobs. Using the
                 HyperTransport and QuickPath Link structures as case
                 studies and through electrical measurements on example
                 server subsystems, we develop a chaotic time-series
                 approximation for runtime power consumption, arriving
                 at the Chaotic Attractor Predictor (CAP). With
                 polynomial time complexity, CAP exhibits high
                 prediction accuracy, having the prediction errors
                 within 1.6\% (or 3.3\%) for servers based on the
                 HyperTransport bus (or the QuickPath Links), as
                 verified by a set of common processor benchmarks. Our
                 CAP is a superior predictive mechanism over existing
                 linear auto-regressive methods, which require expensive
                 and complex corrective steps to address the nonlinear
                 and chaotic aspects of the underlying physical
                 system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Valero:2012:CRI,
  author =       "Alejandro Valero and Julio Sahuquillo and Salvador
                 Petit and Pedro L{\'o}pez and Jos{\'e} Duato",
  title =        "Combining recency of information with selective random
                 and a victim cache in last-level caches",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355589",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory latency has become an important performance
                 bottleneck in current microprocessors. This problem
                 aggravates as the number of cores sharing the same
                 memory controller increases. To palliate this problem,
                 a common solution is to implement cache hierarchies
                 with large or huge Last-Level Cache (LLC)
                 organizations. LLC memories are implemented with a high
                 number of ways (e.g., 16) to reduce conflict misses.
                 Typically, caches have implemented the LRU algorithm to
                 exploit temporal locality, but its performance goes
                 away from the optimal as the number of ways increases.
                 In addition, the implementation of a strict LRU
                 algorithm is costly in terms of area and power. This
                 article focuses on a family of low-cost replacement
                 strategies, whose implementation scales with the number
                 of ways while maintaining the performance. The proposed
                 strategies track the accessing order for just a few
                 blocks, which cannot be replaced. The victim is
                 randomly selected among those blocks exhibiting poor
                 locality. Although, in general, the random policy helps
                 improving the performance, in some applications the
                 scheme fails with respect to the LRU policy leading to
                 performance degradation. This drawback can be overcome
                 by the addition of a small victim cache of the large
                 LLC. Experimental results show that, using the best
                 version of the family without victim cache, MPKI
                 reduction falls in between 10\% and 11\% compared to a
                 set of the most representative state-of-the-art
                 algorithms, whereas the reduction grows up to 22\% with
                 respect to LRU. The proposal with victim cache achieves
                 speedup improvements, on average, by 4\% compared to
                 LRU. In addition, it reduces dynamic energy, on
                 average, up to 8\%. Finally, compared to the studied
                 algorithms, hardware complexity is largely reduced by
                 the baseline algorithm of the family.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2012:DQM,
  author =       "Bin Li and Li-Shiuan Peh and Li Zhao and Ravi Iyer",
  title =        "Dynamic {QoS} management for chip multiprocessors",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355590",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the continuing scaling of semiconductor
                 technologies, chip multiprocessor (CMP) has become the
                 de facto design for modern high performance computer
                 architectures. It is expected that more and more
                 applications with diverse requirements will run
                 simultaneously on the CMP platform. However, this will
                 exert contention on shared resources such as the last
                 level cache, network-on-chip bandwidth and off-chip
                 memory bandwidth, thus affecting the performance and
                 quality-of-service (QoS) significantly. In this
                 environment, efficient resource sharing and a guarantee
                 of a certain level of performance is highly desirable.
                 Researchers have proposed different frameworks for
                 providing QoS. Most of these frameworks focus on
                 individual resource for QoS management. Coordinated
                 management of multiple QoS-aware shared resources at
                 runtime remains an open problem. Recently, there has
                 been work that proposed a class-of-service based
                 framework to jointly manage cache, NoC and memory
                 resources simultaneously. However, the work allocates
                 shared resources statically at the beginning of
                 application runtime, and does not dynamically track,
                 manage and share shared resources across applications.
                 In this article, we address this limitation by
                 proposing dynamic resource management policies that
                 monitor the resource usage of applications at runtime,
                 then steal resources from the high-priority
                 applications for lower-priority ones. The goal is to
                 maintain the targeted level of performance for
                 high-priority applications while improving the
                 performance of lower-priority applications. We use a PI
                 (Proportional-Integral gain) feedback controller based
                 technique to maintain stability in our framework. Our
                 evaluation results show that our policy can improve
                 performance for lower-priority applications
                 significantly while maintaining the performance for
                 high-priority application, thus demonstrating the
                 effectiveness of our dynamic QoS resource management
                 policy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Xekalakis:2012:MSM,
  author =       "Polychronis Xekalakis and Nikolas Ioannou and Marcelo
                 Cintra",
  title =        "Mixed speculative multithreaded execution models",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355591",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The current trend toward multicore architectures has
                 placed great pressure on programmers and compilers to
                 generate thread-parallel programs. Improved execution
                 performance can no longer be obtained via traditional
                 single-thread instruction level parallelism (ILP), but,
                 instead, via multithreaded execution. One notable
                 technique that facilitates the extraction of parallel
                 threads from sequential applications is thread-level
                 speculation (TLS). This technique allows
                 programmers/compilers to generate threads without
                 checking for inter-thread data and control dependences,
                 which are then transparently enforced by the hardware.
                 Most prior work on TLS has concentrated on thread
                 selection and mechanisms to efficiently support the
                 main TLS operations, such as squashes, data versioning,
                 and commits. This article seeks to enhance TLS
                 functionality by combining it with other speculative
                 multithreaded execution models. The main idea is that
                 TLS already requires extensive hardware support, which
                 when slightly augmented can accommodate other
                 speculative multithreaded techniques. Recognizing that
                 for different applications, or even program phases, the
                 application bottlenecks may be different, it is
                 reasonable to assume that the more versatile a system
                 is, the more efficiently it will be able to execute the
                 given program. Toward this direction, we first show
                 that mixed execution models that combine TLS with
                 Helper Threads (HT), RunAhead execution (RA) and
                 MultiPath execution (MP) perform better than any of the
                 models alone. Based on a simple model that we propose,
                 we show that benefits come from being able to extract
                 additional ILP without harming the TLP extracted by
                 TLS. We then show that by combining all the execution
                 models in a unified one that combines all these
                 speculative multithreaded models, ILP can be further
                 enhanced with only minimal additional cost in
                 hardware.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sharafeddine:2012:DOE,
  author =       "Mageda Sharafeddine and Komal Jothi and Haitham
                 Akkary",
  title =        "Disjoint out-of-order execution processor",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355592",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High-performance superscalar architectures used to
                 exploit instruction level parallelism in single-thread
                 applications have become too complex and power hungry
                 for the multicore processors era. We propose a new
                 architecture that uses multiple small latency-tolerant
                 out-of-order cores to improve single-thread
                 performance. Improving single-thread performance with
                 multiple small out-of-order cores allows designers to
                 place more of these cores on the same die.
                 Consequently, emerging highly parallel applications can
                 take full advantage of the multicore parallel hardware
                 without sacrificing performance of inherently serial
                 and hard to parallelize applications. Our architecture
                 combines speculative multithreading (SpMT) with
                 checkpoint recovery and continual flow pipeline
                 architectures. It splits single-thread program
                 execution into disjoint control and data threads that
                 execute concurrently on multiple cooperating small and
                 latency-tolerant out-of-order cores. Hence we call this
                 style of execution Disjoint Out-of-Order Execution
                 (DOE). DOE uses latency tolerance to overcome
                 performance issues of SpMT caused by interthread data
                 dependences. To evaluate this architecture, we have
                 developed a microarchitecture performance model of DOE
                 based on PTLSim, a simulation infrastructure of the x86
                 instruction set architecture. We evaluate the potential
                 performance of DOE processor architecture using a
                 simple heuristic to fork control independent threads in
                 hardware at the target addresses of future procedure
                 return instructions. Using applications from SpecInt
                 2000, we study DOE under ideal as well as realistic
                 architectural constraints. We discuss the performance
                 impact of key DOE architecture and application
                 variables such as number of cores, interthread data
                 dependences, intercore data communication delay,
                 buffers capacity, and branch mispredictions. Without
                 any DOE specific compiler optimizations, our results
                 show that DOE outperforms conventional SpMT
                 architectures by 15\%, on average. We also show that
                 DOE with four small cores can perform on average
                 equally well to a large superscalar core, consuming
                 about the same power. Most importantly, DOE improves
                 throughput performance by a significant amount over a
                 large superscalar core, up to 2.5 times, when running
                 multitasking applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Andrade:2012:SAW,
  author =       "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
                 Doallo",
  title =        "Static analysis of the worst-case memory performance
                 for irregular codes with indirections",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355593",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Real-time systems are subject to timing constraints,
                 whose upper bound is given by the Worst-Case Execution
                 Time (WCET). Cache memory behavior is difficult to
                 predict analytically and estimating a safe and precise
                 worst-case value is even more challenging. The
                 worst-case memory performance (WCMP) component of the
                 WCET can only be estimated with the precise knowledge
                 of the stream of data addresses accessed by the code,
                 which is determined by the access patterns and the base
                 addresses of the data structures accessed. The
                 regularity of strided access patterns simplifies their
                 analysis, as they are characterized by relatively few
                 parameters, which are often available at compile time.
                 Unfortunately codes may exhibit irregular access
                 patterns, which are much more difficult to statically
                 analyze. As for the base addresses of the data
                 structures, they are not always available at
                 compile-time for many reasons: stack variables,
                 dynamically allocated memory, modules compiled
                 separately, etc. This article addresses these problems
                 by presenting a model that predicts an upper bound of
                 the data cache performance for codes both with
                 regular and irregular access patterns, which is valid
                 for any possible base addresses of the data structures.
                 The model analyzes irregular access patterns due to the
                 presence of indirections in the code and it can provide
                 two kinds of predictions: a safe hard boundary that is
                 suitable for hard real-time systems and a soft boundary
                 whose safeness is not guaranteed but which is valid
                 most of the times. In fact, in all our experiments the
                 number of misses was below the soft boundary predicted
                 by the model. This turns this soft boundary prediction
                 into a valuable tool, particularly for non and soft
                 real-time systems, which tolerate a percentage of the
                 runs exceeding their deadlines.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2012:DIO,
  author =       "Yang Chen and Shuangde Fang and Yuanjie Huang and
                 Lieven Eeckhout and Grigori Fursin and Olivier Temam
                 and Chengyong Wu",
  title =        "Deconstructing iterative optimization",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355594",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iterative optimization is a popular compiler
                 optimization approach that has been studied extensively
                 over the past decade. In this article, we deconstruct
                 iterative optimization by evaluating whether it works
                 across datasets and by analyzing why it works. Up to
                 now, most iterative optimization studies are based on a
                 premise which was never truly evaluated: that it is
                 possible to learn the best compiler optimizations
                 across datasets. In this article, we evaluate this
                 question for the first time with a very large number of
                 datasets. We therefore compose KDataSets, a dataset
                 suite with 1000 datasets for 32 programs, which we
                 release to the public. We characterize the diversity of
                 KDataSets, and subsequently use it to evaluate
                 iterative optimization. For all 32 programs, we find
                 that there exists at least one combination of compiler
                 optimizations that achieves at least 83\% or more of
                 the best possible speedup across all datasets on two
                 widely used compilers (Intel's ICC and GNU's GCC). This
                 optimal combination is program-specific and yields
                 speedups up to 3.75$ \times $ (averaged across datasets
                 of a program) over the highest optimization level of
                 the compilers (-O3 for GCC and -fast for ICC). This
                 finding suggests that optimizing programs across
                 datasets might be much easier than previously
                 anticipated. In addition, we evaluate the idea of
                 introducing compiler choice as part of iterative
                 optimization. We find that it can further improve the
                 performance of iterative optimization because different
                 programs favor different compilers. We also investigate
                 why iterative optimization works by analyzing the
                 optimal combinations. We find that only a handful of
                 optimizations yield most of the speedup. Finally, we
                 show that optimizations interact in a complex and
                 sometimes counterintuitive way through two case
                 studies, which confirms that iterative optimization is
                 an irreplaceable and important compiler strategy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Guha:2012:MOD,
  author =       "Apala Guha and Kim Hazelwood and Mary Lou Soffa",
  title =        "Memory optimization of dynamic binary translators for
                 embedded systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "22:1--22:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355595",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic binary translators (DBTs) are becoming
                 increasingly important because of their power and
                 flexibility. DBT-based services are valuable for all
                 types of platforms. However, the high memory demands of
                 DBTs present an obstacle for embedded systems. Most
                 research on DBT design has a performance focus, which
                 often drives up the DBT memory demand. In this article,
                 we present a memory-oriented approach to DBT design. We
                 consider the class of translation-based DBTs and their
                 sources of memory demand: cached translated code,
                 cached auxiliary code and DBT data structures. We
                 explore aspects of DBT design that impact these memory
                 demand sources and present strategies to mitigate
                 memory demand. We also explore performance
                 optimizations for DBTs that handle memory demand by
                 placing a limit on it, and repeatedly flush
                 translations to stay within the limit, thereby
                 replacing the memory demand problem with a performance
                 degradation problem. Our optimizations that mitigate
                 memory demand improve performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Geraci:2012:TFP,
  author =       "James R. Geraci and Sharon M. Sacco",
  title =        "A transpose-free in-place {SIMD} optimized {FFT}",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "23:1--23:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355596",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A transpose-free in-place SIMD optimized algorithm for
                 the computation of large FFTs is introduced and
                 implemented on the Cell Broadband Engine. Six different
                 FFT implementations of the algorithm using six
                 different data movement methods are described. Their
                 relative performance is compared for input sizes from $
                 2^{17} $ to $ 2^{21} $ complex floating point samples.
                 Large differences in performance are observed among
                 even theoretically equivalent data movement patterns.
                 All six implementations compare favorably with FFTW and
                 other previous FFT implementations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Coppens:2013:FDB,
  author =       "Bart Coppens and Bjorn {De Sutter} and Jonas Maebe",
  title =        "Feedback-driven binary code diversification",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400683",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As described in many blog posts and in the scientific
                 literature, exploits for software vulnerabilities are
                 often engineered on the basis of patches. For example,
                 ``Microsoft Patch Tuesday'' is often followed by
                 ``Exploit Wednesday'' during which yet unpatched
                 systems become vulnerable to patch-based exploits. Part
                 of the patch engineering includes the identification of
                 the vulnerable binary code by means of
                 reverse-engineering tools and diffing add-ons. In this
                 article we present a feedback-driven compiler tool flow
                 that iteratively transforms code until diffing tools
                 become ineffective enough to close the ``Exploit
                 Wednesday'' window of opportunity. We demonstrate the
                 tool's effectiveness on a set of real-world patches and
                 against the latest version of BinDiff.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Fowers:2013:PEC,
  author =       "Jeremy Fowers and Greg Brown and John Wernsing and
                 Greg Stitt",
  title =        "A performance and energy comparison of convolution on
                 {GPUs}, {FPGAs}, and multicore processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "25:1--25:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400684",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent architectural trends have focused on increased
                 parallelism via multicore processors and increased
                 heterogeneity via accelerator devices (e.g.,
                 graphics-processing units, field-programmable gate
                 arrays). Although these architectures have significant
                 performance and energy potential, application designers
                 face many device-specific challenges when choosing an
                 appropriate accelerator or when customizing an
                 algorithm for an accelerator. To help address this
                 problem, in this article we thoroughly evaluate
                 convolution, one of the most common operations in
                 digital-signal processing, on multicores,
                 graphics-processing units, and field-programmable gate
                 arrays. Whereas many previous application studies
                 evaluate a specific usage of an application, this
                 article assists designers with design space exploration
                 for numerous use cases by analyzing effects of
                 different input sizes, different algorithms, and
                 different devices, while also determining
                 Pareto-optimal trade-offs between performance and
                 energy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Rohou:2013:VTI,
  author =       "Erven Rohou and Kevin Williams and David Yuste",
  title =        "Vectorization technology to improve interpreter
                 performance",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "26:1--26:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400685",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In the present computing landscape, interpreters are
                 in use in a wide range of systems. Recent trends in
                 consumer electronics have created a new category of
                 portable, lightweight software applications. Typically,
                 these applications have fast development cycles and
                 short life spans. They run on a wide range of systems
                 and are deployed in a target independent bytecode
                 format over Internet and cellular networks. Their
                 authors are untrusted third-party vendors, and they are
                 executed in secure managed runtimes or virtual
                 machines. Furthermore, due to security policies or
                 development time constraints, these virtual machines
                 often lack just-in-time compilers and rely on
                 interpreted execution. At the other end of the
                 spectrum, interpreters are also a reality in the field
                 of high-performance computations because of the
                 flexibility they provide. The main performance penalty
                 in interpreters arises from instruction dispatch. Each
                 bytecode requires a minimum number of machine
                 instructions to be executed. In this work, we introduce
                 a novel approach for interpreter optimization that
                 reduces instruction dispatch thanks to vectorization
                 technology. We extend the split compilation paradigm to
                 interpreters, thus guaranteeing that our approach
                 exhibits almost no overhead at runtime. We take
                 advantage of the vast research in vectorization and its
                 presence in modern compilers. Complex analyses are
                 performed ahead of time, and their results are conveyed
                 to the executable bytecode. At runtime, the interpreter
                 retrieves this additional information to build the SIMD
                 IR (intermediate representation) instructions that
                 carry the vector semantics. The bytecode language
                 remains unmodified, making this representation
                 compatible with legacy interpreters and previously
                 proposed JIT compilers. We show that this approach
                 drastically reduces the number of instructions to
                 interpret and decreases execution time of vectorizable
                 applications. Moreover, we map SIMD IR instructions to
                 hardware SIMD instructions when available, with a
                 substantial additional improvement. Finally, we finely
                 analyze the impact of our extension on the behavior of
                 the caches and branch predictors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cleary:2013:FAT,
  author =       "Jimmy Cleary and Owen Callanan and Mark Purcell and
                 David Gregg",
  title =        "Fast asymmetric thread synchronization",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400686",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "For most multi-threaded applications, data structures
                 must be shared between threads. Ensuring thread safety
                 on these data structures incurs overhead in the form of
                 locking and other synchronization mechanisms. Where
                 data is shared among multiple threads these costs are
                 unavoidable. However, a common access pattern is that
                 data is accessed primarily by one dominant thread, and
                 only very rarely by the other, non-dominant threads.
                 Previous research has proposed biased locks, which are
                 optimized for a single dominant thread, at the cost of
                 greater overheads for non-dominant threads. In this
                 article we propose a new family of biased
                 synchronization mechanisms that, using a modified
                 interface, push accesses to shared data from the
                 non-dominant threads to the dominant one, via a novel
                 set of message passing mechanisms. We present
                 mechanisms for protecting critical sections, for
                 queueing work, for caching shared data in registers
                 where it is safe to do so, and for asynchronous
                 critical section accesses. We present results for the
                 conventional Intel\reg{} Sandy Bridge processor and for
                 the emerging network-optimized many-core IBM\reg{}
                 PowerEN{\TM} processor. We find that our algorithms
                 compete well with existing biased locking algorithms,
                 and, in particular, perform better than existing
                 algorithms as accesses from non-dominant threads
                 increase.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2013:PTL,
  author =       "Yong Li and Rami Melhem and Alex K. Jones",
  title =        "{PS-TLB}: Leveraging page classification information
                 for fast, scalable and efficient translation for future
                 {CMPs}",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "28:1--28:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400687",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Traversing the page table during virtual to physical
                 address translation causes pipeline stalls when misses
                 occur in the translation-lookaside buffer (TLB).
                 State-of-the-art translation proposals typically
                 optimize a single aspect of translation performance
                 (e.g., translation sharing, context switch performance,
                 etc.) with potential trade-offs of additional hardware
                 complexity, increased translation latency, or reduced
                 scalability. In this article, we propose the partial
                 sharing TLB (PS-TLB), a fast and scalable solution that
                 reduces off-chip translation misses without sacrificing
                 the timing-critical requirement of on-chip translation.
                 We introduce the partial sharing buffer (PSB) which
                 leverages application page sharing characteristics
                 using minimal additional hardware resources. Compared
                 to the leading TLB proposal that leverages sharing,
                 PS-TLB provides a more than 45\% improvement in
                 translation latency with a 9\% application speedup
                 while using fewer storage resources. In addition, the
                 page classification and PS-TLB architecture provide
                 further optimizations including an over 30\% reduction
                 of interprocessor interrupts for coherence, and reduced
                 context switch misses with fewer resources compared
                 with existing methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{DuBois:2013:PTC,
  author =       "Kristof {Du Bois} and Stijn Eyerman and Lieven
                 Eeckhout",
  title =        "Per-thread cycle accounting in multicore processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400688",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "While multicore processors improve overall chip
                 throughput and hardware utilization, resource sharing
                 among the cores leads to unpredictable performance for
                 the individual threads running on a multicore
                 processor. Unpredictable per-thread performance becomes
                 a problem when considered in the context of multicore
                 scheduling: system software assumes that all threads
                 make equal progress, however, this is not what the
                 hardware provides. This may lead to problems at the
                 system level such as missed deadlines, reduced
                 quality-of-service, non-satisfied service-level
                 agreements, unbalanced parallel performance, priority
                 inversion, unpredictable interactive performance, etc.
                 This article proposes a hardware-efficient per-thread
                 cycle accounting architecture for multicore processors.
                 The counter architecture tracks per-thread progress in
                 a multicore processor, detects how inter-thread
                 interference affects per-thread performance, and
                 predicts the execution time for each thread if run in
                 isolation. The counter architecture captures the
                 effects of additional conflict misses due to cache
                 sharing as well as increased latency for other memory
                 accesses due to resource and bandwidth contention in
                 the memory subsystem. The proposed method accounts for
                 74.3\% of the interference cycles, and estimates
                 per-thread progress within 14.2\% on average across a
                 large set of multi-program workloads. Hardware cost is
                 limited to 7.44KB for an 8-core processor, a reduction
                 by almost $ 10 \times $ compared to prior work while
                 being 63.8\% more accurate. Making system software
                 progress aware improves fairness by 22.5\% on average
                 over progress-agnostic scheduling.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wimmer:2013:MAV,
  author =       "Christian Wimmer and Michael Haupt and Michael L. {Van
                 De Vanter} and Mick Jordan and Laurent Dayn{\`e}s and
                 Douglas Simon",
  title =        "{Maxine}: an approachable virtual machine for, and in,
                 {Java}",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "30:1--30:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400689",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A highly productive platform accelerates the
                 production of research results. The design of a Virtual
                 Machine (VM) written in the Java{\TM} programming
                 language can be simplified through exploitation of
                 interfaces, type and memory safety, automated memory
                 management (garbage collection), exception handling,
                 and reflection. Moreover, modern Java IDEs offer
                 time-saving features such as refactoring,
                 auto-completion, and code navigation. Finally, Java
                 annotations enable compiler extensions for low-level
                 ``systems programming'' while retaining IDE
                 compatibility. These techniques collectively make
                 complex system software more ``approachable'' than has
                 been typical in the past. The Maxine VM, a metacircular
                 Java VM implementation, has aggressively used these
                 features since its inception. A co-designed companion
                 tool, the Maxine Inspector, offers integrated debugging
                 and visualization of all aspects of the VM's runtime
                 state. The Inspector's implementation exploits advanced
                 Java language features, embodies intimate knowledge of
                 the VM's design, and even reuses a significant amount
                 of VM code directly. These characteristics make Maxine
                 a highly approachable VM research platform and a
                 productive basis for research and teaching.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Khan:2013:SBA,
  author =       "Malik Khan and Protonu Basu and Gabe Rudy and Mary
                 Hall and Chun Chen and Jacqueline Chame",
  title =        "A script-based autotuning compiler system to generate
                 high-performance {CUDA} code",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "31:1--31:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400690",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents a novel compiler framework for
                 CUDA code generation. The compiler structure is
                 designed to support autotuning, which employs empirical
                 techniques to evaluate a set of alternative mappings of
                 computation kernels and select the mapping that obtains
                 the best performance. This article introduces a
                 Transformation Strategy Generator, a meta-optimizer
                 that generates a set of transformation recipes, which
                 are descriptions of the mapping of the sequential code
                 to parallel CUDA code. These recipes comprise a search
                 space of possible implementations. This system achieves
                 performance comparable and sometimes better than
                 manually tuned libraries and exceeds the performance of
                 a state-of-the-art GPU compiler.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{VanCraeynest:2013:UFD,
  author =       "Kenzo {Van Craeynest} and Lieven Eeckhout",
  title =        "Understanding fundamental design choices in
                 single-{ISA} heterogeneous multicore architectures",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "32:1--32:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400691",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Single-ISA heterogeneous multicore processors have
                 gained substantial interest over the past few years
                 because of their power efficiency, as they offer the
                 potential for high overall chip throughput within a
                 given power budget. Prior work in heterogeneous
                 architectures has mainly focused on how heterogeneity
                 can improve overall system throughput. To what extent
                 heterogeneity affects per-program performance has
                 remained largely unanswered. In this article, we aim at
                 understanding how heterogeneity affects both chip
                 throughput and per-program performance; how
                 heterogeneous architectures compare to homogeneous
                 architectures under both performance metrics; and how
                 fundamental design choices, such as core type, cache
                 size, and off-chip bandwidth, affect performance. We
                 use analytical modeling to explore a large space of
                 single-ISA heterogeneous architectures. The analytical
                 model has linear-time complexity in the number of core
                 types and programs of interest, and offers a unique
                 opportunity for exploring the large space of both
                 homogeneous and heterogeneous multicore processors in
                 limited time. Our analysis provides several interesting
                 insights: While it is true that heterogeneity can
                 improve system throughput, it fundamentally trades
                 per-program performance for chip throughput; although
                 some heterogeneous configurations yield better
                 throughput and per-program performance than homogeneous
                 designs, some homogeneous configurations are optimal
                 for particular throughput versus per-program
                 performance trade-offs. Two core types provide most of
                 the benefits from heterogeneity and a larger number of
                 core types does not contribute much; job-to-core
                 mapping is both important and challenging for
                 heterogeneous multicore processors to achieve optimum
                 performance. Limited off-chip bandwidth does alter some
                 of the fundamental design choices in heterogeneous
                 multicore architectures, such as the need for large
                 on-chip caches for achieving high throughput, and
                 per-program performance degrading more relative to
                 throughput under constrained off-chip bandwidth.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Antao:2013:CFA,
  author =       "Samuel Ant{\~a}o and Leonel Sousa",
  title =        "The {CRNS} framework and its application to
                 programmable and reconfigurable cryptography",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "33:1--33:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400692",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes the Computing with the
                 Residue Number System (CRNS) framework, which aims at
                 the design automation of accelerators for Modular
                 Arithmetic (MA). The framework provides a comprehensive
                 set of tools ranging from a programming language and
                 respective compiler to back-ends targeting parallel
                 computation platforms such as Graphical Processing
                 Units (GPUs) and reconfigurable hardware. Given an
                 input algorithm described with a high-level programming
                 language, the CRNS can be used to obtain in a few
                 seconds the corresponding optimized Parallel Thread
                 Execution (PTX) program ready to be run on GPUs or the
                 Hardware Description Language (HDL) specification of a
                 fully functional accelerator suitable for
                 reconfigurable hardware and embedded systems. The
                 resulting framework's implementations benefit from the
                 Residue Number System (RNS) arithmetic's
                 parallelization properties in a fully automated way.
                 Designers do not need to be familiar with the
                 mathematical details concerning the employed
                 arithmetic, namely the RNS representation. In order to
                 thoroughly describe and evaluate the proposed
                 framework, experimental results obtained for the
                 supported back-ends (GPU and HDL) are presented
                 targeting the implementation of the modular
                 exponentiation used in the Rivest-Shamir-Adleman (RSA)
                 algorithm and Elliptic Curve (EC) point multiplication.
                 Results suggest competitive latency and throughput with
                 minimum design effort and overcoming all the
                 development issues that arise in the specification and
                 verification of dedicated solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Diouf:2013:DLM,
  author =       "Boubacar Diouf and Can Hantas and Albert Cohen and
                 {\"O}zcan {\"O}zturk and Jens Palsberg",
  title =        "A decoupled local memory allocator",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400693",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compilers use software-controlled local memories to
                 provide fast, predictable, and power-efficient access
                 to critical data. We show that the local memory
                 allocation for straight-line, or linearized programs is
                 equivalent to a weighted interval-graph coloring
                 problem. This problem is new when allowing a color
                 interval to ``wrap around,'' and we call it the
                 submarine-building problem. This graph-theoretical
                 decision problem differs slightly from the classical
                 ship-building problem, and exhibits very interesting
                 and unusual complexity properties. We demonstrate that
                 the submarine-building problem is NP-complete, while it
                 is solvable in linear time for not-so-proper interval
                 graphs, an extension of the class of proper interval
                 graphs. We propose a clustering heuristic to
                 approximate any interval graph into a not-so-proper
                 interval graph, decoupling spill code generation from
                 local memory assignment. We apply this heuristic to a
                 large number of randomly generated interval graphs
                 reproducing the statistical features of standard local
                 memory allocation benchmarks, comparing with
                 state-of-the-art heuristics.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cui:2013:LOC,
  author =       "Huimin Cui and Qing Yi and Jingling Xue and Xiaobing
                 Feng",
  title =        "Layout-oblivious compiler optimization for matrix
                 computations",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "35:1--35:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400694",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Most scientific computations serve to apply
                 mathematical operations to a set of preconceived data
                 structures, e.g., matrices, vectors, and grids. In this
                 article, we use a number of widely used matrix
                 computations from the LINPACK library to demonstrate
                 that complex internal organizations of data structures
                 can severely degrade the effectiveness of compiler
                 optimizations. We then present a data-layout-oblivious
                 optimization methodology, where by isolating an
                 abstract representation of the computations from
                 complex implementation details of their data, we enable
                 these computations to be much more accurately analyzed
                 and optimized through varying state-of-the-art compiler
                 technologies. We evaluated our approach on an Intel
                 8-core platform using two source-to-source compiler
                 infrastructures, Pluto and EPOD. Our results show that
                 while the efficiency of a computational kernel differs
                 when using different data layouts, the alternative
                 implementations typically benefit from a common set of
                 optimizations on the operations. Therefore separately
                 optimizing the operations and the data layout of a
                 computation could dramatically enhance the
                 effectiveness of compiler optimizations compared with
                 the conventional approaches of using a unified
                 representation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dolan:2013:CSL,
  author =       "Stephen Dolan and Servesh Muralidharan and David
                 Gregg",
  title =        "Compiler support for lightweight context switching",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400695",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We propose a new language-neutral primitive for the
                 LLVM compiler, which provides efficient context
                 switching and message passing between lightweight
                 threads of control. The primitive, called Swapstack,
                 can be used by any language implementation based on
                 LLVM to build higher-level language structures such as
                 continuations, coroutines, and lightweight threads. As
                 part of adding the primitives to LLVM, we have also
                 added compiler support for passing parameters across
                 context switches. Our modified LLVM compiler produces
                 highly efficient code through a combination of exposing
                 the context switching code to existing compiler
                 optimizations, and adding novel compiler optimizations
                 to further reduce the cost of context switches. To
                 demonstrate the generality and efficiency of our
                 primitives, we add one-shot continuations to C++, and
                 provide a simple fiber library that allows millions of
                 fibers to run on multiple cores, with a work-stealing
                 scheduler and fast inter-fiber synchronization. We argue
                 that compiler-supported lightweight context switching
                 can be significantly faster than using a library to
                 switch between contexts, and provide experimental
                 evidence to support the position.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Abad:2013:LLE,
  author =       "Pablo Abad and Valentin Puente and Jose-Angel
                 Gregorio",
  title =        "{LIGERO}: a light but efficient router conceived for
                 cache-coherent chip multiprocessors",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "37:1--37:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400696",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Although abstraction is the best approach to deal with
                 computing system complexity, sometimes implementation
                 details should be considered. Considering on-chip
                 interconnection networks in particular, underestimating
                 the underlying system specificity could have
                 nonnegligible impact on performance, cost, or
                 correctness. This article presents a very efficient
                 router that has been devised to deal with
                 cache-coherent chip multiprocessor particularities in a
                 balanced way. Employing the same principles of packet
                 rotation structures as in the rotary router, we present
                 a router configuration with the following novel
                 features: (1) reduced buffering requirements, (2)
                 optimized pipeline under contentionless conditions, (3)
                 more efficient deadlock avoidance mechanism, and (4)
                 optimized in-order delivery guarantee. Putting it all
                 together, our proposal provides a set of features that
                 no other router, to the best of our knowledge, has
                 achieved previously. These are: (1') low implementation
                 cost, (2') low pass-through latency under low load,
                 (3') improved resource utilization through adaptive
                 routing and a buffering scheme free of head-of-line
                 blocking, (4') guarantee of coherence protocol
                 correctness via end-to-end deadlock avoidance and
                 in-order delivery, and (5') improvement of coherence
                 protocol responsiveness through adaptive in-network
                 multicast support. We conduct a thorough evaluation
                 that includes hardware cost estimation and performance
                 evaluation under a wide spectrum of realistic workloads
                 and coherence protocols. Comparing our proposal with
                 VCTM, an optimized state-of-the-art wormhole router, it
                 requires 50\% less area, reduces on-chip cache
                 hierarchy energy delay product on average by 20\%, and
                 improves the cache-coherency chip multiprocessor
                 performance under realistic working conditions by up to
                 20\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Albericio:2013:ERL,
  author = {Jorge Albericio and Pablo Ib{\'a}{\~n}ez and V{\'\i}ctor
    Vi{\~n}als and Jose Mar{\'\i}a Llaber{\'\i}a},
  title = {Exploiting reuse locality on inclusive shared last-level caches},
  journal = j-TACO,
  volume = {9},
  number = {4},
  pages = {38:1--38:??},
  month = jan,
  year = {2013},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2400682.2400697},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jan 18 10:57:16 MST 2013},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Optimization of the replacement policy used for Shared
    Last-Level Cache (SLLC) management in a Chip-MultiProcessor (CMP) is
    critical for avoiding off-chip accesses. Temporal locality, while being
    exploited by first levels of private cache memories, is only slightly
    exhibited by the stream of references arriving at the SLLC. Thus,
    traditional replacement algorithms based on recency are bad choices for
    governing SLLC replacement. Recent proposals involve SLLC replacement
    policies that attempt to exploit reuse either by segmenting the
    replacement list or improving the rereference interval prediction. On
    the other hand, inclusive SLLCs are commonplace in the CMP market, but
    the interaction between replacement policy and the enforcement of
    inclusion has barely been discussed. After analyzing that interaction,
    this article introduces two simple replacement policies exploiting
    reuse locality and targeting inclusive SLLCs: Least Recently Reused
    (LRR) and Not Recently Reused (NRR). NRR has the same implementation
    cost as NRU, and LRR only adds one bit per line to the LRU cost. After
    considering reuse locality and its interaction with the invalidations
    induced by inclusion, the proposals are evaluated by simulating
    multiprogrammed workloads in an 8-core system with two private cache
    levels and an SLLC. LRR outperforms LRU by 4.5\% (performing better in
    97 out of 100 mixes) and NRR outperforms NRU by 4.2\% (performing
    better in 99 out of 100 mixes). We also show that our mechanisms
    outperform rereference interval prediction, a recently proposed SLLC
    replacement policy and that similar conclusions can be drawn by varying
    the associativity or the SLLC size.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {38},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Yiapanis:2013:OSR,
  author =       "Paraskevas Yiapanis and Demian Rosas-Ham and Gavin
                 Brown and Mikel Luj{\'a}n",
  title =        "Optimizing software runtime systems for speculative
                 parallelization",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "39:1--39:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400698",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Thread-Level Speculation (TLS) overcomes limitations
                 intrinsic with conservative compile-time
                 auto-parallelizing tools by extracting parallel threads
                 optimistically and only ensuring absence of data
                 dependence violations at runtime. A significant barrier
                 for adopting TLS (implemented in software) is the
                 overheads associated with maintaining speculative
                 state. Based on previous TLS limit studies, we observe
                 that on future multicore systems we will likely have
                 more cores idle than those which traditional TLS would
                 be able to harness. This implies that a TLS system
                 should focus on optimizing for small number of cores
                 and find efficient ways to take advantage of the idle
                 cores. Furthermore, research on optimistic systems has
                 covered two important implementation design points:
                 eager vs.\ lazy version management. With this knowledge,
                 we propose new simple and effective techniques to
                 reduce the execution time overheads for both of these
                 design points. This article describes a novel compact
                 version management data structure optimized for space
                 overhead when using a small number of TLS threads.
                 Furthermore, we describe two novel software runtime
                 parallelization systems that utilize this compact data
                 structure. The first software TLS system, MiniTLS,
                 relies on eager memory data management (in-place
                 updates) and, thus, when a misspeculation occurs a
                 rollback process is required. MiniTLS takes advantage
                 of the novel compact version management representation
                 to parallelize the rollback process and is able to
                 recover from misspeculation faster than existing
                 software eager TLS systems. The second one, Lector
                 (Lazy inspECTOR) is based on lazy version management.
                 Since we have idle cores, the question is whether we
                 can create ``helper'' tasks to determine whether
                 speculation is actually needed without stopping or
                 damaging the speculative execution. In Lector, for each
                 conventional TLS thread running speculatively with lazy
                 version management, there is associated with it a
                 lightweight inspector. The inspector threads execute
                 alongside to verify quickly whether data dependencies
                 will occur. Inspector threads are generated by standard
                 techniques for inspector/executor parallelization. We
                 have applied both TLS systems to seven Java sequential
                 benchmarks, including three benchmarks from
                 SPECjvm2008. Two out of the seven benchmarks exhibit
                 misspeculations. MiniTLS experiments report average
                 speedups of 1.8x for 4 threads increasing close to 7x
                 speedups with 32 threads. Facilitated by our novel
                 compact representation, MiniTLS reduces the space
                 overhead over state-of-the-art software TLS systems
                 between 96\% on 2 threads and 40\% on 32 threads. The
                 experiments for Lector, report average speedups of 1.7x
                 for 2 threads (that is 1 TLS + 1 Inspector threads)
                 increasing close to 8.2x speedups with 32 threads (16 +
                 16 threads). Compared to a well established software
                 TLS baseline, Lector performs on average 1.7x faster
                 for 32 threads and in no case (x TLS + x Inspector
                 threads) Lector delivers worse performance than the
                 baseline TLS with the equivalent number of TLS threads
                 (i.e., x TLS threads) nor doubling the equivalent number
                 of TLS threads (i.e., x + x TLS threads).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nugteren:2013:ASC,
  author = {Cedric Nugteren and Pieter Custers and Henk Corporaal},
  title = {Algorithmic species: a classification of affine loop nests for
    parallel programming},
  journal = j-TACO,
  volume = {9},
  number = {4},
  pages = {40:1--40:??},
  month = jan,
  year = {2013},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2400682.2400699},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jan 18 10:57:16 MST 2013},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Code generation and programming have become ever more
    challenging over the last decade due to the shift towards parallel
    processing. Emerging processor architectures such as multi-cores and
    GPUs exploit increasingly parallelism, requiring programmers and
    compilers to deal with aspects such as threading, concurrency,
    synchronization, and complex memory partitioning. We advocate that
    programmers and compilers can greatly benefit from a structured
    classification of program code. Such a classification can help
    programmers to find opportunities for parallelization, reason about
    their code, and interact with other programmers. Similarly,
    parallelising compilers and source-to-source compilers can take
    threading and optimization decisions based on the same classification.
    In this work, we introduce algorithmic species, a classification of
    affine loop nests based on the polyhedral model and targeted for both
    automatic and manual use. Individual classes capture information such
    as the structure of parallelism and the data reuse. To make the
    classification applicable for manual use, a basic vocabulary forms the
    base for the creation of a set of intuitive classes. To demonstrate the
    use of algorithmic species, we identify 115 classes in a benchmark set.
    Additionally, we demonstrate the suitability of algorithmic species for
    automated uses by showing a tool to automatically extract species from
    program code, a species-based source-to-source compiler, and a
    species-based performance prediction model.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {40},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Gerards:2013:ODD,
  author =       "Marco E. T. Gerards and Jan Kuper",
  title =        "Optimal {DPM} and {DVFS} for frame-based real-time
                 systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "41:1--41:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400700",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic Power Management (DPM) and Dynamic Voltage and
                 Frequency Scaling (DVFS) are popular techniques for
                 reducing energy consumption. Algorithms for optimal
                 DVFS exist, but optimal DPM and the optimal combination
                 of DVFS and DPM are not yet solved. In this article we
                 use well-established models of DPM and DVFS for
                 frame-based systems. We show that it is not
                 sufficient---as some authors argue---to consider only
                 individual invocations of a task. We define a schedule
                 that also takes interactions between invocations into
                 account and prove---in a theoretical fashion---that this
                 schedule is optimal.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yan:2013:IPA,
  author = {Zhichao Yan and Hong Jiang and Yujuan Tan and Dan Feng},
  title = {An integrated pseudo-associativity and relaxed-order approach to
    hardware transactional memory},
  journal = j-TACO,
  volume = {9},
  number = {4},
  pages = {42:1--42:??},
  month = jan,
  year = {2013},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2400682.2400701},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jan 18 10:57:16 MST 2013},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Our experimental study and analysis reveal that the
    bottlenecks of existing hardware transactional memory systems are
    largely rooted in the extra data movements in version management and in
    the inefficient scheduling of conflicting transactions in conflict
    management, particularly in the presence of high-contention and
    coarse-grained applications. In order to address this problem, we
    propose an integrated Pseudo-Associativity and Relaxed-Order approach
    to hardware Transactional Memory, called PARO-TM. It exploits the extra
    pseudo-associative space in the data cache to hold the new value of
    each transactional modification, and maintains the mappings between the
    old and new versions via an implicit pseudo-associative hash algorithm
    (i.e., by inverting the specific bit of the SET index). PARO-TM can
    branch out the speculative version from the old version upon each
    transactional modification on demand without a dedicated hardware
    component to hold the uncommitted data. This means that it is able to
    automatically access the proper version upon the transaction's commit
    or abort. Moreover, PARO-TM augments multi-version support in a chained
    directory to schedule conflicting transactions in a relaxed-order
    manner to further reduce their overheads. We compare PARO-TM with the
    state-of-the-art LogTM-SE, TCC, DynTM, and SUV-TM systems and find that
    PARO-TM consistently outperforms these four representative HTMs. This
    performance advantage of PARO-TM is far more pronounced under the
    high-contention and coarse-grained applications in the STAMP benchmark
    suite, for which PARO-TM is motivated and designed.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {42},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Chen:2013:PGF,
  author =       "Doris Chen and Deshanand Singh",
  title =        "Profile-guided floating- to fixed-point conversion for
                 hybrid {FPGA}-processor applications",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400702",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The key to enabling widespread use of FPGAs for
                 algorithm acceleration is to allow programmers to
                 create efficient designs without the time-consuming
                 hardware design process. Programmers are used to
                 developing scientific and mathematical algorithms in
                 high-level languages (C/C++) using floating point data
                 types. Although easy to implement, the dynamic range
                 provided by floating point is not necessary in many
                 applications; more efficient implementations can be
                 realized using fixed point arithmetic. While this topic
                 has been studied previously [Han et al.\ 2006; Olson et
                 al.\ 1999; Gaffar et al.\ 2004; Aamodt and Chow 1999],
                 the degree of full automation has always been lacking.
                 We present a novel design flow for cases where FPGAs
                 are used to offload computations from a microprocessor.
                 Our LLVM-based algorithm inserts value profiling code
                 into an unmodified C/C++ application to guide its
                 automatic conversion to fixed point. This allows for
                 fast and accurate design space exploration on a host
                 microprocessor before any accelerators are mapped to
                 the FPGA. Through experimental results, we demonstrate
                 that fixed-point conversion can yield resource savings
                 of up to 2x--3x reductions. Embedded RAM usage is
                 minimized, and 13\%--22\% higher $ F_{\rm max} $ than
                 the original floating-point implementation is observed.
                 In a case study, we show that 17\% reduction in logic
                 and 24\% reduction in register usage can be realized by
                 using our algorithm in conjunction with a High-Level
                 Synthesis (HLS) tool.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cui:2013:LCA,
  author = {Yan Cui and Yingxin Wang and Yu Chen and Yuanchun Shi},
  title = {Lock-contention-aware scheduler: a scalable and energy-efficient
    method for addressing scalability collapse on multicore systems},
  journal = j-TACO,
  volume = {9},
  number = {4},
  pages = {44:1--44:??},
  month = jan,
  year = {2013},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2400682.2400703},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jan 18 10:57:16 MST 2013},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {In response to the increasing ubiquity of multicore
    processors, there has been widespread development of multithreaded
    applications that strive to realize their full potential.
    Unfortunately, lock contention within operating systems can limit the
    scalability of multicore systems so severely that an increase in the
    number of cores can actually lead to reduced performance (i.e.,
    scalability collapse). Existing efforts of solving scalability collapse
    mainly focus on making critical sections of kernel code fine-grained or
    designing new synchronization primitives. However, these methods have
    disadvantages in scalability or energy efficiency. In this article, we
    observe that the percentage of lock-waiting time over the total
    execution time for a lock intensive task has a significant correlation
    with the occurrence of scalability collapse. Based on this observation,
    a lock-contention-aware scheduler is proposed. Specifically, each task
    in the scheduler monitors its percentage of lock waiting time
    continuously. If the percentage exceeds a predefined threshold, this
    task is considered as lock intensive and migrated to a Special Set of
    Cores (i.e., SSC). In this way, the number of concurrently running
    lock-intensive tasks is limited to the number of cores in the SSC, and
    therefore, the degree of lock contention is controlled. A central
    challenge of using this scheme is how many cores should be allocated in
    the SSC to handle lock-intensive tasks. In our scheduler, the optimal
    number of cores is determined online by the model-driven search. The
    proposed scheduler is implemented in the recent Linux kernel and
    evaluated using micro- and macrobenchmarks on AMD and Intel 32-core
    systems. Experimental results suggest that our proposal is able to
    remove scalability collapse completely and sustains the maximal
    throughput of the spin-lock-based system for most applications.
    Furthermore, the percentage of lock-waiting time can be reduced by up
    to 84\%. When compared with scalability collapse reduction methods such
    as requester-based locking scheme and sleeping-based synchronization
    primitives, our scheme exhibits significant advantages in scalability,
    power consumption, and energy efficiency.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {44},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Pusukuri:2013:AFC,
  author = {Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. Bhuyan},
  title = {{ADAPT}: a framework for coscheduling multithreaded programs},
  journal = j-TACO,
  volume = {9},
  number = {4},
  pages = {45:1--45:??},
  month = jan,
  year = {2013},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2400682.2400704},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jan 18 10:57:16 MST 2013},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Since multicore systems offer greater performance via
    parallelism, future computing is progressing towards use of multicore
    machines with large number of cores. However, the performance of
    emerging multithreaded programs often does not scale to fully utilize
    the available cores. Therefore, simultaneously running multiple
    multithreaded applications becomes inevitable to fully exploit the
    computing potential of such machines. However, maximizing the
    performance and throughput on multicore machines in the presence of
    multiple multithreaded programs is a challenge for the OS. We have
    observed that the state-of-the-art contention management algorithms
    fail to effectively coschedule multithreaded programs on multicore
    machines. To address the above challenge, we present ADAPT, a
    scheduling framework that continuously monitors the resource usage of
    multithreaded programs and adaptively coschedules them such that they
    interfere with each other's performance as little as possible. In
    addition, ADAPT selects appropriate memory allocation and scheduling
    policies according to the workload characteristics. We have implemented
    ADAPT on a 64-core Supermicro server running Solaris 11 and evaluated
    it using 26 multithreaded programs including the TATP database
    application, SPECjbb2005, and programs from Phoenix, PARSEC, and SPEC
    OMP suites. The experimental results show that ADAPT substantially
    improves total turnaround time and system utilization relative to the
    default Solaris 11 scheduler.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {45},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Tartara:2013:CLC,
  author = {Michele Tartara and Stefano Crespi Reghizzi},
  title = {Continuous learning of compiler heuristics},
  journal = j-TACO,
  volume = {9},
  number = {4},
  pages = {46:1--46:??},
  month = jan,
  year = {2013},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2400682.2400705},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jan 18 10:57:16 MST 2013},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Optimizing programs to exploit the underlying hardware
    architecture is an important task. Much research has been done on
    enabling compilers to find the best set of code optimizations that can
    build the fastest and less resource-hungry executable for a given
    program. A common approach is iterative compilation, sometimes enriched
    by machine learning techniques. This provides good results, but
    requires extremely long compilation times and an initial training phase
    lasting even for days or weeks. We present long-term learning, a new
    algorithm that allows the compiler user to improve the performance of
    compiled programs with reduced compilation times with respect to
    iterative compilation, and without an initial training phase. Our
    algorithm does not just build good programs: it acquires knowledge
    every time a program is compiled and it uses such knowledge to learn
    compiler heuristics, without the need for an expert to manually define
    them. The heuristics are evolved during every compilation, by
    evaluating their effect on the generated programs. We present
    implementations of long-term learning on top of two different
    compilers, and experimental data gathered on multiple hardware
    configurations showing its effectiveness.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {46},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Chrysos:2013:HCP,
  author =       "Grigorios Chrysos and Panagiotis Dagritzikos and
                 Ioannis Papaefstathiou and Apostolos Dollas",
  title =        "{HC-CART}: a parallel system implementation of data
                 mining classification and regression tree ({CART})
                 algorithm on a multi-{FPGA} system",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "47:1--47:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400706",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Data mining is a new field of computer science with a
                 wide range of applications. Its goal is to extract
                 knowledge from massive datasets in a
                 human-understandable structure, for example, the
                 decision trees. In this article we present an
                 innovative, high-performance, system-level architecture
                 for the Classification And Regression Tree (CART)
                 algorithm, one of the most important and widely used
                 algorithms in the data mining area. Our proposed
                 architecture exploits parallelism at the decision
                 variable level, and was fully implemented and evaluated
                 on a modern high-performance reconfigurable platform,
                 the Convey HC-1 server, that features four FPGAs and a
                 multicore processor. Our FPGA-based implementation was
                 integrated with the widely used ``rpart'' software
                 library of the R project in order to provide the first
                 fully functional reconfigurable system that can handle
                 real-world large databases. The proposed system, named
                 HC-CART system, achieves a performance speedup of up to
                 two orders of magnitude compared to well-known
                 single-threaded data mining software platforms, such as
                 WEKA and the R platform. It also outperforms similar
                 hardware systems which implement parts of the complete
                 application by an order of magnitude. Finally, we show
                 that the HC-CART system offers higher performance
                 speedup than some other proposed parallel software
                 implementations of decision tree construction
                 algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2013:DCD,
  author =       "Jongwon Lee and Yohan Ko and Kyoungwoo Lee and Jonghee
                 M. Youn and Yunheung Paek",
  title =        "Dynamic code duplication with vulnerability awareness
                 for soft error detection on {VLIW} architectures",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "48:1--48:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400707",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Soft errors are becoming a critical concern in
                 embedded system designs. Code duplication techniques
                 have been proposed to increase the reliability in
                 multi-issue embedded systems such as VLIW by exploiting
                 empty slots for duplicated instructions. However, they
                 increase code size, another important concern, and
                 ignore vulnerability differences in instructions,
                 causing unnecessary or inefficient protection when
                 selecting instructions to be duplicated under
                 constraints. In this article, we propose a
                 compiler-assisted dynamic code duplication method to
                 minimize the code size overhead, and present
                 vulnerability-aware duplication algorithms to maximize
                 the effectiveness of instruction duplication with least
                 overheads for VLIW architecture. Our experimental
                 results with SoarGen and Synopsys simulation
                 environments demonstrate that our proposals can reduce
                 the code size by up to 40\% and detect more soft errors
                 by up to 82\% via fault injection experiments over
                 benchmarks from DSPstone and Livermore Loops as
                 compared to the previously proposed instruction
                 duplication technique.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Coelho:2013:ACI,
  author =       "Fabien Coelho and Fran{\c{c}}ois Irigoin",
  title =        "{API} compilation for image hardware accelerators",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "49:1--49:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400708",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present an API-based compilation strategy to
                 optimize image applications, developed using a
                 high-level image processing library, onto three
                 different image processing hardware accelerators. We
                 demonstrate that such a strategy is profitable for both
                 development cost and overall performance, especially as
                 it takes advantage of optimization opportunities across
                 library calls otherwise beyond reach. The library API
                 provides the semantics of the image computations. The
                 three image accelerator targets are quite distinct: the
                 first one uses a vector architecture; the second one
                 presents an SIMD architecture; the last one runs both
                 on GPGPU and multicores through OpenCL. We have adapted
                 standard compilation techniques to perform these
                 compilation and code generation tasks automatically.
                 Our strategy is implemented in PIPS, a source-to-source
                 compiler which greatly reduces the development cost as
                 standard phases are reused and parameterized. We
                 carried out experiments with applications on hardware
                 functional simulators and GPUs. Our contributions
                 include: (1) a general low-cost compilation strategy
                 for image processing applications, based on the
                 semantics provided by library calls, which improves
                 locality by an order of magnitude; (2) specific
                 heuristics to minimize execution time on the target
                 accelerators; (3) numerous experiments that show the
                 effectiveness of our strategies. We also discuss the
                 conditions required to extend this approach to other
                 application domains.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Luque:2013:FCT,
  author =       "Carlos Luque and Miquel Moreto and Francisco J.
                 Cazorla and Mateo Valero",
  title =        "Fair {CPU} time accounting in {CMP+SMT} processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "50:1--50:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400709",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Processor architectures combining several paradigms of
                 Thread-Level Parallelism (TLP), such as CMP processors
                 in which each core is SMT, are becoming more and more
                 popular as a way to improve performance at a moderate
                 cost. However, the complex interaction between running
                 tasks in hardware shared resources in multi-TLP
                 architectures introduces complexities when accounting
                 CPU time (or CPU utilization) to tasks. The CPU
                 utilization accounted to a task depends on both the
                 time it runs in the processor and the amount of
                 processor hardware resources it receives. Deploying
                 systems with accurate CPU accounting mechanisms is
                 necessary to increase fairness. Moreover, it will allow
                 users to be fairly charged on a shared data center,
                 facilitating server consolidation in future systems. In
                 this article we analyze the accuracy and hardware cost
                 of previous CPU accounting mechanisms for pure-CMP and
                 pure-SMT processors and we show that they are not
                 adequate for CMP+SMT processors. Consequently, we
                 propose a new accounting mechanism for CMP+SMT
                 processors which: (1) increases the accuracy of
                 accounted CPU utilization; (2) provides much more
                 stable results over a wide range of processor setups;
                 and (3) does not require tracking all hardware shared
                 resources, significantly reducing its implementation
                 cost. In particular, previous proposals lead to
                 inaccuracies between 21\% and 79\% when measuring CPU
                 utilization in an 8-core 2-way SMT processor, while our
                 proposal reduces this inaccuracy to less than 5.0\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mattheakis:2013:SRM,
  author =       "Pavlos M. Mattheakis and Ioannis Papaefstathiou",
  title =        "Significantly reducing {MPI} intercommunication
                 latency and power overhead in both embedded and {HPC}
                 systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "51:1--51:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400710",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Highly parallel systems are becoming mainstream in a
                 wide range of sectors ranging from their traditional
                 stronghold, high-performance computing, to data centers
                 and even embedded systems. However, despite the quantum
                 leaps of improvements in cost and performance of
                 individual components over the last decade (e.g.,
                 processor speeds, memory/interconnection bandwidth,
                 etc.), system manufacturers are still struggling to
                 deliver low-latency, highly scalable solutions. One of
                 the main reasons is that the intercommunication latency
                 grows significantly with the number of processor nodes.
                 This article presents a novel way to reduce this
                 intercommunication delay by implementing, in custom
                 hardware, certain communication tasks. In particular,
                 the proposed novel device implements the two most
                 widely used procedures of the most popular
                 communication protocol in parallel systems, the Message
                 Passing Interface (MPI). Our novel approach has
                 initially been simulated within a pioneering parallel
                 systems simulation framework and then synthesized
                 directly from a high-level description language (i.e.,
                 SystemC) using a state-of-the-art synthesis tool. To
                 the best of our knowledge, this is the first article
                 presenting the complete hardware implementation of such
                 a system. The proposed novel approach triggers a
                 speedup from one to four orders of magnitude when
                 compared with conventional software-based solutions and
                 from one to three orders of magnitude when compared
                 with a sophisticated software-based approach. Moreover,
                 the performance of our system is from one to two orders
                 of magnitude higher than the simulated performance of a
                 similar but relatively simpler hardware architecture;
                 at the same time the power consumption of our device is
                 about two orders of magnitude lower than that of a
                 low-power CPU when executing the exact same
                 intercommunication tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Baghdadi:2013:ILT,
  author =       "Riyadh Baghdadi and Albert Cohen and Sven Verdoolaege
                 and Konrad Trifunovi{\'c}",
  title =        "Improved loop tiling based on the removal of spurious
                 false dependences",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400711",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To preserve the validity of loop nest transformations
                 and parallelization, data dependences need to be
                 analyzed. Memory dependences come in two varieties:
                 true dependences or false dependences. While true
                 dependences must be satisfied in order to preserve the
                 correct order of computations, false dependences are
                 induced by the reuse of a single memory location to
                 store multiple values. False dependences reduce the
                 degrees of freedom for loop transformations. In
                 particular, loop tiling is severely limited in the
                 presence of these dependences. While array expansion
                 removes all false dependences, the overhead on memory
                 and the detrimental impact on register-level reuse can
                 be catastrophic. We propose and evaluate a compilation
                 technique to safely ignore a large number of false
                 dependences in order to enable loop nest tiling in the
                 polyhedral model. It is based on the precise
                 characterization of interferences between live range
                 intervals, and it does not incur any scalar or array
                 expansion. Our algorithms have been implemented in the
                 Pluto polyhedral compiler, and evaluated on the
                 PolyBench suite.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Pop:2013:OED,
  author =       "Antoniu Pop and Albert Cohen",
  title =        "{OpenStream}: Expressiveness and data-flow compilation
                 of {OpenMP} streaming programs",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400712",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present OpenStream, a data-flow extension of OpenMP
                 to express dynamic dependent tasks. The language
                 supports nested task creation, modular composition,
                 variable and unbounded sets of producers/consumers, and
                 first-class streams. These features, enabled by our
                 original compilation flow, allow translating high-level
                 parallel programming patterns, like dependences arising
                 from StarSs' array regions, or universal low-level
                 primitives like futures. In particular, these dynamic
                 features can be embedded efficiently and naturally into
                 an unmanaged imperative language, avoiding the
                 complexity and overhead of a concurrent garbage
                 collector. We demonstrate the performance advantages of
                 a data-flow execution model compared to more restricted
                 task and barrier models. We also demonstrate the
                 efficiency of our compilation and runtime algorithms
                 for the support of complex dependence patterns arising
                 from StarSs benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Verdoolaege:2013:PPC,
  author =       "Sven Verdoolaege and Juan Carlos Juega and Albert
                 Cohen and Jos{\'e} Ignacio G{\'o}mez and Christian
                 Tenllado and Francky Catthoor",
  title =        "Polyhedral parallel code generation for {CUDA}",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400713",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article addresses the compilation of a sequential
                 program for parallel execution on a modern GPU. To this
                 end, we present a novel source-to-source compiler
                 called PPCG. PPCG singles out for its ability to
                 accelerate computations from any static control loop
                 nest, generating multiple CUDA kernels when necessary.
                 We introduce a multilevel tiling strategy and a code
                 generation scheme for the parallelization and locality
                 optimization of imperfectly nested loops, managing
                 memory and exposing concurrency according to the
                 constraints of modern GPUs. We evaluate our algorithms
                 and tool on the entire PolyBench suite.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Du:2013:DCC,
  author =       "Yu Du and Miao Zhou and Bruce Childers and Rami Melhem
                 and Daniel Moss{\'e}",
  title =        "Delta-compressed caching for overcoming the write
                 bandwidth limitation of hybrid main memory",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400714",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Limited PCM write bandwidth is a critical obstacle to
                 achieve good performance from hybrid DRAM/PCM memory
                 systems. The write bandwidth is severely restricted in
                 PCM devices, which harms application performance.
                 Indeed, as we show, it is more important to reduce PCM
                 write traffic than to reduce PCM read latency for
                 application performance. To reduce the number of PCM
                 writes, we propose a DRAM cache organization that
                 employs compression. A new delta compression technique
                 for modified data is used to achieve a large
                 compression ratio. Our approach can selectively and
                 predictively apply compression to improve its
                 efficiency and performance. Our approach is designed to
                 facilitate adoption in existing main memory compression
                 frameworks. We describe an instance of how to
                 incorporate delta compression in IBM's MXT memory
                 compression architecture when used for DRAM cache in a
                 hybrid main memory. For fourteen representative
                 memory-intensive workloads, on average, our delta
                 compression technique reduces the number of PCM writes
                 by 54.3\%, and improves IPC performance by 24.4\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Purini:2013:FGO,
  author =       "Suresh Purini and Lakshya Jain",
  title =        "Finding good optimization sequences covering program
                 space",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "56:1--56:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400715",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The compiler optimizations we enable and the order in
                 which we apply them on a program have a substantial
                 impact on the program execution time. Compilers provide
                 default optimization sequences which can give good
                 program speedup. As the default sequences have to
                 optimize programs with different characteristics, they
                 embed in them multiple subsequences which can optimize
                 different classes of programs. These multiple
                 subsequences may falsely interact with each other and
                 affect the potential program speedup achievable.
                 Instead of searching for a single universally optimal
                 sequence, we can construct a small set of good
                 sequences such that for every program class there
                 exists a near-optimal optimization sequence in the good
                 sequences set. If we can construct such a good
                 sequences set which covers all the program classes in
                 the program space, then we can choose the best sequence
                 for a program by trying all the sequences in the good
                 sequences set. This approach completely circumvents the
                 need to solve the program classification problem. Using
                 a sequence set size of around 10 we got an average
                 speedup up to 14\% on PolyBench programs and up to 12\%
                 on MiBench programs. Our approach is quite different
                 from either the iterative compilation or
                 machine-learning-based prediction modeling techniques
                 proposed in the literature so far. We use different
                 training and test datasets for cross-validation as
                 against the Leave-One-Out cross-validation technique.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Belviranli:2013:DSS,
  author =       "Mehmet E. Belviranli and Laxmi N. Bhuyan and Rajiv
                 Gupta",
  title =        "A dynamic self-scheduling scheme for heterogeneous
                 multiprocessor architectures",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "57:1--57:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400716",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Today's heterogeneous architectures bring together
                 multiple general-purpose CPUs and multiple
                 domain-specific GPUs and FPGAs to provide dramatic
                 speedup for many applications. However, the challenge
                 lies in utilizing these heterogeneous processors to
                 optimize overall application performance by minimizing
                 workload completion time. Operating system and
                 application development for these systems is in their
                 infancy. In this article, we propose a new scheduling
                 and workload balancing scheme, HDSS, for execution of
                 loops having dependent or independent iterations on
                 heterogeneous multiprocessor systems. The new algorithm
                 dynamically learns the computational power of each
                 processor during an adaptive phase and then schedules
                 the remainder of the workload using a weighted
                 self-scheduling scheme during the completion phase.
                 Different from previous studies, our scheme uniquely
                 considers the runtime effects of block sizes on the
                 performance for heterogeneous multiprocessors. It finds
                 the right trade-off between large and small block sizes
                 to maintain balanced workload while keeping the
                 accelerator utilization at maximum. Our algorithm does
                 not require offline training or architecture-specific
                 parameters. We have evaluated our scheme on two
                 different heterogeneous architectures: AMD 64-core
                 Bulldozer system with nVidia Fermi C2050 GPU and Intel
                 Xeon 32-core SGI Altix 4700 supercomputer with Xilinx
                 Virtex 4 FPGAs. The experimental results show that our
                 new scheduling algorithm can achieve performance
                 improvements up to over 200\% when compared to the
                 closest existing load balancing scheme. Our algorithm
                 also achieves full processor utilization with all
                 processors completing at nearly the same time which is
                 significantly better than alternative current
                 approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Negi:2013:SCF,
  author =       "Anurag Negi and Ruben Titos-Gil",
  title =        "{SCIN-cache}: Fast speculative versioning in
                 multithreaded cores",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "58:1--58:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400717",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes cache designs for efficiently
                 supporting speculative techniques like transactional
                 memory on chip multiprocessors with multithreaded
                 cores. On-demand allocation and prompt freeing of
                 speculative cache space in the design reduces the
                 burden on nonspeculative execution. Quick access to
                 both clean and speculative versions of data for
                 multiple contexts provides flexibility and greater
                 design freedom to HTM architects. Performance analysis
                 shows the designs stand up well against other HTM
                 design proposals, with potential performance gains in
                 high contention applications with small transactions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lutz:2013:PAF,
  author =       "Thibaut Lutz and Christian Fensch and Murray Cole",
  title =        "{PARTANS}: An autotuning framework for stencil
                 computation on multi-{GPU} systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "59:1--59:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400718",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GPGPUs are a powerful and energy-efficient solution
                 for many problems. For higher performance or larger
                 problems, it is necessary to distribute the problem
                 across multiple GPUs, increasing the already high
                 programming complexity. In this article, we focus on
                 abstracting the complexity of multi-GPU programming for
                 stencil computation. We show that the best strategy
                 depends not only on the stencil operator, problem size,
                 and GPU, but also on the PCI express layout. This adds
                 nonuniform characteristics to a seemingly homogeneous
                 setup, causing up to 23\% performance loss. We address
                 this issue with an autotuner that optimizes the
                 distribution across multiple GPUs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Xiao:2013:SAT,
  author =       "Chunhua Xiao and M.-C. Frank Chang and Jason Cong and
                 Michael Gill and Zhangqin Huang and Chunyue Liu and
                 Glenn Reinman and Hao Wu",
  title =        "Stream arbitration: Towards efficient bandwidth
                 utilization for emerging on-chip interconnects",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "60:1--60:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2400682.2400719",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Alternative interconnects are attractive for scaling
                 on-chip communication bandwidth in a power-efficient
                 manner. However, efficient utilization of the bandwidth
                 provided by these emerging interconnects still remains
                 an open problem due to the spatial and temporal
                 communication heterogeneity. In this article, a Stream
                 Arbitration scheme is proposed, where at runtime any
                 source can compete for any communication channel of the
                 interconnect to talk to any destination. We apply
                 stream arbitration to radio frequency interconnect
                 (RF-I). Experimental results show that compared to the
                 representative token arbitration scheme, stream
                 arbitration can provide an average 20\% performance
                 improvement and 12\% power reduction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(1), article 1: LReplay, deterministic replay using a global clock.
@article{Chen:2013:DRU,
  author          = {Chen, Yunji and Chen, Tianshi and Li, Ling and Wu, Ruiyang
                     and Liu, Daofu and Hu, Weiwu},
  title           = {Deterministic Replay Using Global Clock},
  journal         = j-TACO,
  volume          = 10,
  number          = 1,
  pages           = {1:1--1:??},
  month           = apr,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2445572.2445573},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Fri Apr 5 18:36:16 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Debugging parallel programs is a well-known difficult
                     problem. A promising method to facilitate debugging
                     parallel programs is using hardware support to achieve
                     deterministic replay on a Chip Multi-Processor (CMP).
                     As a Design-For-Debug (DFD) feature, a practical
                     hardware-assisted deterministic replay scheme should
                     have low design and verification costs, as well as a
                     small log size. To achieve these goals, we propose a
                     novel and succinct hardware-assisted deterministic
                     replay scheme named LReplay. The key innovation of
                     LReplay is that instead of recording the logical time
                     orders between instructions or instruction blocks as
                     previous investigations, LReplay is built upon
                     recording the pending period information infused by the
                     global clock. By the recorded pending period
                     information, about 99\% execution orders are
                     inferrable, implying that LReplay only needs to record
                     directly the residual 1\% noninferrable execution
                     orders in production run. The 1\% noninferrable orders
                     can be addressed by a simple yet cost-effective
                     direction prediction technique, which further reduces
                     the log size of LReplay. Benefiting from the preceding
                     innovations, the overall log size of LReplay over
                     SPLASH-2 benchmarks is about 0.17B/K-Inst (byte per
                     k-instruction) for the sequential consistency, and
                     0.57B/K-Inst for the Godson-3 consistency. Such log
                     sizes are smaller in an order of magnitude than
                     previous deterministic replay schemes incurring no
                     performance loss. Furthermore, LReplay only consumes
                     about 0.5\% area of the Godson-3 CMP, since it requires
                     only trivial modifications to existing components of
                     Godson-3. The features of LReplay demonstrate the
                     potential of integrating hardware support for
                     deterministic replay into future industrial
                     processors.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 1,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(1), article 2: ICC TLB prefetchers and shared last-level TLBs.
@article{Lustig:2013:TIC,
  author          = {Lustig, Daniel and Bhattacharjee, Abhishek and
                     Martonosi, Margaret},
  title           = {{TLB} Improvements for Chip Multiprocessors:
                     Inter-Core Cooperative Prefetchers and Shared
                     Last-Level {TLBs}},
  journal         = j-TACO,
  volume          = 10,
  number          = 1,
  pages           = {2:1--2:??},
  month           = apr,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2445572.2445574},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Fri Apr 5 18:36:16 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Translation Lookaside Buffers (TLBs) are critical to
                     overall system performance. Much past research has
                     addressed uniprocessor TLBs, lowering access times and
                     miss rates. However, as Chip MultiProcessors (CMPs)
                     become ubiquitous, TLB design and performance must be
                     reevaluated. Our article begins by performing a
                     thorough TLB performance evaluation of sequential and
                     parallel benchmarks running on a real-world, modern CMP
                     system using hardware performance counters. This
                     analysis demonstrates the need for further improvement
                     of TLB hit rates for both classes of application, and
                     it also points out that the data TLB has a
                     significantly higher miss rate than the instruction TLB
                     in both cases. In response to the characterization
                     data, we propose and evaluate both Inter-Core
                     Cooperative (ICC) TLB prefetchers and Shared Last-Level
                     (SLL) TLBs as alternatives to the commercial norm of
                     private, per-core L2 TLBs. ICC prefetchers eliminate
                     19\% to 90\% of Data TLB (D-TLB) misses across parallel
                     workloads while requiring only modest changes in
                     hardware. SLL TLBs eliminate 7\% to 79\% of D-TLB
                     misses for parallel workloads and 35\% to 95\% of D-TLB
                     misses for multiprogrammed sequential workloads. This
                     corresponds to 27\% and 21\% increases in hit rates as
                     compared to private, per-core L2 TLBs, respectively,
                     and is achieved this using even more modest hardware
                     requirements. Because of their benefits for parallel
                     applications, their applicability to sequential
                     workloads, and their readily implementable hardware,
                     SLL TLBs and ICC TLB prefetchers hold great promise for
                     CMPs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 2,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(1), article 3: Tiled-MapReduce (TMR) and the Ostrich prototype.
%%% The system name is hyphenated ("Tiled-MapReduce") throughout the entry.
@Article{Chen:2013:TME,
  author =       "Rong Chen and Haibo Chen",
  title =        "{Tiled-MapReduce}: Efficient and Flexible {MapReduce}
                 Processing on Multicore with Tiling",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2445572.2445575",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The prevalence of chip multiprocessors opens
                 opportunities of running data-parallel applications
                 originally in clusters on a single machine with many
                 cores. MapReduce, a simple and elegant programming
                 model to program large-scale clusters, has recently
                 been shown a promising alternative to harness the
                 multicore platform. The differences such as memory
                 hierarchy and communication patterns between clusters
                 and multicore platforms raise new challenges to design
                 and implement an efficient MapReduce system on
                 multicore. This article argues that it is more
                 efficient for MapReduce to iteratively process small
                 chunks of data in turn than processing a large chunk of
                 data at a time on shared memory multicore platforms.
                 Based on the argument, we extend the general MapReduce
                 programming model with a ``tiling strategy'', called
                 Tiled-MapReduce (TMR). TMR partitions a large
                 MapReduce job into a number of small subjobs and
                 iteratively processes one subjob at a time with
                 efficient use of resources; TMR finally merges the
                 results of all subjobs for output. Based on
                 Tiled-MapReduce, we design and implement several
                 optimizing techniques targeting multicore, including
                 the reuse of the input buffer among subjobs, a
                 NUCA/NUMA-aware scheduler, and pipelining a subjob's
                 reduce phase with the successive subjob's map phase, to
                 optimize the memory, cache, and CPU resources
                 accordingly. Further, we demonstrate that
                 Tiled-MapReduce supports fine-grained fault tolerance
                 and enables several usage scenarios such as online and
                 incremental computing on multicore machines.
                 Performance evaluation with our prototype system called
                 Ostrich on a 48-core machine shows that Ostrich saves
                 up to 87.6\% memory, causes less cache misses, and
                 makes more efficient use of CPU cores, resulting in a
                 speedup ranging from 1.86x to 3.07x over Phoenix.
                 Ostrich also efficiently supports fine-grained fault
                 tolerance, online, and incremental computing with small
                 performance penalty.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(1), article 4: A-DFA compression of DFAs for regular expressions.
%%% Formulas and superscripts in the abstract are written as TeX math.
@Article{Becchi:2013:DTS,
  author =       "Michela Becchi and Patrick Crowley",
  title =        "{A-DFA}: a Time- and Space-Efficient {DFA} Compression
                 Algorithm for Fast Regular Expression Evaluation",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:26",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2445572.2445576",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern network intrusion detection systems need to
                 perform regular expression matching at line rate in
                 order to detect the occurrence of critical patterns in
                 packet payloads. While Deterministic Finite Automata
                 (DFAs) allow this operation to be performed in linear
                 time, they may exhibit prohibitive memory requirements.
                 Kumar et al. [2006a] have proposed Delayed Input DFAs
                 (D$^2$FAs), which provide a trade-off between the memory
                 requirements of the compressed DFA and the number of
                 states visited for each character processed, which in
                 turn affects the memory bandwidth required to evaluate
                 regular expressions. In this article we introduce
                 Amortized time-bandwidth overhead DFAs (A-DFAs), a
                 general compression technique that results in at most
                 $ N (k + 1) / k $ state traversals when processing a
                 string of length $N$, $k$ being a positive integer. In
                 comparison to the D$^2$FA approach, our technique
                 achieves comparable levels of compression with lower
                 provable bounds on memory bandwidth (or greater
                 compression for a given bandwidth bound). Moreover, the
                 A-DFA algorithm has lower complexity, can be applied
                 during DFA creation, and is suitable for scenarios
                 where a compressed DFA needs to be dynamically built or
                 updated. Finally, we show how to combine A-DFA with
                 alphabet reduction and multistride DFAs, two techniques
                 aimed at reducing the memory space and bandwidth
                 requirement of DFAs, and discuss memory encoding
                 schemes suitable for A-DFAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(1), article 5: McPAT power, area, and timing modeling framework.
%%% The squared-area metric in the abstract is written in TeX as EDA$^2$P.
@Article{Li:2013:MFM,
  author =       "Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay
                 B. Brockman and Dean M. Tullsen and Norman P. Jouppi",
  title =        "The {McPAT} Framework for Multicore and Manycore
                 Architectures: Simultaneously Modeling Power, Area, and
                 Timing",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2445572.2445577",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article introduces McPAT, an integrated power,
                 area, and timing modeling framework that supports
                 comprehensive design space exploration for multicore
                 and manycore processor configurations ranging from 90nm
                 to 22nm and beyond. At microarchitectural level, McPAT
                 includes models for the fundamental components of a
                 complete chip multiprocessor, including in-order and
                 out-of-order processor cores, networks-on-chip, shared
                 caches, and integrated system components such as memory
                 controllers and Ethernet controllers. At circuit level,
                 McPAT supports detailed modeling of critical-path
                 timing, area, and power. At technology level, McPAT
                 models timing, area, and power for the device types
                 forecast in the ITRS roadmap. McPAT has a flexible XML
                 interface to facilitate its use with many performance
                 simulators. Combined with a performance simulator,
                 McPAT enables architects to accurately quantify the
                 cost of new ideas and assess trade-offs of different
                 architectures using new metrics such as
                 Energy-Delay-Area$^2$ Product (EDA$^2$P) and
                 Energy-Delay-Area Product (EDAP). This article explores
                 the interconnect options of future manycore processors
                 by varying the degree of clustering over generations of
                 process technologies. Clustering will bring interesting
                 trade-offs between area and performance because the
                 interconnects needed to group cores into clusters incur
                 area overhead, but many applications can make good use
                 of them due to synergies from cache sharing. Combining
                 power, area, and timing results of McPAT with
                 performance simulation of PARSEC benchmarks for
                 manycore designs at the 22nm technology shows that
                 8-core clustering gives the best energy-delay product,
                 whereas when die area is taken into account, 4-core
                 clustering gives the best EDA$^2$P and EDAP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(2), article 6: near-optimal SW/HW codesign mapping onto an FPGA.
@article{Kritikakou:2013:NOM,
  author          = {Kritikakou, Angeliki and Catthoor, Francky and
                     Athanasiou, George S. and Kelefouras, Vasilios and
                     Goutis, Costas},
  title           = {Near-Optimal Microprocessor and Accelerators Codesign
                     with Latency and Throughput Constraints},
  journal         = j-TACO,
  volume          = 10,
  number          = 2,
  pages           = {6:1--6:??},
  month           = may,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2459316.2459317},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed May 1 16:38:16 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {A systematic methodology for near-optimal
                     software/hardware codesign mapping onto an FPGA
                     platform with microprocessor and HW accelerators is
                     proposed. The mapping steps deal with the
                     inter-organization, the foreground memory management,
                     and the datapath mapping. A step is described by
                     parameters and equations combined in a scalable
                     template. Mapping decisions are propagated as design
                     constraints to prune suboptimal options in next steps.
                     Several performance-area Pareto points are produced by
                     instantiating the parameters. To evaluate our
                     methodology we map a real-time bio-imaging application
                     and loop-dominated benchmarks.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 6,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(2), article 7: LLS, line-level mapping and salvaging for PCM.
@article{Jiang:2013:HAC,
  author          = {Jiang, Lei and Du, Yu and Zhao, Bo and Zhang, Youtao and
                     Childers, Bruce R. and Yang, Jun},
  title           = {Hardware-Assisted Cooperative Integration of
                     Wear-Leveling and Salvaging for Phase Change Memory},
  journal         = j-TACO,
  volume          = 10,
  number          = 2,
  pages           = {7:1--7:??},
  month           = may,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2459316.2459318},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed May 1 16:38:16 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Phase Change Memory (PCM) has recently emerged as a
                     promising memory technology. However, PCM's limited
                     write endurance restricts its immediate use as a
                     replacement for DRAM. To extend the lifetime of PCM
                     chips, wear-leveling and salvaging techniques have been
                     proposed. Wear-leveling balances write operations
                     across different PCM regions while salvaging extends
                     the duty cycle and provides graceful degradation for a
                     nonnegligible number of failures. Current wear-leveling
                     and salvaging schemes have not been designed and
                     integrated to work cooperatively to achieve the best
                     PCM device lifetime. In particular, a noncontiguous PCM
                     space generated from salvaging complicates
                     wear-leveling and incurs large overhead. In this
                     article, we propose LLS, a Line-Level mapping and
                     Salvaging design. By allocating a dynamic portion of
                     total space in a PCM device as backup space, and
                     mapping failed lines to backup PCM, LLS constructs a
                     contiguous PCM space and masks lower-level failures
                     from the OS and applications. LLS integrates
                     wear-leveling and salvaging and copes well with modern
                     OSes. Our experimental results show that LLS achieves
                     31\% longer lifetime than the state-of-the-art. It has
                     negligible hardware cost and performance overhead.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 7,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(2), article 8: power-efficient predication mechanisms for CGRAs.
@article{Han:2013:PEP,
  author          = {Han, Kyuseung and Ahn, Junwhan and Choi, Kiyoung},
  title           = {Power-Efficient Predication Techniques for
                     Acceleration of Control Flow Execution on {CGRA}},
  journal         = j-TACO,
  volume          = 10,
  number          = 2,
  pages           = {8:1--8:??},
  month           = may,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2459316.2459319},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed May 1 16:38:16 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Coarse-grained reconfigurable architecture typically
                     has an array of processing elements which are
                     controlled by a centralized unit. This makes it
                     difficult to execute programs having control divergence
                     among PEs without predication. However, conventional
                     predication techniques have a negative impact on both
                     performance and power consumption due to longer
                     instruction words and unnecessary instruction-fetching
                     decoding nullifying steps. This article reveals
                     performance and power issues in predicated execution
                     which have not been well-addressed yet. Furthermore, it
                     proposes fast and power-efficient predication
                     mechanisms. Experiments conducted through gate-level
                     simulation show that our mechanism improves
                     energy-delay product by 11.9\% to 23.8\% on average.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 8,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(2), article 9: MP-Tomasulo, out-of-order task execution on MPSoC.
@article{Wang:2013:MTD,
  author          = {Wang, Chao and Li, Xi and Zhang, Junneng and Zhou, Xuehai
                     and Nie, Xiaoning},
  title           = {{MP-Tomasulo}: a Dependency-Aware Automatic Parallel
                     Execution Engine for Sequential Programs},
  journal         = j-TACO,
  volume          = 10,
  number          = 2,
  pages           = {9:1--9:??},
  month           = may,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2459316.2459320},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed May 1 16:38:16 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {This article presents MP-Tomasulo, a dependency-aware
                     automatic parallel task execution engine for sequential
                     programs. Applying the instruction-level Tomasulo
                     algorithm to MPSoC environments, MP-Tomasulo detects
                     and eliminates Write-After-Write (WAW) and
                     Write-After-Read (WAR) inter-task dependencies in the
                     dataflow execution, therefore to operate out-of-order
                     task execution on heterogeneous units. We implemented
                     the prototype system within a single FPGA. Experimental
                     results on EEMBC applications demonstrate that
                     MP-Tomasulo can execute the tasks out-of-order to
                     achieve as high as 93.6\% to 97.6\% of ideal peak
                     speedup. A comparative study against a state-of-the-art
                     dataflow execution scheme is illustrated with a classic
                     JPEG application. The promising results show
                     MP-Tomasulo enables programmers to uncover more
                     task-level parallelism on heterogeneous systems, as
                     well as to ease the burden of programmers.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 9,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(3): "TACO Reviewers 2012" notice; the entry carries no abstract.
%%% NOTE(review): article number 9 (pages 9:1--9:??) is also carried by
%%% Wang:2013:MTD in issue 2 of this volume -- confirm against the ACM
%%% Digital Library before relying on it.
@article{Anonymous:2013:TR,
  author          = {Anonymous},
  title           = {{TACO} Reviewers 2012},
  journal         = j-TACO,
  volume          = 10,
  number          = 3,
  pages           = {9:1--9:??},
  month           = sep,
  year            = 2013,
  coden           = {????},
  doi             = {https://doi.org/10.1145/2509420.2509421},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Sep 16 17:20:12 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = 9,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(3), article 10: asymmetric clustered cores and power management.
%%% The title is stored in Title Case; styles may downcase it on output.
@Article{Shifer:2013:LLA,
  author =       "Eran Shifer and Shlomo Weiss",
  title =        "Low-Latency Adaptive Mode Transitions and Hierarchical
                 Power Management in Asymmetric Clustered Cores",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499901",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recently, engineering solutions that include
                 asymmetric multicores have been fabricated for low
                 form-factor computing devices, indicating a potential
                 direction for future evolution of processors. In this
                 article we propose an asymmetric clustered core
                 architecture, exhibiting low-latency switching between
                 modes relative to asymmetric multicores, and having
                 similarities with the same asymmetric multicore
                 architecture in the context of a wider dynamic range of
                 the processor power-performance characteristic.
                 Asymmetric clustered cores incur additional
                 microarchitectural complexity and area cost inside a
                 core but exhibit better chip-level integration
                 characteristics compared to asymmetric multicores.
                 Focusing on power efficiency of asymmetric clustered
                 cores, we describe: (1) a hierarchical power management
                 partitioning between the operating system and on-die
                 firmware for coarse-grain switch policies, and (2)
                 core-internal tracking hardware for fine-grain
                 switching. The mode switch policies of the core's
                 tracking hardware are dependent on higher-level
                 directives and hints from the operating system, on-die
                 firmware, and compiler or profiling software. We
                 further explore the potential power management benefits
                 of asymmetric clustered cores relative to asymmetric
                 multicores, demonstrating that the ability of
                 asymmetric clustered cores to use tight training
                 periods for adaptive behavior, with low overhead
                 switching between modes, results in a more efficient
                 utilization of power management directives.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), article 11: hybrid type legalization for sparse SIMD sets.
%%% The compound surname is braced so BibTeX keeps "Ben Asher" as one last
%%% name; the citation key is left unchanged so existing citations resolve.
@Article{Asher:2013:HTL,
  author =       "Yosi {Ben Asher} and Nadav Rotem",
  title =        "Hybrid Type Legalization for a Sparse {SIMD}
                 Instruction Set",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2509420.2509422",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "SIMD vector units implement only a subset of the
                 operations used by vectorizing compilers, and there are
                 multiple conflicting techniques to legalize arbitrary
                 vector types into register-sized data types.
                 Traditionally, type legalization is performed using a
                 set of predefined rules, regardless of the operations
                 used in the program. This method is not suitable to
                 sparse SIMD instruction sets and often prevents the
                 vectorization of programs. In this work we introduce a
                 new technique for type legalization, namely vector
                 element promotion, as well as a hybrid method for
                 combining multiple techniques of type legalization. Our
                 hybrid type legalization method makes decisions based
                 on the knowledge of the available instruction set as
                 well as the operations used in the program. Our
                 experimental results demonstrate that program-dependent
                 hybrid type legalization improves the execution time of
                 vector programs, outperforms the existing legalization
                 method, and allows the vectorization of workloads which
                 were not vectorized before.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lei:2013:VCI,
  author =       "Yuanwu Lei and Yong Dou and Lei Guo and Jinbo Xu and
                 Jie Zhou and Yazhuo Dong and Hongjian Li",
  title =        "{VLIW} coprocessor for {IEEE-754} quadruple-precision
                 elementary functions",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512430",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article, a unified VLIW coprocessor, based on
                 a common group of atomic operation units, for Quad
                 arithmetic and elementary functions (QP\_VELP) is
                 presented. The explicitly parallel scheme of VLIW
                 instruction and Estrin's evaluation scheme for
                 polynomials are used to improve the performance. A
                 two-level VLIW instruction RAM scheme is introduced to
                 achieve high scalability and customizability, even for
                 more complex key program kernels. Finally, the Quad
                 arithmetic accelerator (QAA) with the QP\_VELP array is
                 implemented on ASIC. Compared with hyper-thread
                 software implementation on an Intel Xeon E5620, QAA
                 with 8 QP\_VELP units achieves improvement by a factor
                 of 18X.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kawahito:2013:IRF,
  author =       "Motohiro Kawahito and Hideaki Komatsu and Takao
                 Moriyama and Hiroshi Inoue and Toshio Nakatani",
  title =        "Idiom recognition framework using topological
                 embedding",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512431",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Modern processors support hardware-assist instructions
                 (such as TRT and TROT instructions on the IBM System z)
                 to accelerate certain functions such as delimiter
                 search and character conversion. Such special
                 instructions are often used in high-performance
                 libraries, but their exploitation in optimizing
                 compilers has been limited. We devised a new idiom
                 recognition technique based on a topological embedding
                 algorithm to detect idiom patterns in the input
                 programs more aggressively than in previous approaches
                 using exact pattern matching. Our approach can detect a
                 pattern even if the code segment does not exactly match
                 the idiom. For example, we can detect a code segment
                 that includes additional code within the idiom pattern.
                 We also propose an instruction simplification for the
                 idiom recognition. This optimization analyzes all of
                 the usages of the output of the optimized code for a
                 specific idiom. If we find that we do not need an
                 actual value for the output but only a value in a
                 subrange, then we can assign a value in that subrange
                 as the output. The code generation can generate faster
                 code with this optimization. We implemented our new
                 idiom recognition approach based on the Java
                 Just-In-Time (JIT) compiler that is part of the J9 Java
                 Virtual Machine, and we supported several important
                 idioms for the special hardware-assist instructions on
                 the IBM System z and on some models of the IBM System
                 p. To demonstrate the effectiveness of our technique,
                 we performed two experiments. The first experiment was
                 to see how many more patterns we can detect compared to
                 the previous approach. The second experiment measured
                 the performance improvements over the previous
                 approaches. For the first experiment, we used the Java
                 Compatibility Kit (JCK) API tests. For the second
                 experiment we used the IBM XML parser, SPECjvm98, and
                 SPECjbb2000. In summary, relative to a baseline
                 implementation using exact pattern matching, our
                 algorithm converted 76\% more loops in JCK tests. On a
                 z9, we also observed significant average performance
                 improvement of the XML parser by 54\%, of SPECjvm98 by
                 1.9\%, and of SPECjbb2000 by 4.4\%. Finally, we
                 observed that the JIT compilation time increased by
                 only 0.32\% to 0.44\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Shobaki:2013:PIS,
  author =       "Ghassan Shobaki and Maxim Shawabkeh and Najm Eldeen
                 Abu Rmaileh",
  title =        "Preallocation instruction scheduling with register
                 pressure minimization using a combinatorial
                 optimization approach",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512432",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Balancing Instruction-Level Parallelism (ILP) and
                 register pressure during preallocation instruction
                 scheduling is a fundamentally important problem in code
                 generation and optimization. The problem is known to be
                 NP-complete. Many heuristic techniques have been
                 proposed to solve this problem. However, due to the
                 inherently conflicting requirements of maximizing ILP
                 and minimizing register pressure, heuristic techniques
                 may produce poor schedules in many cases. If such cases
                 occur in hot code, significant performance degradation
                 may result. A few combinatorial optimization approaches
                 have also been proposed, but none of them has been
                 shown to solve large real-world instances within
                 reasonable time. This article presents the first
                 combinatorial algorithm that is efficient enough to
                 optimally solve large instances of this problem (basic
                 blocks with hundreds of instructions) within a few
                 seconds per instance. The proposed algorithm uses
                 branch-and-bound enumeration with a number of powerful
                 pruning techniques to efficiently search the solution
                 space. The search is based on a cost function that
                 incorporates schedule length and register pressure. An
                 implementation of the proposed scheduling algorithm has
                 been integrated into the LLVM Compiler and evaluated
                 using SPEC CPU 2006. On x86-64, with a time limit of
                 10ms per instruction, it optimally schedules 79\% of
                 the hot basic blocks in FP2006. Another 19\% of the
                 blocks are not optimally scheduled but are improved in
                 cost relative to LLVM's heuristic. This improves the
                 execution time of some benchmarks by up to 21\%, with a
                 geometric-mean improvement of 2.4\% across the entire
                 benchmark suite. With the use of precise latency
                 information, the geometric-mean improvement is
                 increased to 2.8\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{She:2013:EEM,
  author =       "Dongrui She and Yifan He and Henk Corporaal",
  title =        "An energy-efficient method of supporting flexible
                 special instructions in an embedded processor with
                 compact {ISA}",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2509420.2509426",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In application-specific processor design, a common
                 approach to improve performance and efficiency is to
                 use special instructions that execute complex operation
                 patterns. However, in a generic embedded processor with
                 compact Instruction Set Architecture (ISA), these
                 special instructions may lead to large overhead such
                 as: (i) more bits are needed to encode the extra
                 opcodes and operands, resulting in wider instructions;
                 (ii) more Register File (RF) ports are required to
                 provide the extra operands to the function units. Such
                 overhead may increase energy consumption considerably.
                 In this article, we propose to support flexible
                 operation pair patterns in a processor with a compact
                 24-bit RISC-like ISA using: (i) a partially
                 reconfigurable decoder that exploits the pattern
                 locality to reduce opcode space requirement; (ii) a
                 software-controlled bypass network to reduce operand
                 encoding bit and RF port requirement. An energy-aware
                 compiler backend is designed for the proposed
                 architecture that performs pattern selection and
                 bypass-aware scheduling to generate energy-efficient
                 codes. Though the proposed design imposes extra
                 constraints on the operation patterns, the experimental
                 results show that for benchmark applications from
                 different domains, the average dynamic instruction
                 count is reduced by over 25\%, which is only about 2\%
                 less than the architecture without such constraints.
                 The proposed architecture reduces total energy by an
                 average of 15.8\% compared to the RISC baseline, while
                 the one without constraints achieves almost no
                 improvement due to its high overhead. When high
                 performance is required, the proposed architecture is
                 able to achieve a speedup of 13.8\% with 13.1\% energy
                 reduction compared to the baseline by introducing
                 multicycle SFU operations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nandivada:2013:IBA,
  author =       "V. Krishna Nandivada and Rajkishore Barik",
  title =        "Improved bitwidth-aware variable packing",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2509420.2509427",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Bitwidth-aware register allocation has caught the
                 attention of researchers aiming to effectively reduce
                 the number of variables spilled into memory. For
                 general-purpose processors, this improves the execution
                 time performance and reduces runtime memory
                 requirements (which in turn helps in the compilation of
                 programs targeted to systems with constrained memory).
                 Additionally, bitwidth-aware register allocation has
                 been effective in reducing power consumption in
                 embedded processors. One of the key components of
                 bitwidth-aware register allocation is the variable
                 packing algorithm that packs multiple narrow-width
                 variables into one physical register. Tallam and Gupta
                 [2003] have proved that optimal variable packing is an
                 NP-complete problem for arbitrary-width variables and
                 have proposed an approximate solution. In this article,
                 we analyze the complexity of the variable packing
                 problem and present three enhancements that improve the
                 overall packing of variables. In particular, the
                 improvements we describe are: (a) Width Static Single
                 Assignment (W-SSA) form representation that splits the
                 live range of a variable into several fixed-width live
                 ranges (W-SSA) variables; (b) PoTR Representation ---
                 use of powers-of-two representation for bitwidth
                 information for W-SSA variables. Our empirical results
                 have shown that the associated bit wastage resulting
                 from the overapproximation of the widths of variables
                 to the nearest next power of two is a small fraction
                 compared to the total number of bits in use ($ \approx
                 $ 13\%). The main advantage of this representation is
                 that it leads to optimal variable packing in polynomial
                 time; (c) Combined Packing and Coalescing --- we
                 discuss the importance of coalescing (combining
                 variables whose live ranges do not interfere) in the
                 context of variable packing and present an iterative
                 algorithm to perform coalescing and packing of W-SSA
                 variables represented in PoTR. Our experimental results
                 show up to 76.00\% decrease in the number of variables
                 compared to the number of variables in the input
                 program in Single Static Assignment (SSA) form. This
                 reduction in the number of variables led to a
                 significant reduction in dynamic spilling, packing, and
                 unpacking instructions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ahn:2013:SHR,
  author =       "Jung Ho Ahn and Young Hoon Son and John Kim",
  title =        "Scalable high-radix router microarchitecture using a
                 network switch organization",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512433",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As the system size of supercomputers and datacenters
                 increases, cost-efficient networks become critical in
                 achieving good scalability on those systems.
                 High-radix routers reduce network cost by lowering the
                 network diameter while providing a high bisection
                 bandwidth and path diversity. The building blocks of
                 these large-scale networks are the routers or the
                 switches and they need to scale accordingly to the
                 increasing port count and increasing pin bandwidth.
                 However, as the port count increases, the high-radix
                 router microarchitecture itself needs to scale
                 efficiently. Hierarchical crossbar switch organization
                 has been proposed where a single large crossbar used
                 for a router switch is partitioned into many small
                 crossbars and overcomes the limitations of conventional
                 router microarchitecture. Although the organization
                 provides high performance, it has limited scalability
                 due to excessive power and area overheads by the wires
                 and intermediate buffers. In this article, we propose
                 scalable router microarchitectures that leverage a
                 network within the switch design of the high-radix
                 routers themselves. These alternative designs lower the
                 wiring complexity and buffer requirements. For example,
                 when a folded-Clos switch is used instead of the
                 hierarchical crossbar switch for a radix-64 router, it
                 provides up to 73\%, 58\%, and 87\% reduction in area,
                 energy-delay product, and energy-delay-area product,
                 respectively. We also explore more efficient switch
                 designs by exploiting the traffic-pattern
                 characteristics of the global network and its impact on
                 the local network design within the switch for both
                 folded-Clos and flattened butterfly networks. In
                 particular, we propose a bilateral butterfly switch
                 organization that has fewer crossbars and global wires
                 compared to the topology-agnostic folded-Clos switch
                 while achieving better low-load latency and equivalent
                 saturation throughput.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Huang:2013:ACM,
  author =       "Libo Huang and Zhiying Wang and Nong Xiao and Yongwen
                 Wang and Qiang Dou",
  title =        "Adaptive communication mechanism for accelerating
                 {MPI} functions in {NoC}-based multicore processors",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512434",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multicore designs have emerged as the dominant
                 organization for future high-performance
                 microprocessors. Communication in such designs is often
                 enabled by Networks-on-Chip (NoCs). A new trend in such
                 architectures is to fit a Message Passing Interface
                 (MPI) programming model on NoCs to achieve optimal
                 parallel application performance. A key issue in
                 designing MPI over NoCs is communication protocol,
                 which has not been explored in previous research. This
                 article advocates a hardware-supported communication
                 mechanism using a protocol-adaptive approach to adjust
                 to varying NoC configurations (e.g., number of buffers)
                 and workload behavior (e.g., number of messages). We
                 propose the ADaptive Communication Mechanism (ADCM), a
                 hybrid protocol that involves behavior similar to
                 buffered communication when sufficient buffer is
                 available in the receiver to that similar to a
                 synchronous protocol when buffers in the receiver are
                 limited. ADCM adapts dynamically by deciding
                 communication protocol on a per-request basis using a
                 local estimate of recent buffer utilization. ADCM
                 attempts to combine both the advantages of buffered and
                 synchronous communication modes to achieve enhanced
                 throughput and performance. Simulations of various
                 workloads show that the proposed communication
                 mechanism can be effectively used in future NoC
                 designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Malik:2013:OSG,
  author =       "Avinash Malik and David Gregg",
  title =        "Orchestrating stream graphs using model checking",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512435",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article we use model checking to statically
                 distribute and schedule Synchronous DataFlow (SDF)
                 graphs on heterogeneous execution architectures. We
                 show that model checking is capable of providing an
                 optimal solution and it arrives at these solutions
                 faster (in terms of algorithm runtime) than equivalent
                 ILP formulations. Furthermore, we also show how
                 different types of optimizations such as task
                 parallelism, data parallelism, and state sharing can be
                 included within our framework. Finally, comparison of
                 our approach with the current state-of-the-art
                 heuristic techniques shows the pitfalls of these
                 techniques and gives a glimpse of how these heuristic
                 techniques can be improved.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2013:UML,
  author =       "Zheng Wang and Michael F. P. O'Boyle",
  title =        "Using machine learning to partition streaming
                 programs",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512436",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Stream-based parallel languages are a popular way to
                 express parallelism in modern applications. The
                 efficient mapping of streaming parallelism to today's
                 multicore systems is, however, highly dependent on the
                 program and underlying architecture. We address this by
                 developing a portable and automatic compiler-based
                 approach to partitioning streaming programs using
                 machine learning. Our technique predicts the ideal
                 partition structure for a given streaming application
                 using prior knowledge learned offline. Using the
                 predictor we rapidly search the program space (without
                 executing any code) to generate and select a good
                 partition. We applied this technique to standard
                 StreamIt applications and compared against existing
                 approaches. On a 4-core platform, our approach achieves
                 60\% of the best performance found by iteratively
                 compiling and executing over 3000 different partitions
                 per program. We obtain, on average, a 1.90$ \times $
                 speedup over the already tuned partitioning scheme of
                 the StreamIt compiler. When compared against a
                 state-of-the-art analytical, model-based approach, we
                 achieve, on average, a 1.77$ \times $ performance
                 improvement. By porting our approach to an 8-core
                 platform, we are able to obtain 1.8$ \times $
                 improvement over the StreamIt default scheme,
                 demonstrating the portability of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bakhoda:2013:DCN,
  author =       "Ali Bakhoda and John Kim and Tor M. Aamodt",
  title =        "Designing on-chip networks for throughput
                 accelerators",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512429",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As the number of cores and threads in throughput
                 accelerators such as Graphics Processing Units (GPU)
                 increases, so does the importance of on-chip
                 interconnection network design. This article explores
                 throughput-effective Network-on-Chips (NoC) for future
                 compute accelerators that employ Bulk-Synchronous
                 Parallel (BSP) programming models such as CUDA and
                 OpenCL. A hardware optimization is ``throughput
                 effective'' if it improves parallel application-level
                 performance per unit chip area. We evaluate performance
                 of future looking workloads using detailed closed-loop
                 simulations modeling compute nodes, NoC, and the DRAM
                 memory system. We start from a mesh design with
                 bisection bandwidth balanced to off-chip demand.
                 Accelerator workloads tend to demand high off-chip
                 memory bandwidth which results in a many-to-few traffic
                 pattern when coupled with expected technology
                 constraints of slow growth in pins-per-chip. Leveraging
                 these observations we reduce NoC area by proposing a
                 ``checkerboard'' NoC which alternates between
                 conventional full routers and half routers with limited
                 connectivity. Next, we show that increasing network
                 terminal bandwidth at the nodes connected to DRAM
                 controllers alleviates a significant fraction of the
                 remaining imbalance resulting from the many-to-few
                 traffic pattern. Furthermore, we propose a ``double
                 checkerboard inverted'' NoC organization which takes
                 advantage of channel slicing to reduce area while
                 maintaining the performance improvements of the
                 aforementioned techniques. This organization also has a
                 simpler routing mechanism and improves average
                 application throughput per unit area by 24.3\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jantz:2013:ESM,
  author =       "Michael R. Jantz and Prasad A. Kulkarni",
  title =        "Exploring single and multilevel {JIT} compilation
                 policy for modern machines",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "22:1--22:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541229",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic or Just-in-Time (JIT) compilation is essential
                 to achieve high-performance emulation for programs
                 written in managed languages, such as Java and C\#. It
                 has been observed that a conservative JIT compilation
                 policy is most effective to obtain good runtime
                 performance without impeding application progress on
                 single-core machines. At the same time, it is often
                 suggested that a more aggressive dynamic compilation
                 strategy may perform best on modern machines that
                 provide abundant computing resources, especially with
                 virtual machines (VMs) that are also capable of
                 spawning multiple concurrent compiler threads. However,
                 comprehensive research on the best JIT compilation
                 policy for such modern processors and VMs is currently
                 lacking. The goal of this work is to explore the
                 properties of single-tier and multitier JIT compilation
                 policies that can enable existing and future VMs to
                 realize the best program performance on modern
                 machines. In this work, we design novel experiments and
                 implement new VM configurations to effectively control
                 the compiler aggressiveness and optimization levels
                 (if and when methods are compiled) in the
                 industry-standard Oracle HotSpot Java VM to achieve
                 this goal. We find that the best JIT compilation policy
                 is determined by the nature of the application and the
                 speed and effectiveness of the dynamic compilers. We
                 extend earlier results showing the suitability of
                 conservative JIT compilation on single-core machines
                 for VMs with multiple concurrent compiler threads. We
                 show that employing the free compilation resources
                 (compiler threads and hardware cores) to aggressively
                 compile more program methods quickly reaches a point of
                 diminishing returns. At the same time, we also find
                 that using the free resources to reduce compiler queue
                 backup (compile selected hot methods early)
                 significantly benefits program performance, especially
                 for slower (highly optimizing) JIT compilers. For such
                 compilers, we observe that accurately prioritizing JIT
                 method compiles is crucial to realize the most
                 performance benefit with the smallest hardware budget.
                 Finally, we show that a tiered compilation policy,
                 although complex to implement, greatly alleviates the
                 impact of more and early JIT compilation of programs on
                 modern machines.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dong:2013:CAC,
  author =       "Xiangyu Dong and Norman P. Jouppi and Yuan Xie",
  title =        "A circuit-architecture co-optimization framework for
                 exploring nonvolatile memory hierarchies",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "23:1--23:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541230",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many new memory technologies are available for
                 building future energy-efficient memory hierarchies. It
                 is necessary to have a framework that can quickly find
                 the optimal memory technology at each hierarchy level.
                 In this work, we first build a circuit-architecture
                 joint design space exploration framework by combining
                 RC circuit analysis and Artificial Neural Network
                 (ANN)-based performance modeling. Then, we use this
                 framework to evaluate some emerging nonvolatile memory
                 hierarchies. We demonstrate that a Resistive RAM
                 (ReRAM)-based cache hierarchy on an 8-core
                 Chip-Multiprocessor (CMP) system can achieve a 24\%
                 Energy Delay Product (EDP) improvement and a 36\%
                 Energy Delay Area Product (EDAP) improvement compared
                 to a conventional hierarchy with SRAM on-chip caches
                 and DRAM main memory.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhao:2013:OGE,
  author =       "Jishen Zhao and Guangyu Sun and Gabriel H. Loh and
                 Yuan Xie",
  title =        "Optimizing {GPU} energy efficiency with {$3$D}
                 die-stacking graphics memory and reconfigurable memory
                 interface",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "24:1--24:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541231",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The performance of graphics processing unit (GPU)
                 systems is improving rapidly to accommodate the
                 increasing demands of graphics and high-performance
                 computing applications. With such a performance
                 improvement, however, power consumption of GPU systems
                 is dramatically increased. Up to 30\% of the total
                 power of a GPU system is consumed by the graphic memory
                 itself. Therefore, reducing graphics memory power
                 consumption is critical to mitigate the power
                 challenge. In this article, we propose an
                 energy-efficient reconfigurable 3D die-stacking
                 graphics memory design that integrates wide-interface
                 graphics DRAMs side-by-side with a GPU processor on a
                 silicon interposer. The proposed architecture is a
                 ``3D+2.5D'' system, where the DRAM memory itself is 3D
                 stacked memory with through-silicon via (TSV), whereas
                 the integration of DRAM and the GPU processor is
                 through the interposer solution (2.5D). Since GPU
                 computing units, memory controllers, and memory are all
                 integrated in the same package, the number of memory
                 I/Os is no longer constrained by the package's pin
                 count. We can reduce the memory power consumption by
                 scaling down the supply voltage and frequency of memory
                 interface while maintaining the same or even higher
                 peak memory bandwidth. In addition, we design a
                 reconfigurable memory interface that can dynamically
                 adapt to the requirements of various applications. We
                 propose two reconfiguration mechanisms to optimize the
                 GPU system energy efficiency and throughput,
                 respectively, and thus benefit both memory-intensive
                 and compute-intensive applications. The experimental
                 results show that the proposed GPU memory architecture
                 can effectively improve GPU system energy efficiency by
                 21\%, without reconfiguration. The reconfigurable
                 memory interface can further improve the system energy
                 efficiency by 26\%, and system throughput by 31\% under
                 a capped system power budget of 240W.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2013:EMT,
  author =       "Chien-Chi Chen and Sheng-De Wang",
  title =        "An efficient multicharacter transition string-matching
                 engine based on the {Aho--Corasick} algorithm",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "25:1--25:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541232",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A string-matching engine capable of inspecting
                 multiple characters in parallel can multiply the
                 throughput. However, the space required for
                 implementing a matching engine that can process
                 multiple characters in parallel generally grows
                 exponentially with respect to the characters to be
                 processed in parallel. Based on the Aho--Corasick
                 algorithm (AC-algorithm), this work presents a novel
                 multicharacter transition Nondeterministic Finite
                 Automaton (NFA) approach, called multicharacter AC-NFA,
                 to allow for the inspection of multiple characters in
                 parallel. This approach first converts an AC-trie to an
                 AC-NFA by allowing for the simultaneous activation of
                 multiple states and then converts the AC-NFA to a
                 $k$-character AC-NFA by an algorithm with concatenation
                 operations and assistant transitions. Additionally, the
                 alignment problem, which occurs while multiple
                 characters are being inspected in parallel, is solved
                 using assistant transitions. Moreover, a corresponding
                 output is provided for each inspected character by
                 introducing priority multiplexers to determine the
                 final matching outputs during implementation of the
                 multicharacter AC-NFA. Consequently, the number of
                 derived $k$-character transitions grows linearly with
                 respect to the number $k$. Furthermore, the derived
                 multicharacter AC-NFA is implemented on FPGAs for
                 evaluation. The resulting throughput grows
                 approximately 14 times and the hardware cost grows
                 about 18 times for 16-character AC-NFA implementation,
                 as compared with that for 1-character AC-NFA
                 implementation. The achievable throughput is 21.4Gbps
                 for the 16-character AC-NFA implementation operating at
                 a 167.36MHz clock.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Luo:2013:DIH,
  author =       "Yangchun Luo and Wei-Chung Hsu and Antonia Zhai",
  title =        "The design and implementation of heterogeneous
                 multicore systems for energy-efficient speculative
                 thread execution",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "26:1--26:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541233",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the emergence of multicore processors, various
                 aggressive execution models have been proposed to
                 exploit fine-grained thread-level parallelism, taking
                 advantage of the fast on-chip interconnection
                 communication. However, the aggressive nature of these
                 execution models often leads to excessive energy
                 consumption incommensurate to execution time reduction.
                 In the context of Thread-Level Speculation, we
                 demonstrated that on a same-ISA heterogeneous multicore
                 system, by dynamically deciding how on-chip resources
                 are utilized, speculative threads can achieve
                 performance gain in an energy-efficient way. Through a
                 systematic design space exploration, we built a
                 multicore architecture that integrates heterogeneous
                 components of processing cores and first-level caches.
                 To cope with processor reconfiguration overheads, we
                 introduced runtime mechanisms to mitigate their
                 impacts. To match program execution with the most
                 energy-efficient processor configuration, the system
                 was equipped with a dynamic resource allocation scheme
                 that characterizes program behaviors using novel
                 processor counters. We evaluated the proposed
                 heterogeneous system with a diverse set of benchmark
                 programs from SPEC CPU2000 and CPU2006 suites.
                 Compared to the most efficient homogeneous TLS
                 implementation, we achieved similar performance but
                 consumed 18\% less energy. Compared to the most
                 efficient homogeneous uniprocessor running sequential
                 programs, we improved performance by 29\% and reduced
                 energy consumption by 3.6\%, which is a 42\%
                 improvement in energy-delay-squared product.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Rolan:2013:VSC,
  author =       "Dyer Rol{\'a}n and Basilio B. Fraguela and Ram{\'o}n
                 Doallo",
  title =        "Virtually split cache: an efficient mechanism to
                 distribute instructions and data",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "27:1--27:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541234",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "First-level caches are usually split for both
                 instructions and data instead of unifying them in a
                 single cache. Although that approach eases the pipeline
                 design and provides a simple way to independently treat
                 data and instructions, its global hit rate is usually
                 smaller than that of a unified cache. Furthermore,
                 unified lower-level caches usually behave and process
                 memory requests disregarding whether they are data or
                 instruction requests. In this article, we propose a new
                 technique aimed to balance the amount of space devoted
                 to instructions and data for optimizing set-associative
                 caches: the Virtually Split Cache or VSC. Our technique
                 combines the sharing of resources from unified
                 approaches with the bandwidth and parallelism that
                 split configurations provide, thus reducing power
                 consumption while not degrading performance. Our design
                 dynamically adjusts cache resources devoted to
                 instructions and data depending on their particular
                 demand. Two VSC designs are proposed in order to track
                 the instructions and data requirements. The Shadow Tag
                 VSC (ST-VSC) is based on shadow tags that store the
                 last evicted line related to data and instructions in
                 order to determine how well the cache would work with
                 one more way per set devoted to each kind. The Global
                 Selector VSC (GS-VSC) uses a saturation counter that is
                 updated every time a cache miss occurs either under an
                 instruction or data request applying a duel-like
                 mechanism. Experiments with a variable and a fixed
                 latency VSC show that ST-VSC and GS-VSC reduce on
                 average the cache hierarchy power consumption by 29\%
                 and 24\%, respectively, with respect to a standard
                 baseline. As for performance, while the fixed latency
                 designs virtually match the split baseline in a
                 single-core system, a variable latency ST-VSC and
                 GS-VSC increase the average IPC by 2.5\% and 2\%,
                 respectively. In multicore systems, even the slower
                 fixed latency ST-VSC and GS-VSC designs improve the
                 baseline IPC by 3.1\% and 2.5\%, respectively, in a
                 four-core system thanks to the reduction in the
                 bandwidth demanded from the lower cache levels. This is
                 in contrast with many techniques that trade performance
                 degradation for power consumption reduction. VSC
                 particularly benefits embedded processors with a single
                 level of cache, where up to an average 9.2\% IPC
                 improvement is achieved. Interestingly, we also find
                 that partitioning the LLC for instructions and data can
                 improve performance around 2\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Subramaniam:2013:UFC,
  author =       "Samantika Subramaniam and Simon C. Steely and Will
                 Hasenplaugh and Aamer Jaleel and Carl Beckmann and
                 Tryggve Fossum and Joel Emer",
  title =        "Using in-flight chains to build a scalable cache
                 coherence protocol",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "28:1--28:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541235",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As microprocessor designs integrate more cores,
                 scalability of cache coherence protocols becomes a
                 challenging problem. Most directory-based protocols
                 avoid races by using blocking tag directories that can
                 impact the performance of parallel applications. In
                 this article, we first quantitatively demonstrate that
                 state-of-the-art blocking protocols significantly
                 constrain throughput at large core counts for several
                 parallel applications. Nonblocking protocols address
                 this throughput concern at the expense of scalability
                 in the interconnection network or in the required
                 resource overheads. To address this concern, we enhance
                 nonblocking directory protocols by migrating the point
                 of service of responses. Our approach uses in-flight
                 chains of cores making parallel memory requests to
                 incorporate scalability while maintaining
                 high-throughput. The proposed cache coherence protocol,
                 called chained cache coherence, can outperform blocking
                 protocols by up to 20\% on scientific and 12\% on
                 commercial applications. It also has low resource
                 overheads and simple address ordering requirements
                 making it both a high-performance and scalable
                 protocol. Furthermore, in-flight chains provide a
                 scalable solution to building hierarchical and
                 nonblocking tag directories as well as optimize
                 communication latencies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sanchez:2013:MIP,
  author =       "Daniel S{\'a}nchez and Yiannakis Sazeides and Juan M.
                 Cebri{\'a}n and Jos{\'e} M. Garc{\'\i}a and Juan L.
                 Arag{\'o}n",
  title =        "Modeling the impact of permanent faults in caches",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "29:1--29:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541236",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The traditional performance cost benefits we have
                 enjoyed for decades from technology scaling are
                 challenged by several critical constraints including
                 reliability. Increases in static and dynamic variations
                 are leading to higher probability of parametric and
                 wear-out failures and are elevating reliability into a
                 prime design constraint. In particular, SRAM cells used
                 to build caches that dominate the processor area are
                 usually minimum sized and more prone to failure. It is
                 therefore of paramount importance to develop effective
                 methodologies that facilitate the exploration of
                 reliability techniques for caches. To this end, we
                 present an analytical model that can determine for a
                 given cache configuration, address trace, and random
                 probability of permanent cell failure the exact
                 expected miss rate and its standard deviation when
                 blocks with faulty bits are disabled. What
                 distinguishes our model is that it is fully analytical,
                 it avoids the use of fault maps, and yet, it is both
                 exact and simpler than previous approaches. The
                 analytical model is used to produce the miss-rate
                 trends (expected miss-rate) for future technology
                 nodes for both uncorrelated and clustered faults. Some
                 of the key findings based on the proposed model are (i)
                 block disabling has a negligible impact on the expected
                 miss-rate unless probability of failure is equal or
                 greater than 2.6e-4, (ii) the fault map methodology can
                 accurately calculate the expected miss-rate as long as
                 1,000 to 10,000 fault maps are used, and (iii) the
                 expected miss-rate for execution of parallel
                 applications increases with the number of threads and
                 is more pronounced for a given probability of failure
                 as compared to sequential execution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2013:APF,
  author =       "Sanghoon Lee and James Tuck",
  title =        "Automatic parallelization of fine-grained
                 metafunctions on a chip multiprocessor",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "30:1--30:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541237",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Due to the importance of reliability and security,
                 prior studies have proposed inlining metafunctions into
                 applications for detecting bugs and security
                 vulnerabilities. However, because these software
                 techniques add frequent, fine-grained instrumentation
                 to programs, they often incur large runtime overheads.
                 In this work, we consider an automatic thread
                 extraction technique for removing these fine-grained
                 checks from a main application and scheduling them on
                 helper threads. In this way, we can leverage the
                 resources available on a CMP to reduce the latency and
                 overhead of fine-grained checking codes. Our
                 parallelization strategy extracts metafunctions from a
                 single threaded application and executes them in
                 customized helper threads---threads constructed to mirror
                 relevant fragments of the main program's behavior in
                 order to keep communication and overhead low. To get
                 good performance, we consider optimizations that reduce
                 communication and balance work among many threads. We
                 evaluate our parallelization strategy on Mudflap, a
                 pointer-use checking tool in GCC. To show the benefits
                 of our technique, we compare it to a manually
                 parallelized version of Mudflap. We run our experiments
                 on an architectural simulator with support for fast
                 queueing operations. On a subset of SPECint 2000, our
                 automatically parallelized code using static load
                 balance is only 19\% slower, on average, than the
                 manually parallelized version on a simulated eight-core
                 system. In addition, our automatically parallelized
                 code using dynamic load balance is competitive, on
                 average, to the manually parallelized version on a
                 simulated eight-core system. Furthermore, all the
                 applications except parser achieve better speedups with
                 our automatic algorithms than with the manual approach.
                 Also, our approach introduces very little overhead in
                 the main program---it is kept under 100\%, which is more
                 than a 5.3$ \times $ reduction compared to serial
                 Mudflap.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dubach:2013:DMA,
  author =       "Christophe Dubach and Timothy M. Jones and Edwin V.
                 Bonilla",
  title =        "Dynamic microarchitectural adaptation using machine
                 learning",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "31:1--31:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541238",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Adaptive microarchitectures are a promising solution
                 for designing high-performance, power-efficient
                 microprocessors. They offer the ability to tailor
                 computational resources to the specific requirements of
                 different programs or program phases. They have the
                 potential to adapt the hardware cost-effectively at
                 runtime to any application's needs. However, one of the
                 key challenges is how to dynamically determine the best
                 architecture configuration at any given time, for any
                 new workload. This article proposes a novel control
                 mechanism based on a predictive model for
                 microarchitectural adaptivity control. This model is
                 able to efficiently control adaptivity by monitoring
                 the behaviour of an application's different phases at
                 runtime. We show that by using this model on SPEC 2000,
                 we double the energy\slash performance efficiency of
                 the processor when compared to the best static
                 configuration tuned for the whole benchmark suite. This
                 represents 74\% of the improvement available if we know
                 the best microarchitecture for each program phase ahead
                 of time. In addition, we present an extended analysis
                 of the best configurations found and show that the
                 overheads associated with the implementation of our
                 scheme have a negligible impact on performance and
                 power.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2013:CME,
  author =       "Long Chen and Yanan Cao and Zhao Zhang",
  title =        "{E$^3$CC}: a memory error protection scheme with novel
                 address mapping for subranked and low-power memories",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2541239",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This study presents and evaluates E$^3$CC (Enhanced
                 Embedded ECC), a full design and implementation of a
                 generic embedded ECC scheme that enables
                 power-efficient error protection for subranked memory
                 systems. It incorporates a novel address mapping scheme
                 called Biased Chinese Remainder Mapping (BCRM) to
                 resolve the address mapping issue for memories of page
                 interleaving, plus a simple and effective cache design
                 to reduce extra ECC traffic. Our evaluation using SPEC
                 CPU2006 benchmarks confirms the performance and power
                 efficiency of the E$^3$CC scheme for subranked
                 memories as well as conventional memories.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tian:2013:TBM,
  author =       "Yingying Tian and Samira M. Khan and Daniel A.
                 Jim{\'e}nez",
  title =        "Temporal-based multilevel correlating inclusive cache
                 replacement",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "33:1--33:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555290",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Inclusive caches have been widely used in Chip
                 Multiprocessors (CMPs) to simplify cache coherence.
                 However, they have poor performance compared with
                 noninclusive caches not only because of the limited
                 capacity of the entire cache hierarchy but also due to
                 ignorance of temporal locality of the Last-Level Cache
                 (LLC). Blocks that are highly referenced (referred to
                 as hot blocks) are always hit in higher-level caches
                 (e.g., L1 cache) and are rarely referenced in the LLC.
                 Therefore, they become replacement victims in the LLC.
                 Due to the inclusion property, blocks evicted from the
                 LLC have to also be invalidated from higher-level
                 caches. Invalidation of hot blocks from the entire
                 cache hierarchy introduces costly off-chip misses that
                 makes the inclusive cache perform poorly. Neither
                 blocks that are highly referenced in the LLC nor blocks
                 that are highly referenced in higher-level caches
                 should be the LLC replacement victims. We propose
                 temporal-based multilevel correlating cache replacement
                 for inclusive caches to evict blocks in the LLC that
                 are also not hot in higher-level caches using
                 correlated temporal information acquired from all
                 levels of a cache hierarchy with minimal overhead.
                 Invalidation of these blocks does not hurt the
                 performance. By contrast, replacing them as early as
                 possible with useful blocks helps improve cache
                 performance. Based on our experiments, in a dual-core
                 CMP, an inclusive cache with temporal-based multilevel
                 correlating cache replacement significantly outperforms
                 an inclusive cache with traditional LRU replacement by
                 yielding an average speedup of 12.7\%, which is
                 comparable to an enhanced noninclusive cache, while
                 requiring less than 1\% of storage overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Liu:2013:HSA,
  author =       "Qixiao Liu and Miquel Moreto and Victor Jimenez and
                 Jaume Abella and Francisco J. Cazorla and Mateo
                 Valero",
  title =        "Hardware support for accurate per-task energy metering
                 in multicore systems",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "34:1--34:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555291",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Accurately determining the energy consumed by each
                 task in a system will become of prominent importance in
                 future multicore-based systems because it offers
                 several benefits, including (i) better application
                 energy\slash performance optimizations, (ii) improved
                 energy-aware task scheduling, and (iii) energy-aware
                 billing in data centers. Unfortunately, existing
                 methods for energy metering in multicores fail to
                 provide accurate energy estimates for each task when
                 several tasks run simultaneously. This article makes a
                 case for accurate Per-Task Energy Metering (PTEM) based
                 on tracking the resource utilization and occupancy of
                 each task. Different hardware implementations with
                 different trade-offs between energy prediction accuracy
                 and hardware-implementation complexity are proposed.
                 Our evaluation shows that the energy consumed in a
                 multicore by each task can be accurately measured. For
                 a 32-core, 2-way, simultaneous multithreaded core
                 setup, PTEM reduces the average accuracy error from
                 more than 12\% when our hardware support is not used to
                 less than 4\% when it is used. The maximum observed
                 error for any task in the workload we used reduces from
                 58\% down to 9\% when our hardware support is used.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mehta:2013:TSS,
  author =       "Sanyam Mehta and Gautham Beeraka and Pen-Chung Yew",
  title =        "Tile size selection revisited",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "35:1--35:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555292",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Loop tiling is a widely used loop transformation to
                 enhance data locality and allow data reuse. In the
                 tiled code, however, tiles of different sizes can lead
                 to significant variation in performance. Thus,
                 selection of an optimal tile size is critical to
                 performance of tiled codes. In the past, tile size
                 selection has been attempted using both static
                 analytical and dynamic empirical (auto-tuning) models.
                 Past work using static models assumed a direct-mapped
                 cache for the purpose of analysis and thus proved to be
                 less robust. On the other hand, the auto-tuning models
                 involve an exhaustive search in a large space of tiled
                 codes. In this article, we propose a new analytical
                 model for tile size selection that leverages the high
                 set associativity in modern caches to minimize conflict
                 misses. Our tile size selection model targets data
                 reuse in multiple levels of cache. In addition, it
                 considers the interaction of tiling with the SIMD unit
                 in modern processors in estimating the optimal tile
                 size. We find that these factors, not considered in
                 previous models, are critical in developing a robust
                 model for tile size selection. We implement our tile
                 size selection model in a polyhedral compiler and test
                 it on 12 benchmark kernels using two different problem
                 sizes. Our model outperforms the previous analytical
                 models that are based on reusing data in a single level
                 of cache and achieves an average performance
                 improvement of 9.7\% and 20.4\%, respectively, over the
                 best square (cubic) tiles for the two problem sizes. In
                 addition, the tile size chosen by our tile size
                 selection algorithm is similar to the best performing
                 size obtained through an extensive search, validating
                 the analytical model underlying the algorithm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Prisacari:2013:FPS,
  author =       "Bogdan Prisacari and German Rodriguez and Cyriel
                 Minkenberg and Torsten Hoefler",
  title =        "Fast pattern-specific routing for fat tree networks",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555293",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In the context of eXtended Generalized Fat Tree (XGFT)
                 topologies, widely used in HPC and datacenter network
                 designs, we propose a generic method, based on Integer
                 Linear Programming (ILP), to efficiently determine
                 optimal routes for arbitrary workloads. We propose a
                 novel approach that combines ILP with dynamic
                 programming, effectively reducing the time to solution.
                 Specifically, we divide the network into smaller
                 subdomains optimized using a custom ILP formulation
                 that ensures global optimality of local solutions.
                 Local solutions are then combined into an optimal
                 global solution using dynamic programming. Finally, we
                 demonstrate through a series of extensive benchmarks
                 that our approach scales in practice to networks
                 interconnecting several thousands of nodes, using a
                 single-threaded, freely available linear programming
                 solver on commodity hardware, with the potential for
                 higher scalability by means of commercial, parallel
                 solvers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Breughe:2013:SRB,
  author =       "Maximilien B. Breughe and Lieven Eeckhout",
  title =        "Selecting representative benchmark inputs for
                 exploring microprocessor design spaces",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "37:1--37:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555294",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The design process of a microprocessor requires
                 representative workloads to steer the search process
                 toward an optimum design point for the target
                 application domain. However, considering a broad set of
                 workloads to cover the large space of potential
                 workloads is infeasible given how time-consuming design
                 space exploration typically is. Hence, it is crucial to
                 select a small yet representative set of workloads,
                 which leads to a shorter design cycle while yielding a
                 (near) optimal design. Prior work has mostly looked
                 into selecting representative benchmarks; however,
                 limited attention was given to the selection of
                 benchmark inputs and how this affects workload
                 representativeness during design space exploration.
                 Using a set of 1,000 inputs for a number of embedded
                 benchmarks and a design space with around 1,700 design
                 points, we find that selecting a single or three random
                 input(s) per benchmark potentially (in a worst-case
                 scenario) leads to a suboptimal design that is 56\% and
                 33\% off, on average, relative to the optimal design in
                 our design space in terms of Energy-Delay Product
                 (EDP). We then propose and evaluate a number of methods
                 for selecting representative inputs and show that we
                 can find the optimum design point with as few as three
                 inputs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kerschbaumer:2013:IFT,
  author =       "Christoph Kerschbaumer and Eric Hennigan and Per
                 Larsen and Stefan Brunthaler and Michael Franz",
  title =        "Information flow tracking meets just-in-time
                 compilation",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "38:1--38:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555295",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Web applications are vulnerable to cross-site
                 scripting attacks that enable data thefts. Information
                 flow tracking in web browsers can prevent communication
                 of sensitive data to unintended recipients and thereby
                 stop such data thefts. Unfortunately, existing
                 solutions have focused on incorporating information
                 flow into browsers' JavaScript interpreters, rather
                 than just-in-time compilers, rendering the resulting
                 performance noncompetitive. Few users will switch to a
                 safer browser if it comes at the cost of significantly
                 degrading web application performance. We present the
                 first information flow tracking JavaScript engine that
                 is based on a true just-in-time compiler, and that
                 thereby outperforms all previous interpreter-based
                 information flow tracking JavaScript engines by more
                 than a factor of two. Our JIT-based engine (i) has the
                 same coverage as previous interpreter-based solutions,
                 (ii) requires reasonable implementation effort, and
                 (iii) introduces new optimizations to achieve
                 acceptable performance. When evaluated against three
                 industry-standard JavaScript benchmark suites, there is
                 still an average slowdown of 73\% over engines that do
                 not support information flow, but this is now well
                 within the range that many users will find an
                 acceptable price for obtaining substantially increased
                 security.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nasre:2013:TSE,
  author =       "Rupesh Nasre",
  title =        "Time- and space-efficient flow-sensitive points-to
                 analysis",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555296",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compilation of real-world programs often requires
                 hours. The term nightly build known to industrial
                 researchers is an artifact of long compilation times.
                 Our goal is to reduce the absolute analysis times for
                 large C codes (of the order of millions of lines).
                 Pointer analysis is one of the key analyses performed
                 during compilation. Its scalability is paramount to
                 achieve the efficiency of the overall compilation
                 process and its precision directly affects that of the
                 client analyses. In this work, we design a time- and
                 space-efficient flow-sensitive pointer analysis and
                 parallelize it on graphics processing units. Our
                 analysis proposes to use an extended bloom filter,
                 called multibloom, to store points-to information in an
                 approximate manner and develops an analysis in terms of
                 the operations over the multibloom. Since bloom filter
                 is a probabilistic data structure, we develop ways to
                 gain back the analysis precision. We achieve effective
                 parallelization by achieving memory coalescing,
                 reducing thread divergence, and improving load balance
                 across GPU warps. Compared to a state-of-the-art
                 sequential solution, our parallel version achieves a
                 7.8$ \times $ speedup with less than 5\% precision
                 loss on a suite of six large programs. Using two client
                 transformations, we show that this loss in precision
                 only minimally affects a client's precision.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ruan:2013:BTB,
  author =       "Wenjia Ruan and Yujie Liu and Michael Spear",
  title =        "Boosting timestamp-based transactional memory by
                 exploiting hardware cycle counters",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555297",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Time-based transactional memories typically rely on a
                 shared memory counter to ensure consistency.
                 Unfortunately, such a counter can become a bottleneck.
                 In this article, we identify properties of hardware
                 cycle counters that allow their use in place of a
                 shared memory counter. We then devise algorithms that
                 exploit the x86 cycle counter to enable bottleneck-free
                 transactional memory runtime systems. We also consider
                 the impact of privatization safety and hardware
                 ordering constraints on the correctness, performance,
                 and generality of our algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dey:2013:RMD,
  author =       "Tanima Dey and Wei Wang and Jack W. Davidson and Mary
                 Lou Soffa",
  title =        "{ReSense}: Mapping dynamic workloads of colocated
                 multithreaded applications using resource sensitivity",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "41:1--41:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555298",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To utilize the full potential of modern chip
                 multiprocessors and obtain scalable performance
                 improvements, it is critical to mitigate resource
                 contention created by multithreaded workloads. In this
                 article, we describe ReSense, the first runtime system
                 that uses application characteristics to dynamically
                 map multithreaded applications from dynamic
                 workloads---workloads where multithreaded applications
                 arrive, execute, and terminate continuously in
                 unpredictable ways. ReSense mitigates contention for
                 the shared resources in the memory hierarchy by
                 applying a novel thread-mapping algorithm that
                 dynamically adjusts the mapping of threads from dynamic
                 workloads using a precalculated sensitivity score. The
                 sensitivity score quantifies an application's
                 sensitivity to sharing a particular memory resource and
                 is calculated by an efficient characterization process
                 that involves running the multithreaded application by
                 itself on the target platform. To measure ReSense's
                 effectiveness, sensitivity scores were determined for
                 21 benchmarks from PARSEC-2.1 and NPB-OMP-3.3 for the
                 shared resources in the memory hierarchy on four
                 different platforms. Using three different-sized
                 dynamic workloads composed of randomly selected two,
                 four, and eight corunning benchmarks with randomly
                 selected start times, ReSense was able to improve the
                 average response time of the three workloads by up to
                 27.03\%, 20.89\%, and 29.34\% and throughput by up to
                 19.97\%, 46.56\%, and 29.86\%, respectively, over the
                 native OS on real hardware. By estimating and comparing
                 ReSense's effectiveness with the optimal thread mapping
                 for two different workloads, we found that the maximum
                 average difference with the experimentally determined
                 optimal performance was 1.49\% for average response
                 time and 2.08\% for throughput.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Armejach:2013:TIP,
  author =       "Adri{\`a} Armejach and Ruben Titos-Gil and Anurag Negi
                 and Osman S. Unsal and Adri{\'a}n Cristal",
  title =        "Techniques to improve performance in requester-wins
                 hardware transactional memory",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "42:1--42:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555299",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The simplicity of requester-wins Hardware
                 Transactional Memory (HTM) makes it easy to incorporate
                 in existing chip multiprocessors. Hence, such systems
                 are expected to be widely available in the near future.
                 Unfortunately, these implementations are prone to
                 suffer severe performance degradation due to transient
                 and persistent livelock conditions. This article shows
                 that existing techniques are unable to mitigate this
                 degradation effectively. It then proposes and evaluates
                 four novel techniques---two software-based that employ
                 information provided by the hardware and two that
                 require simple core-local hardware additions---which have
                 the potential to boost the performance of
                 requester-wins HTM designs substantially.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jeon:2013:RDR,
  author =       "Myeongjae Jeon and Conglong Li and Alan L. Cox and
                 Scott Rixner",
  title =        "Reducing {DRAM} row activations with eager read\slash
                 write clustering",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "43:1--43:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555300",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes and evaluates a new approach to
                 optimizing DRAM performance and energy consumption that
                 is based on eagerly writing dirty cache lines to DRAM.
                 Under this approach, many dirty cache lines are written
                 to DRAM before they are evicted. In particular, dirty
                 cache lines that have not been recently accessed are
                 eagerly written to DRAM when the corresponding row has
                 been activated by an ordinary, noneager access, such as
                 a read. This approach enables clustering of reads and
                 writes that target the same row, resulting in a
                 significant reduction in row activations. Specifically,
                 for a variety of applications, it reduces the number of
                 DRAM row activations by an average of 42\% and a
                 maximum of 82\%. Moreover, the results from a
                 full-system simulator show compelling performance
                 improvements and energy consumption reductions. Out of
                 23 applications, 6 have overall performance
                 improvements between 10\% and 20\%, and 3 have
                 improvements in excess of 20\%. Furthermore, 12 consume
                 between 10\% and 20\% less DRAM energy, and 7 have
                 energy consumption reductions in excess of 20\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhao:2013:HPP,
  author =       "Zhijia Zhao and Michael Bebenita and Dave Herman and
                 Jianhua Sun and Xipeng Shen",
  title =        "{HPar}: a practical parallel parser for {HTML} ---
                 taming {HTML} complexities for parallel parsing",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "44:1--44:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555301",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Parallelizing HTML parsing is challenging due to the
                 complexities of HTML documents and the inherent
                 dependencies in its parsing algorithm. As a result,
                 despite numerous studies in parallel parsing, HTML
                 parsing remains sequential today. It forms one of the
                 final barriers for fully parallelizing browser
                 operations to minimize the browser's response time---an
                 important variable for user experiences, especially on
                 portable devices. This article provides a comprehensive
                 analysis on the special complexities of parallel HTML
                 parsing and presents a systematic exploration in
                 overcoming those difficulties through specially
                 designed speculative parallelizations. This work
                 develops, to the best of our knowledge, the first
                 pipelining and data-level parallel HTML parsers. The
                 data-level parallel parser, named HPar, achieves up to
                 2.4$ \times $ speedup on quadcore devices. This work
                 demonstrates the feasibility of efficient, parallel
                 HTML parsing for the first time and offers a set of
                 novel insights for parallel HTML parsing.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 45, December 2013: OpenCL object detection on
%%% the Ivy Bridge heterogeneous on-chip architecture.  The bibsource
%%% field also names pvm.bib.
@Article{Totoni:2013:EFE,
  author =       "Ehsan Totoni and Mert Dikmen and Mar{\'\i}a Jes{\'u}s
                 Garzar{\'a}n",
  title =        "Easy, fast, and energy-efficient object detection on
                 heterogeneous on-chip architectures",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "45:1--45:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555302",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We optimize a visual object detection application
                 (that uses Vision Video Library kernels) and show that
                 OpenCL is a unified programming paradigm that can
                 provide high performance when running on the Ivy Bridge
                 heterogeneous on-chip architecture. We evaluate
                 different mapping techniques and show that running each
                 kernel where it fits the best and using software
                 pipelining can provide 1.91 times higher performance
                 and 42\% better energy efficiency. We also show how to
                 trade accuracy for energy at runtime. Overall, our
                 application can perform accurate object detection at 40
                 frames per second (fps) in an energy-efficient
                 manner.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 46, December 2013: ARI (Adaptive Replacement
%%% and Insertion), last-level cache management that targets both miss
%%% rate and writeback rate.
@Article{Fedorov:2013:AAL,
  author =       "Viacheslav V. Fedorov and Sheng Qiu and A. L.
                 Narasimha Reddy and Paul V. Gratz",
  title =        "{ARI}: Adaptive {LLC}-memory traffic management",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "46:1--46:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2543697",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Decreasing the traffic from the CPU LLC to main memory
                 is a very important issue in modern systems. Recent
                 work focuses on cache misses, overlooking the impact of
                 writebacks on the total memory traffic, energy
                 consumption, IPC, and so forth. Policies that foster a
                 balanced approach, between reducing write traffic to
                 memory and improving miss rates, can increase overall
                 performance and improve energy efficiency and memory
                 system lifetime for NVM memory technology, such as
                 phase-change memory (PCM). We propose Adaptive
                 Replacement and Insertion (ARI), an adaptive approach
                 to last-level CPU cache management, optimizing the two
                 parameters (miss rate and writeback rate)
                 simultaneously. Our specific focus is to reduce
                 writebacks as much as possible while maintaining or
                 improving the miss rate relative to conventional LRU
                 replacement policy. ARI reduces LLC writebacks by 33\%,
                 on average, while also decreasing misses by 4.7\%, on
                 average. In a typical system, this boosts IPC by 4.9\%,
                 on average, while decreasing energy consumption by
                 8.9\%. These results are achieved with minimal hardware
                 overheads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 47, December 2013: domain-based hardware
%%% specialization using custom instructions run in a configurable
%%% specialized functional unit (SFU).
@Article{Gonzalez-Alvarez:2013:AAD,
  author =       "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B.
                 Sartor and Carlos {\'A}lvarez and Daniel
                 Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout",
  title =        "Accelerating an application domain with specialized
                 functional units",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "47:1--47:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555303",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Hardware specialization has received renewed interest
                 recently as chips are hitting power limits. Chip
                 designers of traditional processor architectures have
                 primarily focused on general-purpose computing,
                 partially due to time-to-market pressure and simpler
                 design processes. But new power limits require some
                 chip specialization. Although hardware configured for a
                 specific application yields large speedups for
                 low-power dissipation, its design is more complex and
                 less reusable. We instead explore domain-based
                 specialization, a scalable approach that balances
                 hardware's reusability and performance efficiency. We
                 focus on specialization using customized compute units
                 that accelerate particular operations. In this article,
                 we develop automatic techniques to identify code
                 sequences from different applications within a domain
                 that can be targeted to a new custom instruction that
                 will be run inside a configurable specialized
                 functional unit (SFU). We demonstrate that using a
                 canonical representation of computations finds more
                 common code sequences among applications that can be
                 mapped to the same custom instruction, leading to
                 larger speedups while specializing a smaller core area
                 than previous pattern-matching techniques. We also
                 propose new heuristics to narrow the search space of
                 domain-specific custom instructions, finding those that
                 achieve the best performance across applications. We
                 estimate the overall performance achieved with our
                 automatic techniques using hardware models on a set of
                 nine media benchmarks, showing that when limiting the
                 core area devoted to specialization, the SFU
                 customization with the largest speedups includes both
                 application- and domain-specific custom instructions.
                 We demonstrate that exploring domain-specific hardware
                 acceleration is key to continued computing system
                 performance improvements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 48, December 2013: memory virtualization
%%% overhead (SP versus HAP) for 64-bit and 32-bit applications in
%%% virtual machines.
@Article{Wang:2013:RMM,
  author =       "Xiaolin Wang and Lingmei Weng and Zhenlin Wang and
                 Yingwei Luo",
  title =        "Revisiting memory management on virtualized
                 environments",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "48:1--48:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555304",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the evolvement of hardware, 64-bit Central
                 Processing Units (CPUs) and 64-bit Operating Systems
                 (OSs) have dominated the market. This article
                 investigates the performance of virtual memory
                 management of Virtual Machines (VMs) with a large
                 virtual address space in 64-bit OSs, which imposes
                 different pressure on memory virtualization than 32-bit
                 systems. Each of the two conventional memory
                 virtualization approaches, Shadowing Paging (SP) and
                 Hardware-Assisted Paging (HAP), causes different
                 overhead for different applications. Our experiments
                 show that 64-bit applications prefer to run in a VM
                 using SP, while 32-bit applications do not have a
                 uniform preference between SP and HAP. In this article,
                 we trace this inconsistency between 32-bit applications
                 and 64-bit applications to its root cause through a
                 systematic empirical study in Linux systems and
                 discover that the major overhead of SP results from
                 memory management in the 32-bit GNU C library (glibc).
                 We propose enhancements to the existing memory
                 management algorithms, which substantially reduce the
                 overhead of SP. Based on the evaluations using SPEC
                 CPU2006, Parsec 2.1, and cloud benchmarks, our results
                 show that SP, with the improved memory allocators, can
                 compete with HAP in almost all cases, in both 64-bit
                 and 32-bit systems. We conclude that without a
                 significant breakthrough in HAP, researchers should pay
                 more attention to SP, which is more flexible and cost
                 effective.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 49, December 2013: PCantorSim, a Cantor-set
%%% based sampling scheme for parallel simulation of multithreaded
%%% applications.
@Article{Jiang:2013:PAP,
  author =       "Chuntao Jiang and Zhibin Yu and Hai Jin and Chengzhong
                 Xu and Lieven Eeckhout and Wim Heirman and Trevor E.
                 Carlson and Xiaofei Liao",
  title =        "{PCantorSim}: Accelerating parallel architecture
                 simulation through fractal-based sampling",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "49:1--49:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555305",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Computer architects rely heavily on microarchitecture
                 simulation to evaluate design alternatives.
                 Unfortunately, cycle-accurate simulation is extremely
                 slow, being at least 4 to 6 orders of magnitude slower
                 than real hardware. This longstanding problem is
                 further exacerbated in the multi-/many-core era,
                 because single-threaded simulation performance has not
                 improved much, while the design space has expanded
                 substantially. Parallel simulation is a promising
                 approach, yet does not completely solve the simulation
                 challenge. Furthermore, existing sampling techniques,
                 which are widely used for single-threaded applications,
                 do not readily apply to multithreaded applications as
                 thread interaction and synchronization must now be
                 taken into account. This work presents PCantorSim, a
                 novel Cantor set (a classic fractal)--based sampling
                 scheme to accelerate parallel simulation of
                 multithreaded applications. Through the use of the
                 proposed methodology, only less than 5\% of an
                 application's execution time is simulated in detail. We
                 have implemented our approach in Sniper (a parallel
                 multicore simulator) and evaluated it by running the
                 PARSEC benchmarks on a simulated 8-core system. The
                 results show that PCantorSim increases simulation speed
                 over detailed parallel simulation by a factor of 20$
                 \times $, on average, with an average absolute
                 execution time prediction error of 5.3\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 50, December 2013: transaction coalescing, a
%%% profile-guided compiler optimization for software transactional
%%% memory.  The bibsource field also names hash.bib.
@Article{Stipic:2013:PGT,
  author =       "Srdan Stipi{\'c} and Vesna Smiljkovi{\'c} and Osman
                 Unsal and Adri{\'a}n Cristal and Mateo Valero",
  title =        "Profile-guided transaction coalescing---lowering
                 transactional overheads by merging transactions",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "50:1--50:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555306",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Previous studies in software transactional memory
                 mostly focused on reducing the overhead of
                 transactional read and write operations. In this
                 article, we introduce transaction coalescing, a
                 profile-guided compiler optimization technique that
                 attempts to reduce the overheads of starting and
                 committing a transaction by merging two or more small
                 transactions into one large transaction. We develop a
                 profiling tool and a transaction coalescing heuristic
                 to identify candidate transactions suitable for
                 coalescing. We implement a compiler extension to
                 automatically merge the candidate transactions at the
                 compile time. We evaluate the effectiveness of our
                 technique using the hash table micro-benchmark and the
                 STAMP benchmark suite. Transaction coalescing improves
                 the performance of the hash table significantly and the
                 performance of Vacation and SSCA2 benchmarks by 19.4\%
                 and 36.4\%, respectively, when running with 12
                 threads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 51, December 2013: WADE, writeback-aware
%%% last-level cache management for NVM-based main memory.
@Article{Wang:2013:WWA,
  author =       "Zhe Wang and Shuchang Shan and Ting Cao and Junli Gu
                 and Yi Xu and Shuai Mu and Yuan Xie and Daniel A.
                 Jim{\'e}nez",
  title =        "{WADE}: Writeback-aware dynamic cache management for
                 {NVM}-based main memory system",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555307",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Emerging Non-Volatile Memory (NVM) technologies are
                 explored as potential alternatives to traditional
                 SRAM/DRAM-based memory architecture in future
                 microprocessor design. One of the major disadvantages
                 for NVM is the latency and energy overhead associated
                 with write operations. Mitigation techniques to
                 minimize the write overhead for NVM-based main memory
                 architecture have been studied extensively. However,
                 most prior work focuses on optimization techniques for
                 NVM-based main memory itself, with little attention
                 paid to cache management policies for the Last-Level
                 Cache (LLC). In this article, we propose a
                 Writeback-Aware Dynamic CachE (WADE) management
                 technique to help mitigate the write overhead in
                 NVM-based memory. The proposal is based on the
                 observation that, when dirty cache blocks are evicted
                 from the LLC and written into NVM-based memory (with
                 PCM as an example), the long latency and high energy
                 associated with write operations to NVM-based memory
                 can cause system performance/power degradation. Thus,
                 reducing the number of writeback requests from the LLC
                 is critical. The proposed WADE cache management
                 technique tries to keep highly reused dirty cache
                 blocks in the LLC. The technique predicts blocks that
                 are frequently written back in the LLC. The LLC sets
                 are dynamically partitioned into a frequent writeback
                 list and a nonfrequent writeback list. It keeps a best
                 size of each list in the LLC. Our evaluation shows that
                 the technique can reduce the number of writeback
                 requests by 16.5\% for memory-intensive single-threaded
                 benchmarks and 10.8\% for multicore workloads. It
                 yields a geometric mean speedup of 5.1\% for
                 single-thread applications and 7.6\% for multicore
                 workloads. Due to the reduced number of writeback
                 requests to main memory, the technique reduces the
                 energy consumption by 8.1\% for single-thread
                 applications and 7.6\% for multicore workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 52, December 2013: C1C, a configurable,
%%% compiler-guided STT-RAM L1 cache built on a dual-mode STT memory
%%% cell.
@Article{Li:2013:CCC,
  author =       "Yong Li and Yaojun Zhang and Hai Li and Yiran Chen and
                 Alex K. Jones",
  title =        "{C1C}: a configurable, compiler-guided {STT-RAM L1}
                 cache",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555308",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Spin-Transfer Torque RAM (STT-RAM), a promising
                 alternative to SRAM for reducing leakage power
                 consumption, has been widely studied to mitigate the
                 impact of its asymmetrically long write latency.
                 Recently, STT-RAM has been proposed for L1 caches by
                 relaxing the data retention time to improve write
                 performance and dynamic energy. However, as the
                 technology scales down from 65nm to 22nm, the
                 performance of the read operation scales poorly due to
                 reduced sense margins and sense amplifier delays. In
                 this article, we leverage a dual-mode STT memory cell
                 to design a configurable L1 cache architecture termed
                 C1C to mitigate read performance barriers with
                 technology scaling. Guided by application access
                 characteristics discovered through novel compiler
                 analyses, the proposed cache adaptively switches
                 between a high performance and a low-power access mode.
                 Our evaluation demonstrates that the proposed cache
                 with compiler guidance outperforms a state-of-the-art
                 STT-RAM cache design by 9\% with high dynamic energy
                 efficiency, leading to significant performance/watt
                 improvements over several competing approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 53, December 2013: dynamic analysis of a
%%% Computational-Directed Acyclic Graph (CDAG) to characterize data
%%% locality potential beyond reuse distance analysis.
@Article{Fauzia:2013:BRD,
  author =       "Naznin Fauzia and Venmugil Elango and Mahesh
                 Ravishankar and J. Ramanujam and Fabrice Rastello and
                 Atanas Rountev and Louis-No{\"e}l Pouchet and P.
                 Sadayappan",
  title =        "Beyond reuse distance analysis: Dynamic analysis for
                 characterization of data locality potential",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555309",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Emerging computer architectures will feature
                 drastically decreased flops/byte (ratio of peak
                 processing rate to memory bandwidth) as highlighted by
                 recent studies on Exascale architectural trends.
                 Further, flops are getting cheaper, while the energy
                 cost of data movement is increasingly dominant. The
                 understanding and characterization of data locality
                 properties of computations is critical in order to
                 guide efforts to enhance data locality. Reuse distance
                 analysis of memory address traces is a valuable tool to
                 perform data locality characterization of programs. A
                 single reuse distance analysis can be used to estimate
                 the number of cache misses in a fully associative LRU
                 cache of any size, thereby providing estimates on the
                 minimum bandwidth requirements at different levels of
                 the memory hierarchy to avoid being bandwidth bound.
                 However, such an analysis only holds for the particular
                 execution order that produced the trace. It cannot
                 estimate potential improvement in data locality through
                 dependence-preserving transformations that change the
                 execution schedule of the operations in the
                 computation. In this article, we develop a novel
                 dynamic analysis approach to characterize the inherent
                 locality properties of a computation and thereby assess
                 the potential for data locality enhancement via
                 dependence-preserving transformations. The execution
                 trace of a code is analyzed to extract a
                 Computational-Directed Acyclic Graph (CDAG) of the data
                 dependences. The CDAG is then partitioned into convex
                 subsets, and the convex partitioning is used to reorder
                 the operations in the execution trace to enhance data
                 locality. The approach enables us to go beyond reuse
                 distance analysis of a single specific order of
                 execution of the operations of a computation in
                 characterization of its data locality properties. It
                 can serve a valuable role in identifying promising code
                 regions for manual transformation, as well as assessing
                 the effectiveness of compiler transformations for data
                 locality enhancement. We demonstrate the effectiveness
                 of the approach using a number of benchmarks, including
                 case studies where the potential shown by the analysis
                 is exploited to achieve lower data movement costs and
                 better performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 54, December 2013: a practical Data Filter
%%% Cache (DFC) accessed early in the pipeline, with multi-cycle line
%%% transfer.
@Article{Bardizbanyan:2013:DPD,
  author =       "Alen Bardizbanyan and Magnus Sj{\"a}lander and David
                 Whalley and Per Larsson-Edefors",
  title =        "Designing a practical data filter cache to improve
                 both energy efficiency and performance",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555310",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Conventional Data Filter Cache (DFC) designs improve
                 processor energy efficiency, but degrade performance.
                 Furthermore, the single-cycle line transfer suggested
                 in prior studies adversely affects Level-1 Data Cache
                 (L1 DC) area and energy efficiency. We propose a
                 practical DFC that is accessed early in the pipeline
                 and transfers a line over multiple cycles. Our DFC
                 design improves performance and eliminates a
                 substantial fraction of L1 DC accesses for loads, L1 DC
                 tag checks on stores, and data translation lookaside
                 buffer accesses for both loads and stores. Our
                 evaluation shows that the proposed DFC can reduce the
                 data access energy by 42.5\% and improve execution time
                 by 4.2\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 55, December 2013: GPU code generation with a
%%% heterogeneous pool of threads, demonstrated on an ODE-based systems
%%% biology application.
@Article{Hagiescu:2013:GCG,
  author =       "Andrei Hagiescu and Bing Liu and R. Ramanathan and
                 Sucheendra K. Palaniappan and Zheng Cui and Bipasa
                 Chattopadhyay and P. S. Thiagarajan and Weng-Fai Wong",
  title =        "{GPU} code generation for {ODE}-based applications
                 with phased shared-data access patterns",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "55:1--55:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555311",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present a novel code generation scheme for GPUs.
                 Its key feature is the platform-aware generation of a
                 heterogeneous pool of threads. This exposes more
                 data-sharing opportunities among the concurrent threads
                 and reduces the memory requirements that would
                 otherwise exceed the capacity of the on-chip memory.
                 Instead of the conventional strategy of focusing on
                 exposing as much parallelism as possible, our scheme
                 leverages on the phased nature of memory access
                 patterns found in many applications that exhibit
                 massive parallelism. We demonstrate the effectiveness
                 of our code generation strategy on a computational
                 systems biology application. This application consists
                 of computing a Dynamic Bayesian Network (DBN)
                 approximation of the dynamics of signalling pathways
                 described as a system of Ordinary Differential
                 Equations (ODEs). The approximation algorithm involves
                 (i) sampling many (of the order of a few million) times
                 from the set of initial states, (ii) generating
                 trajectories through numerical integration, and (iii)
                 storing the statistical properties of this set of
                 trajectories in Conditional Probability Tables (CPTs)
                 of a DBN via a prespecified discretization of the time
                 and value domains. The trajectories can be computed in
                 parallel. However, the intermediate data needed for
                 computing them, as well as the entries for the CPTs,
                 are too large to be stored locally. Our experiments
                 show that the proposed code generation scheme scales
                 well, achieving significant performance improvements on
                 three realistic signalling pathways models. These
                 results suggest how our scheme could be extended to
                 deal with other applications involving systems of
                 ODEs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2013:TLS,
  author =       "Junghee Lee and Chrysostomos Nicopoulos and Hyung Gyu
                 Lee and Jongman Kim",
  title =        "{TornadoNoC}: a lightweight and scalable on-chip
                 network architecture for the many-core era",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "56:1--56:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555312",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The rapid emergence of Chip Multi-Processors (CMP) as
                 the de facto microprocessor archetype has highlighted
                 the importance of scalable and efficient on-chip
                 networks. Packet-based Networks-on-Chip (NoC) are
                 gradually cementing themselves as the medium of choice
                 for the multi-/many-core systems of the near future,
                 due to their innate scalability. However, the
                 prominence of the debilitating power wall requires the
                 NoC to also be as energy efficient as possible. To
                 achieve these two antipodal requirements---scalability
                 and energy efficiency---we propose TornadoNoC, an
                 interconnect architecture that employs a novel flow
                 control mechanism. To prevent livelocks and deadlocks,
                 a sequence numbering scheme and a dynamic ring
                 inflation technique are proposed, and their correctness
                 formally proven. The primary objective of TornadoNoC is
                 to achieve substantial gains in (a) scalability to
                 many-core systems and (b) the area/power footprint, as
                 compared to current state-of-the-art router
                 implementations. The new router is demonstrated to
                 provide better scalability to hundreds of cores than an
                 ideal single-cycle wormhole implementation and other
                 scalability-enhanced low-cost routers. Extensive
                 simulations using both synthetic traffic patterns and
                 real applications running in a full-system simulator
                 corroborate the efficacy of the proposed design.
                 Finally, hardware synthesis analysis using commercial
                 65nm standard-cell libraries indicates that the area
                 and power budgets of the new router are reduced by up
                 to 53\% and 58\%, respectively, as compared to existing
                 state-of-the-art low-cost routers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Strydis:2013:SAP,
  author =       "Christos Strydis and Robert M. Seepers and Pedro
                 Peris-Lopez and Dimitrios Siskos and Ioannis Sourdis",
  title =        "A system architecture, processor, and communication
                 protocol for secure implants",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "57:1--57:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555313",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Secure and energy-efficient communication between
                 Implantable Medical Devices (IMDs) and authorized
                 external users is attracting increasing attention these
                 days. However, there currently exists no systematic
                 approach to the problem, while solutions from
                 neighboring fields, such as wireless sensor networks,
                 are not directly transferable due to the peculiarities
                 of the IMD domain. This work describes an original,
                 efficient solution for secure IMD communication. A new
                 implant system architecture is proposed, where security
                 and main-implant functionality are made completely
                 decoupled by running the tasks onto two separate cores.
                 Wireless communication goes through a custom security
                 ASIP, called SISC (Smart-Implant Security Core), which
                 runs an energy-efficient security protocol. The
                 security core is powered by RF-harvested energy until
                 it performs external-reader authentication, providing
                 an elegant defense mechanism against battery
                 Denial-of-Service (DoS) and other, more common attacks.
                 The system has been evaluated based on a realistic case
                 study involving an artificial pancreas implant. When
                 synthesized for a UMC 90nm CMOS ASIC technology, our
                 system architecture achieves defense against
                 unauthorized accesses having zero energy cost, running
                 entity authentication through harvesting only 7.45 $
                 \mu $J of RF energy from the requesting entity. In all
                 other successfully authenticated accesses, our
                 architecture achieves secure data exchange without
                 affecting the performance of the main IMD
                 functionality, adding less than 1 per mille (1.3 mJ) to the
                 daily energy consumption of a typical implant. Compared
                 to a single-core, secure reference IMD, which would
                 still be more vulnerable to some types of attacks, our
                 secure system on chip (SoC) achieves high security
                 levels at 56\% energy savings and at an area overhead
                 of less than 15\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kim:2013:FMS,
  author =       "Wonsub Kim and Yoonseo Choi and Haewoo Park",
  title =        "Fast modulo scheduler utilizing patternized routes for
                 coarse-grained reconfigurable architectures",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "58:1--58:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555314",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Coarse-Grained Reconfigurable Architectures (CGRAs)
                 present a potential of high compute throughput with
                 energy efficiency. A CGRA consists of an array of
                 Functional Units (FUs), which communicate with each
                 other through an interconnect network containing
                 transmission nodes and register files. To achieve high
                 performance from the software solutions mapped onto
                 CGRAs, modulo scheduling of loops is generally
                 employed. One of the key challenges in modulo
                 scheduling for CGRAs is to explicitly handle routings
                 of operands from a source to a destination operation
                 through various routing resources. Existing modulo
                 schedulers for CGRAs are slow because finding a valid
                 routing is generally a searching problem over a large
                 space, even with the guidance of well-defined cost
                 metrics. Applications in traditional embedded
                 multimedia domains are regarded as relatively tolerant
                 to a slow compile time in exchange for a high-quality
                 solution. However, many rapidly growing domains of
                 applications, such as 3D graphics, require a fast
                 compilation. Entrances of CGRAs to these domains have
                 been blocked mainly due to their long compile time. We
                 attack this problem by utilizing patternized routes,
                 for which resources and time slots for a success can be
                 estimated in advance when a source operation is placed.
                 By conservatively reserving predefined resources at
                 predefined time slots, future routings originating from
                 the source operation are guaranteed. Experiments on a
                 real-world 3D graphics benchmark suite show that our
                 scheduler improves the compile time up to 6,000 times
                 while achieving an average 70\% throughputs of the
                 state-of-the-art CGRA modulo scheduler, the
                 Edge-centric Modulo Scheduler (EMS).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nuzman:2013:JTC,
  author =       "Dorit Nuzman and Revital Eres and Sergei Dyshel and
                 Marcel Zalmanovici and Jose Castanos",
  title =        "{JIT} technology with {C\slash C++}: Feedback-directed
                 dynamic recompilation for statically compiled
                 languages",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "59:1--59:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555315",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The growing gap between the advanced capabilities of
                 static compilers as reflected in benchmarking results
                 and the actual performance that users experience in
                 real-life scenarios makes client-side dynamic
                 optimization technologies imperative to the domain of
                 static languages. Dynamic optimization of software
                 distributed in the form of a platform-agnostic
                 Intermediate-Representation (IR) has been very
                 successful in the domain of managed languages, greatly
                 improving upon interpreted code, especially when online
                 profiling is used. However, can such feedback-directed
                 IR-based dynamic code generation be viable in the
                 domain of statically compiled, rather than interpreted,
                 languages? We show that fat binaries, which combine the
                 IR together with the statically compiled executable,
                 can provide a practical solution for software vendors,
                 allowing their software to be dynamically optimized
                 without the limitation of binary-level approaches,
                 which lack the high-level IR of the program, and
                 without the warm-up costs associated with the IR-only
                 software distribution approach. We describe and
                 evaluate the fat-binary-based runtime compilation
                 approach using SPECint2006, demonstrating that the
                 overheads it incurs are low enough to be successfully
                 surmounted by dynamic optimization. Building on Java
                 JIT technologies, our results already improve upon
                 common real-world usage scenarios, including very small
                 workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ramashekar:2013:ADA,
  author =       "Thejas Ramashekar and Uday Bondhugula",
  title =        "Automatic data allocation and buffer management for
                 multi-{GPU} machines",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "60:1--60:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2544100",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multi-GPU machines are being increasingly used in
                 high-performance computing. Each GPU in such a machine
                 has its own memory and does not share the address space
                 either with the host CPU or other GPUs. Hence,
                 applications utilizing multiple GPUs have to manually
                 allocate and manage data on each GPU. Existing works
                 that propose to automate data allocations for GPUs have
                 limitations and inefficiencies in terms of allocation
                 sizes, exploiting reuse, transfer costs, and
                 scalability. We propose a scalable and fully automatic
                 data allocation and buffer management scheme for affine
                 loop nests on multi-GPU machines. We call it the
                 Bounding-Box-based Memory Manager (BBMM). BBMM can
                 perform, at runtime, standard set operations like
                 union, intersection, and difference, and find subset and
                 superset relations on hyperrectangular regions of array
                 data (bounding boxes). It uses these operations along
                 with some compiler assistance to identify, allocate,
                 and manage data required by applications in terms of
                 disjoint bounding boxes. This allows it to (1) allocate
                 exactly or nearly as much data as is required by
                 computations running on each GPU, (2) efficiently track
                 buffer allocations and hence maximize data reuse across
                 tiles and minimize data transfer overhead, and (3) as
                 a result, maximize utilization of the combined
                 memory on multi-GPU machines. BBMM can work with any
                 choice of parallelizing transformations, computation
                 placement, and scheduling schemes, whether static or
                 dynamic. Experiments run on a four-GPU machine with
                 various scientific programs showed that BBMM reduces
                 data allocations on each GPU by up to 75\% compared to
                 current allocation schemes, yields performance of at
                 least 88\% of manually written code, and allows
                 excellent weak scaling.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Vandierendonck:2013:ADT,
  author =       "Hans Vandierendonck and George Tzenakis and Dimitrios
                 S. Nikolopoulos",
  title =        "Analysis of dependence tracking algorithms for task
                 dataflow execution",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "61:1--61:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555316",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Processor architecture has taken a turn toward
                 many-core processors, which integrate multiple
                 processing cores on a single chip to increase overall
                 performance, and there are no signs that this trend
                 will stop in the near future. Many-core processors are
                 harder to program than multicore and single-core
                 processors due to the need for writing parallel or
                 concurrent programs with high degrees of parallelism.
                 Moreover, many-cores have to operate in a mode of
                 strong scaling because of memory bandwidth constraints.
                 In strong scaling, increasingly finer-grain parallelism
                 must be extracted in order to keep all processing cores
                 busy. Task dataflow programming models have a high
                 potential to simplify parallel programming because they
                 alleviate the programmer from identifying precisely all
                 intertask dependences when writing programs. Instead,
                 the task dataflow runtime system detects and enforces
                 intertask dependences during execution based on the
                 description of memory accessed by each task. The
                 runtime constructs a task dataflow graph that captures
                 all tasks and their dependences. Tasks are scheduled to
                 execute in parallel, taking into account dependences
                 specified in the task graph. Several papers report
                 important overheads for task dataflow systems, which
                 severely limits the scalability and usability of such
                 systems. In this article, we study efficient schemes to
                 manage task graphs and analyze their scalability. We
                 assume a programming model that supports input, output,
                 and in/out annotations on task arguments, as well as
                 commutative in/out and reductions. We analyze the
                 structure of task graphs and identify versions and
                 generations as key concepts for efficient management of
                 task graphs. Then, we present three schemes to manage
                 task graphs building on graph representations,
                 hypergraphs, and lists. We also consider a fourth
                 edgeless scheme that synchronizes tasks using integers.
                 Analysis using microbenchmarks shows that the graph
                 representation is not always scalable and that the
                 edgeless scheme introduces least overhead in nearly all
                 situations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jeong:2013:EET,
  author =       "Yeonghun Jeong and Seongseok Seo and Jongeun Lee",
  title =        "Evaluator-executor transformation for efficient
                 pipelining of loops with conditionals",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "62:1--62:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555317",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Control divergence poses many problems in
                 parallelizing loops. While predicated execution is
                 commonly used to convert control dependence into data
                 dependence, it often incurs high overhead because it
                 allocates resources equally for both branches of a
                 conditional statement regardless of their execution
                 frequencies. For those loops with unbalanced
                 conditionals, we propose a software transformation that
                 divides a loop into two or three smaller loops so that
                 the condition is evaluated only in the first loop,
                 while the less frequent branch is executed in the
                 second loop in a way that is much more efficient than
                 in the original loop. To reduce the overhead of extra
                 data transfer caused by the loop fission, we also
                 present a hardware extension for a class of
                 Coarse-Grained Reconfigurable Architectures (CGRAs).
                 Our experiments using MiBench and computer vision
                 benchmarks on a CGRA demonstrate that our techniques
                 can improve the performance of loops over predicated
                 execution by up to 65\% (37.5\%, on average), when the
                 hardware extension is enabled. Without any hardware
                 modification, our software-only version can improve
                 performance by up to 64\% (33\%, on average), while
                 simultaneously reducing the energy consumption of the
                 entire CGRA including configuration and data memory by
                 22\%, on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Barik:2013:DNS,
  author =       "Rajkishore Barik and Jisheng Zhao and Vivek Sarkar",
  title =        "A decoupled non-{SSA} global register allocation using
                 bipartite liveness graphs",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "63:1--63:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2544101",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Register allocation is an essential optimization for
                 all compilers. A number of sophisticated register
                 allocation algorithms have been developed over the
                 years. The two fundamental classes of register
                 allocation algorithms used in modern compilers are
                 based on Graph Coloring (GC) and Linear Scan (LS).
                 However, these two algorithms have fundamental
                 limitations in terms of precision. For example, the key
                 data structure used in GC-based algorithms, the
                 interference graph, lacks information on the program
                 points at which two variables may interfere. The
                 LS-based algorithms make local decisions regarding
                 spilling, and thereby trade off global optimization for
                 reduced compile-time and space overheads. Recently,
                 researchers have proposed Static Single Assignment
                 (SSA)-based decoupled register allocation algorithms
                 that exploit the live-range split points of the SSA
                 representation to optimally solve the spilling problem.
                 However, SSA-based register allocation often requires
                 extra complexity in repairing register assignments
                 during SSA elimination and in addressing architectural
                 constraints such as aliasing and ABI encoding; this
                 extra overhead can be prohibitively expensive in
                 dynamic compilation contexts. This article proposes a
                 decoupled non-SSA--based global register allocation
                 algorithm for dynamic compilation. It addresses the
                 limitations in current algorithms by introducing a
                 Bipartite Liveness Graph (BLG)-based register
                 allocation algorithm that models the spilling phase as
                 an optimization problem on the BLG itself and the
                 assignment phase as a separate optimization problem.
                 Advanced register allocation optimizations such as move
                 coalescing, live-range splitting, and register class
                 handling are also performed along with the spilling and
                 assignment phases. In the presence of register classes,
                 we propose a bucket-based greedy heuristic for
                 assignment that strikes a balance between spill-cost
                 and register class constraints. We present experimental
                 evaluation of our BLG-based register allocation
                 algorithm and compare it with production-quality
                 register allocators in Jikes RVM and LLVM.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Gavin:2013:RIF,
  author =       "Peter Gavin and David Whalley and Magnus
                 Sj{\"a}lander",
  title =        "Reducing instruction fetch energy in multi-issue
                 processors",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "64:1--64:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555318",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The need to minimize power while maximizing
                 performance has led to recent developments of powerful
                 superscalar designs targeted at embedded and portable
                 use. Instruction fetch is responsible for a significant
                 fraction of microprocessor power and energy, and is
                 therefore an attractive target for architectural power
                 optimization. We present novel techniques that take
                 advantage of guarantees so that the instruction
                 translation lookaside buffer, branch target buffer, and
                 branch prediction buffer can frequently be disabled,
                 reducing their energy usage, while simultaneously
                 reducing branch predictor contention. These techniques
                 require no changes to the instruction set and can
                 easily be integrated into most single- and
                 multiple-issue processors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Anonymous:2013:LDR,
  author =       "Anonymous",
  title =        "List of distinguished reviewers {ACM TACO}",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "65:1--65:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2560216",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:44 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "65",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Goel:2014:SPR,
  author =       "Neeraj Goel and Anshul Kumar and Preeti Ranjan Panda",
  title =        "Shared-port register file architecture for low-energy
                 {VLIW} processors",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:32",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2533397",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We propose a reduced-port Register File (RF)
                 architecture for reducing RF energy in a VLIW
                 processor. With port reduction, RF ports need to be
                 shared among Function Units (FUs), which may lead to
                 access conflicts, and thus, reduced performance. Our
                 solution includes (i) a carefully designed RF-FU
                 interconnection network that permits port sharing with
                 minimum conflicts and without any delay/energy
                 overheads, and (ii) a novel scheduling and binding
                 algorithm that reduces the performance penalty. With
                 our solution, we observed as much as 83\% RF energy
                 savings with no more than a 10\% loss in performance
                 for a set of MediaBench and MiBench benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2014:IPD,
  author =       "Zheng Wang and Georgios Tournavitis and Bj{\"o}rn
                 Franke and Michael F. P. O'Boyle",
  title =        "Integrating profile-driven parallelism detection and
                 machine-learning-based mapping",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:26",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579561",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compiler-based auto-parallelization is a much-studied
                 area but has yet to find widespread application. This
                 is largely due to the poor identification and
                 exploitation of application parallelism, resulting in
                 disappointing performance far below that which a
                 skilled expert programmer could achieve. We have
                 identified two weaknesses in traditional parallelizing
                 compilers and propose a novel, integrated approach
                 resulting in significant performance improvements of
                 the generated parallel code. Using profile-driven
                 parallelism detection, we overcome the limitations of
                 static analysis, enabling the identification of more
                 application parallelism, and only rely on the user for
                 final approval. We then replace the traditional
                 target-specific and inflexible mapping heuristics with
                 a machine-learning-based prediction mechanism,
                 resulting in better mapping decisions while automating
                 adaptation to different target architectures. We have
                 evaluated our parallelization strategy on the NAS and
                 SPEC CPU2000 benchmarks and two different multicore
                 platforms (dual quad-core Intel Xeon SMP and
                 dual-socket QS20 Cell blade). We demonstrate that our
                 approach not only yields significant improvements when
                 compared with state-of-the-art parallelizing compilers
                 but also comes close to and sometimes exceeds the
                 performance of manually parallelized codes. On average,
                 our methodology achieves 96\% of the performance of the
                 hand-tuned OpenMP NAS and SPEC parallel benchmarks on
                 the Intel Xeon platform and gains a significant speedup
                 for the IBM Cell platform, demonstrating the potential
                 of profile-guided and machine-learning-based
                 parallelization for complex multicore platforms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Samadi:2014:LGU,
  author =       "Mehrzad Samadi and Amir Hormati and Janghaeng Lee and
                 Scott Mahlke",
  title =        "Leveraging {GPUs} using cooperative loop speculation",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:26",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579617",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Graphics processing units, or GPUs, provide TFLOPs of
                 additional performance potential in commodity computer
                 systems that frequently go unused by most applications.
                 Even with the emergence of languages such as CUDA and
                 OpenCL, programming GPUs remains a difficult challenge
                 for a variety of reasons, including the inherent
                 algorithmic characteristics and data structure choices
                 used by applications as well as the tedious performance
                 optimization cycle that is necessary to achieve high
                 performance. The goal of this work is to increase the
                 applicability of GPUs beyond CUDA/OpenCL to implicitly
                 data-parallel applications written in C/C++ using
                 speculative parallelization. To achieve this goal, we
                 propose Paragon: a static/dynamic compiler platform to
                 speculatively run possibly data-parallel portions of
                 sequential applications on the GPU while cooperating
                 with the system CPU. For such loops, Paragon utilizes
                 the GPU in an opportunistic way while orchestrating a
                 cooperative relation between the CPU and GPU to reduce
                 the overhead of miss-speculations. Paragon monitors the
                 dependencies for the loops running speculatively on the
                 GPU and nonspeculatively on the CPU using a lightweight
                 distributed conflict detection designed specifically
                 for GPUs, and transfers the execution to the CPU in
                 case a conflict is detected. Paragon resumes the
                 execution on the GPU after the CPU resolves the
                 dependency. Our experiments show that Paragon achieves
                 4x on average and up to 30x speedup compared to unsafe
                 CPU execution with four threads and 7x on average and
                 up to 64x speedup versus sequential execution across a
                 set of sequential but implicitly data-parallel
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2014:EAC,
  author =       "Jue Wang and Xiangyu Dong and Yuan Xie and Norman P.
                 Jouppi",
  title =        "Endurance-aware cache line management for non-volatile
                 caches",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:24",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579671",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Nonvolatile memories (NVMs) have the potential to
                 replace low-level SRAM or eDRAM on-chip caches because
                 NVMs save standby power and provide large cache
                 capacity. However, limited write endurance is a common
                 problem for NVM technologies, and today's cache
                 management might result in unbalanced cache write
                 traffic, causing heavily written cache blocks to fail
                 much earlier than others. Although wear-leveling
                 techniques for NVM-based main memories exist, we cannot
                 simply apply them to NVM-based caches. This is because
                 cache writes have intraset variations as well as
                 interset variations, while writes to main memories only
                 have interset variations. To solve this problem, we
                 propose i$^2$WAP, a new cache management policy that
                 can reduce both inter- and intraset write variations.
                 i$^2$WAP has two features: Swap-Shift, an enhancement
                 based on existing main memory wear leveling to reduce
                 cache interset write variations, and Probabilistic Set
                 Line Flush, a novel technique to reduce cache intraset
                 write variations. Implementing i$^2$WAP only needs two
                 global counters and two global registers. In one of our
                 studies, i$^2$WAP can improve the NVM cache lifetime
                 by 75\% on average and up to 224\%. We also validate
                 that i$^2$WAP is effective in systems with different
                 cache configurations and workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Liu:2014:BBS,
  author =       "Lei Liu and Zehan Cui and Yong Li and Yungang Bao and
                 Mingyu Chen and Chengyong Wu",
  title =        "{{BPM\slash BPM+}}: Software-based dynamic memory
                 partitioning mechanisms for mitigating {DRAM}
                 bank-\slash channel-level interferences in multicore
                 systems",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:28",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579672",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The main memory system is a shared resource in modern
                 multicore machines that can result in serious
                 interference leading to reduced throughput and
                 unfairness. Many new memory scheduling mechanisms have
                 been proposed to address the interference problem.
                 However, these mechanisms usually employ relative
                 complex scheduling logic and need modifications to
                 Memory Controllers (MCs), which incur expensive
                 hardware design and manufacturing overheads. This
                 article presents a practical software approach to
                 effectively eliminate the interference without any
                 hardware modifications. The key idea is to modify the
                 OS memory management system and adopt a
                 page-coloring-based Bank-level Partitioning Mechanism
                 (BPM) that allocates dedicated DRAM banks to each core
                 (or thread). By using BPM, memory requests from
                 distinct programs are segregated across multiple memory
                 banks to promote locality/fairness and reduce
                 interference. We further extend BPM to BPM+ by
                 incorporating channel-level partitioning, on which we
                 demonstrate additional gain over BPM in many cases. To
                 achieve benefits in the presence of diverse application
                 memory needs and avoid performance degradation due to
                 resource underutilization, we propose a dynamic
                 mechanism upon BPM/BPM+ that assigns appropriate
                 bank/channel resources based on application
                 memory/bandwidth demands monitored through PMU
                 (performance-monitoring unit) and a low-overhead OS
                 page table scanning process. We implement BPM/BPM+ in
                 Linux 2.6.32.15 kernel and evaluate the technique on
                 four-core and eight-core real machines by running a
                 large amount of randomly generated multiprogrammed and
                 multithreaded workloads. Experimental results show that
                 BPM/BPM+ can improve the overall system throughput by
                 4.7\%/5.9\%, on average, (up to 8.6\%/9.5\%) and reduce
                 the unfairness by an average of 4.2\%/6.1\% (up to
                 15.8\%/13.9\%).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Haubl:2014:TTE,
  author =       "Christian H{\"a}ubl and Christian Wimmer and Hanspeter
                 M{\"o}ssenb{\"o}ck",
  title =        "Trace transitioning and exception handling in a
                 trace-based {JIT} compiler for {Java}",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:26",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579673",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Trace-based Just-In-Time (JIT) compilation generates
                 machine code for frequently executed paths (so-called
                 traces) instead of whole methods. While this has
                 several advantages, it complicates invocation of
                 compiled traces as well as exception handling, so that
                 previous trace-based compilers limited the way in which
                 traces could be invoked. We present a significantly
                 enhanced trace-based compiler where arbitrary
                 transitions between interpreted and compiled traces are
                 possible. For that, we introduce suitable trace calling
                 conventions and extend exception handling to work both
                 within traces and across trace boundaries. Furthermore,
                 we use the recorded trace information for optimizations
                 and combine the tracing ideas with ideas from
                 partial-method compilation to avoid code bloat. An
                 extensive evaluation with the benchmark suites DaCapo
                 9.12 Bach and SPECjvm2008 shows that our trace-based
                 compiler achieves up to 59\% higher peak performance
                 than the method-based Java HotSpot client compiler. On
                 a few benchmarks, our fairly simple trace-based
                 compiler shows a higher peak performance than the Java
                 HotSpot server compiler, which is one of today's best
                 optimizing JIT compilers for Java.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Huang:2014:HHH,
  author =       "Yongbing Huang and Licheng Chen and Zehan Cui and Yuan
                 Ruan and Yungang Bao and Mingyu Chen and Ninghui Sun",
  title =        "{HMTT}: a hybrid hardware\slash software tracing
                 system for bridging the {DRAM} access trace's semantic
                 gap",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "7:1--7:25",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579668",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "DRAM access traces (i.e., off-chip memory references)
                 can be extremely valuable for the design of memory
                 subsystems and performance tuning of software. Hardware
                 snooping on the off-chip memory interface is an
                 effective and nonintrusive approach to monitoring and
                 collecting real-life DRAM accesses. However, compared
                 with software-based approaches, hardware snooping
                 approaches typically lack semantic information, such as
                 process/function/object identifiers, virtual addresses,
                 and lock contexts, that is essential to the complete
                 understanding of the systems and software under
                 investigation. In this article, we propose a hybrid
                 hardware/software mechanism that is able to collect
                 off-chip memory reference traces with semantic
                 information. We have designed and implemented a
                 prototype system called HMTT (Hybrid Memory Trace
                 Tool), which uses a custom-made DIMM connector to
                 collect off-chip memory references and a high-level
                 event-encoding scheme to correlate semantic information
                 with memory references. In addition to providing
                 complete, undistorted DRAM access traces, the proposed
                 system is also able to perform various types of
                 low-overhead profiling, such as object-relative
                 accesses and multithread lock accesses.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2014:AWA,
  author =       "Quan Chen and Minyi Guo",
  title =        "Adaptive workload-aware task scheduling for
                 single-{ISA} asymmetric multicore architectures",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "8:1--8:25",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579674",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Single-ISA Asymmetric Multicore (AMC) architectures
                 have shown high performance as well as power
                 efficiency. However, current parallel programming
                 environments do not perform well on AMC because they
                 are designed for symmetric multicore architectures in
                 which all cores provide equal performance. Their random
                 task scheduling policies can result in unbalanced
                 workloads in AMC and severely degrade the performance
                 of parallel applications. To balance the workloads of
                 parallel applications in AMC, this article proposes an
                 adaptive Workload-Aware Task Scheduler (WATS) that
                 consists of a history-based task allocator and a
                 preference-based task scheduler. The history-based task
                 allocator is based on a near-optimal, static task
                 allocation using the historical statistics collected
                 during the execution of a parallel application. The
                 preference-based task scheduler, which schedules tasks
                 based on a preference list, can dynamically adjust the
                 workloads in AMC if the task allocation is less optimal
                 due to approximation in the history-based task
                 allocator. Experimental results show that WATS can
                 improve both the performance and energy efficiency of
                 task-based applications, with the performance gain up
                 to 66.1\% compared with traditional task schedulers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Savrun-Yeniceri:2014:EHI,
  author =       "G{\"u}lfem Savrun-Yeni{\c{c}}eri and Wei Zhang and
                 Huahan Zhang and Eric Seckler and Chen Li and Stefan
                 Brunthaler and Per Larsen and Michael Franz",
  title =        "Efficient hosted interpreters on the {JVM}",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "9:1--9:24",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2532642",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:08:33 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/2532642",
  abstract =     "Many guest languages are implemented using the Java
                 Virtual Machine (JVM) as a host environment. There are
                 two major implementation choices: custom compilers and
                 so-called hosted interpreters. Custom compilers are
                 complex to build but offer good performance. Hosted
                 interpreters are comparatively simpler to implement but
                 until now have suffered from poor performance.\par

                 We studied the performance of hosted interpreters and
                 identified common bottlenecks preventing their
                 efficient execution. First, similar to interpreters
                 written in C/C++, instruction dispatch is expensive on
                 the JVM. Second, Java's semantics require expensive
                 runtime exception checks that negatively affect array
                 performance essential to interpreters.\par

                 We present two optimizations targeting these
                 bottlenecks and show that the performance of optimized
                 interpreters increases dramatically: we report speedups
                 by a factor of up to 2.45 over the Jython interpreter,
                 3.57 over the Rhino interpreter, and 2.52 over the
                 JRuby interpreter, respectively. The resulting
                 performance is comparable with that of custom
                 compilers. Our optimizations are enabled by a few
                 simple annotations that require only modest
                 implementation effort; in return, performance increases
                 substantially.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Nair:2014:RPD,
  author =       "Prashant J. Nair and Chia-Chen Chou and Moinuddin K.
                 Qureshi",
  title =        "Refresh pausing in {DRAM} memory systems",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "10:1--10:26",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579669",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:08:33 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/2579669",
  abstract =     "Dynamic Random Access Memory (DRAM) cells rely on
                 periodic refresh operations to maintain data integrity.
                 As the capacity of DRAM memories has increased, so has
                 the amount of time consumed in doing refresh. Refresh
                 operations contend with read \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jothi:2014:TCF,
  author =       "Komal Jothi and Haitham Akkary",
  title =        "Tuning the continual flow pipeline architecture with
                 virtual register renaming",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "11:1--11:27",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579675",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Continual Flow Pipelines (CFPs) allow a processor core
                 to process hundreds of in-flight instructions without
                 increasing cycle-critical pipeline resources. When a
                 load misses the data cache, CFP checkpoints the
                 processor register state and then moves all
                 miss-dependent instructions into a low-complexity WB to
                 unblock the pipeline. Meanwhile, miss-independent
                 instructions execute normally and update the processor
                 state. When the miss data return, CFP replays the
                 miss-dependent instructions from the WB and then merges
                 the miss-dependent and miss-independent execution
                 results. CFP was initially proposed for cache misses to
                 DRAM. Later work focused on reducing the execution
                 overhead of CFP by avoiding the pipeline flush before
                 replaying miss-dependent instructions and executing
                 dependent and independent instructions concurrently.
                 The goal of these improvements was to gain performance
                 by applying CFP to L1 data cache misses that hit the
                 last level on chip cache. However, many applications or
                 execution phases of applications incur excessive amount
                 of replay and/or rollbacks to the checkpoint. This
                 frequently cancels benefits from CFP and reduces
                 performance. In this article, we improve the CFP
                 architecture by using a novel virtual register renaming
                 substrate and by tuning the replay policies to mitigate
                 excessive replays and rollbacks to the checkpoint. We
                 describe these new design optimizations and show, using
                 Spec 2006 benchmarks and microarchitecture performance
                 and power models of our design, that our Tuned-CFP
                 architecture improves performance and energy
                 consumption over previous CFP architectures by $\sim$10\%
                 and $\sim$8\%, respectively. We also demonstrate that our
                 proposed architecture gives better performance return
                 on energy per instruction compared to a conventional
                 superscalar as well as previous CFP architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Carle:2014:PAM,
  author =       "Thomas Carle and Dumitru Potop-Butucaru",
  title =        "Predicate-aware, makespan-preserving software
                 pipelining of scheduling tables",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "12:1--12:26",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579676",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:08:33 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/2579676",
  abstract =     "We propose a software pipelining technique adapted to
                 specific hard real-time scheduling problems. Our
                 technique optimizes both computation throughput and
                 execution cycle makespan, with makespan being
                 prioritary. It also takes advantage of the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kritikakou:2014:SNO,
  author =       "Angeliki Kritikakou and Francky Catthoor and Vasilios
                 Kelefouras and Costas Goutis",
  title =        "A scalable and near-optimal representation of access
                 schemes for memory management",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "13:1--13:25",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579677",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory management searches for the resources required
                 to store the concurrently alive elements. The solution
                 quality is affected by the representation of the
                 element accesses: a sub-optimal representation leads to
                 overestimation and a non-scalable representation
                 increases the exploration time. We propose a
                 methodology to near-optimal and scalable represent
                 regular and irregular accesses. The representation
                 consists of a set of pattern entries to compactly
                 describe the behavior of the memory accesses and of
                 pattern operations to consistently combine the pattern
                 entries. The result is a final sequence of pattern
                 entries which represents the global access scheme
                 without unnecessary overestimation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Leather:2014:AFG,
  author =       "Hugh Leather and Edwin Bonilla and Michael O'Boyle",
  title =        "Automatic feature generation for machine
                 learning--based optimising compilation",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "14:1--14:32",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2536688",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent work has shown that machine learning can
                 automate and in some cases outperform handcrafted
                 compiler optimisations. Central to such an approach is
                 that machine learning techniques typically rely upon
                 summaries or features of the program. The quality of
                 these features is critical to the accuracy of the
                 resulting machine learned algorithm; no machine
                 learning method will work well with poorly chosen
                 features. However, due to the size and complexity of
                 programs, theoretically there are an infinite number of
                 potential features to choose from. The compiler writer
                 now has to expend effort in choosing the best features
                 from this space. This article develops a novel
                 mechanism to automatically find those features that
                 most improve the quality of the machine learned
                 heuristic. The feature space is described by a grammar
                 and is then searched with genetic programming and
                 predictive modelling. We apply this technique to loop
                 unrolling in GCC 4.3.1 and evaluate our approach on a
                 Pentium 6. On a benchmark suite of 57 programs, GCCs
                 hard-coded heuristic achieves only 3\% of the maximum
                 performance available, whereas a state-of-the-art
                 machine learning approach with hand-coded features
                 obtains 59\%. Our feature generation technique is able
                 to achieve 76\% of the maximum available speedup,
                 outperforming existing approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kluter:2014:VWL,
  author =       "Theo Kluter and Samuel Burri and Philip Brisk and
                 Edoardo Charbon and Paolo Ienne",
  title =        "Virtual Ways: Low-Cost Coherence for Instruction Set
                 Extensions with Architecturally Visible Storage",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "15:1--15:26",
  month =        jul,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2576877",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:13:09 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Instruction set extensions (ISEs) improve the
                 performance and energy consumption of
                 application-specific processors. ISEs can use
                 architecturally visible storage (AVS), localized
                 compiler-controlled memories, to provide higher I/O
                 bandwidth than reading data from the processor
                 pipeline. AVS creates coherence and consistence
                 problems with the data cache. Although a hardware
                 coherence protocol could solve the problem, this
                 approach is costly for a single-processor system. As a
                 low-cost alternative, we introduce Virtual Ways, which
                 ensures coherence through a reduced form of inclusion
                 between the data cache and AVS. Virtual Ways achieve
                 higher performance and lower energy consumption than
                 using a hardware coherence protocol.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ren:2014:POE,
  author =       "Bin Ren and Todd Mytkowicz and Gagan Agrawal",
  title =        "A Portable Optimization Engine for Accelerating
                 Irregular Data-Traversal Applications on {SIMD}
                 Architectures",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "16:1--16:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632215",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 30 19:02:49 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Fine-grained data parallelism is increasingly common
                 in the form of longer vectors integrated with
                 mainstream processors (SSE, AVX) and various GPU
                 architectures. This article develops support for
                 exploiting such data parallelism for a class of
                 nonnumeric, nongraphic applications, which perform
                 computations while traversing many independent,
                 irregular data structures. We address this problem by
                 developing several novel techniques. First, for code
                 generation, we develop an intermediate language for
                 specifying such traversals, followed by a runtime
                 scheduler that maps traversals to various SIMD units.
                 Second, we observe that good data locality is crucial
                 to sustained performance from SIMD architectures,
                 whereas many applications that operate on irregular
                 data structures (e.g., trees and graphs) have poor data
                 locality. To address this challenge, we develop a set
                 of data layout optimizations that improve spatial
                 locality for applications that traverse many irregular
                 data structures. Unlike prior data layout
                 optimizations, our approach incorporates a notion of
                 both interthread and intrathread spatial reuse into
                 data layout. Finally, we enable performance portability
                 (i.e., the ability to automatically optimize
                 applications for different architectures) by accurately
                 modeling the impact of inter- and intrathread locality
                 on program performance. As a consequence, our model can
                 predict which data layout optimization to use on a wide
                 variety of SIMD architectures. To demonstrate the
                 efficacy of our approach and optimizations, we first
                 show how they enable up to a 12X speedup on one SIMD
                 architecture for a set of real-world applications. To
                 demonstrate that our approach enables performance
                 portability, we show how our model predicts the optimal
                 layout for applications across a diverse set of three
                 real-world SIMD architectures, which offers as much as
                 45\% speedup over a suboptimal solution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Qi:2014:VVG,
  author =       "Zhengwei Qi and Jianguo Yao and Chao Zhang and Miao Yu
                 and Zhizhou Yang and Haibing Guan",
  title =        "{VGRIS}: Virtualized {GPU} Resource Isolation and
                 Scheduling in Cloud Gaming",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "17:1--17:25",
  month =        jul,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632216",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:16:31 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To achieve efficient resource management on a graphics
                 processing unit (GPU), there is a demand to develop a
                 framework for scheduling virtualized resources in cloud
                 gaming. In this article, we propose VGRIS, a resource
                 management framework for virtualized GPU resource
                 isolation and scheduling in cloud gaming. A set of
                 application programming interfaces (APIs) is provided
                 so that a variety of scheduling algorithms can be
                 implemented within the framework without modifying the
                 framework itself. Three scheduling algorithms are
                 implemented by the APIs within VGRIS. Experimental
                 results show that VGRIS can effectively schedule GPU
                 resources among various workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Shen:2014:RSB,
  author =       "Bor-Yeh Shen and Wei-Chung Hsu and Wuu Yang",
  title =        "A Retargetable Static Binary Translator for the {ARM}
                 Architecture",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "18:1--18:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629335",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 30 19:02:49 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Machines designed with new but incompatible
                 Instruction Set Architecture (ISA) may lack proper
                 applications. Binary translation can address this
                 incompatibility by migrating applications from one
                 legacy ISA to a new one, although binary translation
                 has problems such as code discovery for variable-length
                 ISA and code location issues for handling indirect
                 branches. Dynamic Binary Translation (DBT) has been
                 widely adopted for migrating applications since it
                 avoids those problems. Static Binary Translation (SBT)
                 is a less general solution and has not been actively
                 researched. However, SBT performs more aggressive
                 optimizations, which could yield more compact code and
                 better code quality. Applications translated by SBT can
                 consume less memory, processor cycles, and power than
                 DBT and can be started more quickly. These advantages
                 are even more critical for embedded systems than for
                 general systems. In this article, we designed and
                 implemented a new SBT tool, called LLBT, which
                 translates ARM instructions into LLVM IRs and then
                 retargets the LLVM IRs to various ISAs, including x86,
                 x86-64, ARM, and MIPS. LLBT leverages two important
                 functionalities from LLVM: comprehensive optimizations
                 and retargetability. More importantly, LLBT solves the
                 code discovery problem for ARM/Thumb binaries without
                 resorting to interpretation. LLBT also effectively
                 reduced the size of the address mapping table, making
                 SBT a viable solution for embedded systems. Our
                 experiments based on the EEMBC benchmark suite show
                 that the LLBT-generated code can run more than $ 6
                 \times $ and $ 2.3 \times $ faster on average than
                 emulation with QEMU and HQEMU, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Gracia:2014:RLN,
  author =       "Dar{\'\i}o Su{\'a}rez Gracia and Alexandra
                 Ferrer{\'o}n and Luis Montesano {Del Campo} and Teresa
                 Monreal Arnal and V{\'\i}ctor Vi{\~n}als Y{\'u}fera",
  title =        "Revisiting {LP--NUCA} Energy Consumption: Cache Access
                 Policies and Adaptive Block Dropping",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "19:1--19:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632217",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 30 19:02:49 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Cache working-set adaptation is key as embedded
                 systems move to multiprocessor and Simultaneous
                 Multithreaded Architectures (SMT) because interthread
                 pollution harms system performance and battery life.
                 Light-Power NUCA (LP-NUCA) is a working-set adaptive
                 cache that depends on temporal-locality to save energy.
                 This work identifies the sources of energy waste in
                 LP-NUCAs: parallel access to the tag and data arrays of
                 the tiles and low locality phases with useless block
                 migration. To counteract both issues, we prove that
                 switching to serial access reduces energy without
                 harming performance and propose a machine learning
                 Adaptive Drop Rate (ADR) controller that minimizes the
                 amount of replacement and migration when locality is
                 low. This work demonstrates that these techniques
                 efficiently adapt the cache drop and access policies to
                 save energy. They reduce LP-NUCA consumption 22.7\% for
                 1SMT. With interthread cache contention in 2SMT, the
                 savings rise to 29\%. Versus a conventional
                 organization, energy--delay improves 20.8\% and 25\%
                 for 1- and 2SMT benchmarks, and, in 65\% of the 2SMT
                 mixes, gains are larger than 20\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Liang:2014:DCC,
  author =       "Zhibin Liang and Wei Zhang and Yung-Cheng Ma",
  title =        "Deadline-Constrained Clustered Scheduling for {VLIW}
                 Architectures using Power-Gated Register Files",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "20:1--20:26",
  month =        jul,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632218",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:18:32 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Designing energy-efficient Digital Signal Processor
                 (DSP) cores has become a key concern in embedded
                 systems development. This paper proposes an
                 energy-proportional computing scheme for Very Long
                 Instruction Word (VLIW) architectures. To make the
                 processor power scales with adapted parallelism, we
                 propose incorporating distributed Power-Gated Register
                 Files (PGRF) into VLIW to achieve a PGRF-VLIW
                 architecture. For energy efficiency, we also propose an
                 instruction scheduling algorithm called the
                 Deadline-Constrained Clustered Scheduling (DCCS)
                 algorithm. The algorithm clusters the data dependence
                 graph to reduce data transfer energy and makes optimal
                 use of low-powered local registers for tree-structured
                 data dependence graphs. The results of evaluations
                 conducted using the MiBench and DSPstone benchmark
                 suites substantiate the expected power saving and
                 scaling effects.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Fang:2014:PPA,
  author =       "Shuangde Fang and Zidong Du and Yuntan Fang and
                 Yuanjie Huang and Yang Chen and Lieven Eeckhout and
                 Olivier Temam and Huawei Li and Yunji Chen and
                 Chengyong Wu",
  title =        "Performance Portability Across Heterogeneous {SoCs}
                 Using a Generalized Library-Based Approach",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "21:1--21:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2608253",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 30 19:02:49 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Because of tight power and energy constraints,
                 industry is progressively shifting toward heterogeneous
                 system-on-chip (SoC) architectures composed of a mix of
                 general-purpose cores along with a number of
                 accelerators. However, such SoC architectures can be
                 very challenging to efficiently program for the vast
                 majority of programmers, due to numerous programming
                 approaches and languages. Libraries, on the other hand,
                 provide a simple way to let programmers take advantage
                 of complex architectures, which does not require
                 programmers to acquire new accelerator-specific or
                 domain-specific languages. Increasingly, library-based,
                 also called algorithm-centric, programming approaches
                 propose to generalize the usage of libraries and to
                 compose programs around these libraries, instead of
                 using libraries as mere complements. In this article,
                 we present a software framework for achieving
                 performance portability by leveraging a generalized
                 library-based approach. Inspired by the notion of a
                 component, as employed in software engineering and
                 HW/SW codesign, we advocate nonexpert programmers to
                 write simple wrapper code around existing libraries to
                 provide simple but necessary semantic information to
                 the runtime. To achieve performance portability, the
                 runtime employs machine learning (simulated annealing)
                 to select the most appropriate accelerator and its
                 parameters for a given algorithm. This selection
                 factors in the possibly complex composition of
                 algorithms used in the application, the communication
                 among the various accelerators, and the tradeoff
                 between different objectives (i.e., accuracy,
                 performance, and energy). Using a set of benchmarks run
                 on a real heterogeneous SoC composed of a multicore
                 processor and a GPU, we show that the runtime overhead
                 is fairly small at 5.1\% for the GPU and 6.4\% for the
                 multi-core. We then apply our accelerator selection
                 approach to a simulated SoC platform containing
                 multiple inexact accelerators. We show that accelerator
                 selection together with hardware parameter tuning
                 achieves an average 46.2\% energy reduction and a
                 speedup of 2.1$ \times $ while meeting the desired
                 application error target.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kaitoua:2014:HED,
  author =       "Abdulrahman Kaitoua and Hazem Hajj and Mazen A. R.
                 Saghir and Hassan Artail and Haitham Akkary and
                 Mariette Awad and Mageda Sharafeddine and Khaleel
                 Mershad",
  title =        "{Hadoop} Extensions for Distributed Computing on
                 Reconfigurable Active {SSD} Clusters",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "22:1--22:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2608199",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:18 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article, we propose new extensions to Hadoop
                 to enable clusters of reconfigurable active solid-state
                 drives (RASSDs) to process streaming data from SSDs
                 using FPGAs. We also develop an analytical model to
                 estimate the performance of RASSD clusters running
                 under Hadoop. Using the Hadoop RASSD platform and
                 network simulators, we validate our design and
                 demonstrate its impact on performance for different
                 workloads taken from Stanford's Phoenix MapReduce
                 project. Our results show that for a hardware
                 acceleration factor of 20$ \times $, compute-intensive
                 workloads processing 153MB of data can run up to 11$
                 \times $ faster than a standard Hadoop cluster.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2014:PSR,
  author =       "Jue Wang and Xiangyu Dong and Yuan Xie",
  title =        "Preventing {STT-RAM} Last-Level Caches from Port
                 Obstruction",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "23:1--23:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2633046",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many new nonvolatile memory (NVM) technologies have
                 been heavily studied to replace the power-hungry
                 SRAM/DRAM-based memory hierarchy in today's computers.
                 Among various emerging NVM technologies, Spin-Transfer
                 Torque RAM (STT-RAM) has many benefits, such as fast
                 read latency, low leakage power, and high density,
                 making it a promising candidate for last-level caches
                 (LLCs). However, STT-RAM write operation is
                 expensive. In particular, a long STT-RAM cache write
                 operation might obstruct other cache accesses and
                 result in severe performance degradation. Consequently,
                 how to mitigate STT-RAM write overhead is critical to
                 the success of STT-RAM adoption. In this article, we
                 propose an obstruction-aware cache management policy
                 called OAP. OAP monitors cache traffic, detects
                 LLC-obstructive processes, and differentiates the cache
                 accesses from different processes. Our experiment on a
                 four-core architecture with an 8MB STT-RAM L3 cache
                 shows a 14\% performance improvement and 64\% energy
                 reduction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Gonzalez-Mesa:2014:ETM,
  author =       "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L.
                 Zapata and Oscar Plata",
  title =        "Effective Transactional Memory Execution Management
                 for Improved Concurrency",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "24:1--24:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2633048",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes a transactional memory
                 execution model intended to exploit maximum parallelism
                 from sequential and multithreaded programs. A program
                 code section is partitioned into chunks that will be
                 mapped onto threads and executed transactionally. These
                 transactions run concurrently and out of order, trying
                 to exploit maximum parallelism but managed by a
                 specific fully distributed commit control to meet data
                 dependencies. To accomplish correct parallel execution,
                 a partial precedence order relation is derived from the
                 program code section and/or defined by the programmer.
                 When a conflict between chunks is eagerly detected, the
                 precedence order relation is used to determine the best
                 policy to solve the conflict that preserves the
                 precedence order while maximizing concurrency. The
                 model defines a new transactional state called executed
                 but not committed. This state allows exploiting
                 concurrency on two levels: intrathread and interthread.
                 Intrathread concurrency is improved by having pending
                 uncommitted transactions while executing a new one in
                 the same thread. The new state improves interthread
                 concurrency because it permits out-of-order transaction
                 commits regarding the precedence order. Our model has
                 been implemented in a lightweight software
                 transactional memory system, TinySTM, and has been
                 evaluated on a set of benchmarks obtaining an important
                 performance improvement over the baseline TM system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kumar:2014:EPG,
  author =       "Rakesh Kumar and Alejandro Mart{\'\i}nez and Antonio
                 Gonz{\'a}lez",
  title =        "Efficient Power Gating of {SIMD} Accelerators Through
                 Dynamic Selective Devectorization in an {HW\slash SW}
                 Codesigned Environment",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "25:1--25:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629681",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Leakage energy is a growing concern in current and
                 future microprocessors. Functional units of
                 microprocessors are responsible for a major fraction of
                 this energy. Therefore, reducing functional unit
                 leakage has received much attention in recent years.
                 Power gating is one of the most widely used techniques
                 to minimize leakage energy. Power gating turns off the
                 functional units during the idle periods to reduce the
                 leakage. Therefore, the amount of leakage energy
                 savings is directly proportional to the idle time
                 duration. This article focuses on increasing the idle
                 interval for the higher SIMD lanes. The applications
                 are profiled dynamically, in a hardware/software
                 codesigned environment, to find the higher SIMD lanes'
                 usage pattern. If the higher lanes need to be turned on
                 for small time periods, the corresponding portion of
                 the code is devectorized to keep the higher lanes off.
                 The devectorized code is executed on the lowest SIMD
                 lane. Our experimental results show that the average
                 energy savings of the proposed mechanism are 15\%,
                 12\%, and 71\% greater than power gating for
                 SPECFP2006, Physicsbench, and Eigen benchmark suites,
                 respectively. Moreover, the slowdown caused by
                 devectorization is negligible.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Carlo:2014:FAA,
  author =       {Stefano {Di Carlo} and Salvatore Galfano and Marco
                 Indaco and Paolo Prinetto and Davide Bertozzi and Piero
                 Olivo and Cristian Zambelli},
  title =        {{FLARES}: an Aging Aware Algorithm to Autonomously
                 Adapt the Error Correction Capability in {NAND} Flash
                 Memories},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {26},
  pages =        {26:1--26:??},
  doi =          {https://doi.org/10.1145/2631919},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {With the advent of solid-state storage systems, NAND
                 flash memories are becoming a key storage technology.
                 However, they suffer from serious reliability and
                 endurance issues during the operating lifetime that can
                 be handled by the use of appropriate error correction
                 codes (ECCs) in order to reconstruct the information
                 when needed. Adaptable ECCs may provide the flexibility
                 to avoid worst-case reliability design, thus leading to
                 improved performance. However, a way to control such
                 adaptable ECCs' strength is required. This article
                 proposes FLARES, an algorithm able to adapt the ECC
                 correction capability of each page of a flash based on
                 a flash RBER prediction model and on a measurement of
                 the number of errors detected in a given time window.
                 FLARES has been fully implemented within the YAFFS 2
                 filesystem under the Linux operating system. This
                 allowed us to perform an extensive set of simulations
                 on a set of standard benchmarks that highlighted the
                 benefit of FLARES on the overall storage subsystem
                 performances.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Bartolini:2014:AFG,
  author =       {Davide B. Bartolini and Filippo Sironi and Donatella
                 Sciuto and Marco D. Santambrogio},
  title =        {Automated Fine-Grained {CPU} Provisioning for Virtual
                 Machines},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {27},
  pages =        {27:1--27:??},
  doi =          {https://doi.org/10.1145/2637480},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Ideally, the pay-as-you-go model of Infrastructure as
                 a Service (IaaS) clouds should enable users to rent
                 just enough resources (e.g., CPU or memory bandwidth)
                 to fulfill their service level objectives (SLOs).
                 Achieving this goal is hard on current IaaS offers,
                 which require users to explicitly specify the amount of
                 resources to reserve; this requirement is nontrivial
                 for users, because estimating the amount of resources
                 needed to attain application-level SLOs is often
                 complex, especially when resources are virtualized and
                 the service provider colocates virtual machines (VMs)
                 on host nodes. For this reason, users who deploy VMs
                 subject to SLOs are usually prone to overprovisioning
                 resources, thus resulting in inflated business costs.
                 This article tackles this issue with AutoPro: a runtime
                 system that enhances IaaS clouds with automated and
                 fine-grained resource provisioning based on performance
                 SLOs. Our main contribution with AutoPro is filling the
                 gap between application-level performance SLOs and
                 allocation of a contended resource, without requiring
                 explicit reservations from users. In this article, we
                 focus on CPU bandwidth allocation to throughput-driven,
                 compute-intensive multithreaded applications colocated
                 on a multicore processor; we show that a theoretically
                 sound, yet simple, control strategy can enable
                 automated fine-grained allocation of this contended
                 resource, without the need for offline profiling.
                 Additionally, AutoPro helps service providers optimize
                 infrastructure utilization by provisioning idle
                 resources to best-effort workloads, so as to maximize
                 node-level utilization. Our extensive experimental
                 evaluation confirms that AutoPro is able to
                 automatically determine and enforce allocations to meet
                 performance SLOs while maximizing node-level
                 utilization by supporting batch workloads on a
                 best-effort basis.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  acknowledgement = ack-nhfb,
}

@article{Carlson:2014:EHL,
  author =       {Trevor E. Carlson and Wim Heirman and Stijn Eyerman
                 and Ibrahim Hur and Lieven Eeckhout},
  title =        {An Evaluation of High-Level Mechanistic Core Models},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {28},
  pages =        {28:1--28:??},
  doi =          {https://doi.org/10.1145/2629677},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Large core counts and complex cache hierarchies are
                 increasing the burden placed on commonly used
                 simulation and modeling techniques. Although analytical
                 models provide fast results, they do not apply to
                 complex, many-core shared-memory systems. In contrast,
                 detailed cycle-level simulation can be accurate but
                 also tends to be slow, which limits the number of
                 configurations that can be evaluated. A middle ground
                 is needed that provides for fast simulation of complex
                 many-core processors while still providing accurate
                 results. In this article, we explore, analyze, and
                 compare the accuracy and simulation speed of
                 high-abstraction core models as a potential solution to
                 slow cycle-level simulation. We describe a number of
                 enhancements to interval simulation to improve its
                 accuracy while maintaining simulation speed. In
                 addition, we introduce the instruction-window centric
                 (IW-centric) core model, a new mechanistic core model
                 that bridges the gap between interval simulation and
                 cycle-accurate simulation by enabling high-speed
                 simulations with higher levels of detail. We also show
                 that using accurate core models like these are
                 important for memory subsystem studies, and that
                 simple, naive models, like a one-IPC core model, can
                 lead to misleading and incorrect results and
                 conclusions in practical design studies. Validation
                 against real hardware shows good accuracy, with an
                 average single-core error of 11.1\% and a maximum of
                 18.8\% for the IW-centric model with a 1.5$ \times $
                 slowdown compared to interval simulation.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Hijaz:2014:NLN,
  author =       {Farrukh Hijaz and Omer Khan},
  title =        {{NUCA-L1}: a Non-Uniform Access Latency Level-1 Cache
                 Architecture for Multicores Operating at Near-Threshold
                 Voltages},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {29},
  pages =        {29:1--29:??},
  doi =          {https://doi.org/10.1145/2631918},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Research has shown that operating in the
                 near-threshold region is expected to provide up to 10$
                 \times $ energy efficiency for future processors.
                 However, reliable operation below a minimum voltage
                 (Vccmin) cannot be guaranteed due to process
                 variations. Because SRAM margins can easily be violated
                 at near-threshold voltages, their bit-cell failure
                 rates are expected to rise steeply. Multicore
                 processors rely on fast private L1 caches to exploit
                 data locality and achieve high performance. In the
                 presence of high bit-cell fault rates, traditionally an
                 L1 cache either sacrifices capacity or incurs
                 additional latency to correct the faults. We observe
                 that L1 cache sensitivity to hit latency offers a
                 design trade-off between capacity and latency. When
                 fault rate is high at extreme Vccmin, it is beneficial
                 to recover L1 cache capacity, even if it comes at the
                 cost of additional latency. However, at low fault
                 rates, the additional constant latency to recover cache
                 capacity degrades performance. With this trade-off in
                 mind, we propose a Non-Uniform Cache Access L1
                 architecture (NUCA-L1) that avoids additional latency
                 on accesses to fault-free cache lines. To mitigate the
                 capacity bottleneck, it deploys a correction mechanism
                 to recover capacity at the cost of additional latency.
                 Using extensive simulations of a 64-core multicore, we
                 demonstrate that at various bit-cell fault rates, our
                 proposed private NUCA-L1 cache architecture performs
                 better than state-of-the-art schemes, along with a
                 significant reduction in energy consumption.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Drebes:2014:TAD,
  author =       {Andi Drebes and Karine Heydemann and Nathalie Drach
                 and Antoniu Pop and Albert Cohen},
  title =        {Topology-Aware and Dependence-Aware Scheduling and
                 Memory Allocation for Task-Parallel Languages},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {30},
  pages =        {30:1--30:??},
  doi =          {https://doi.org/10.1145/2641764},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {We present a joint scheduling and memory allocation
                 algorithm for efficient execution of task-parallel
                 programs on non-uniform memory architecture (NUMA)
                 systems. Task and data placement decisions are based on
                 a static description of the memory hierarchy and on
                 runtime information about intertask communication.
                 Existing locality-aware scheduling strategies for
                 fine-grained tasks have strong limitations: they are
                 specific to some class of machines or applications,
                 they do not handle task dependences, they require
                 manual program annotations, or they rely on fragile
                 profiling schemes. By contrast, our solution makes no
                 assumption on the structure of programs or on the
                 layout of data in memory. Experimental results, based
                 on the OpenStream language, show that locality of
                 accesses to main memory of scientific applications can
                 be increased significantly on a 64-core machine,
                 resulting in a speedup of up to 1.63$ \times $ compared
                 to a state-of-the-art work-stealing scheduler.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tawa:2014:EEF,
  author =       "Venkata Kalyan Tavva and Ravi Kasha and Madhu Mutyam",
  title =        "{EFGR}: an Enhanced Fine Granularity Refresh Feature
                 for High-Performance {DDR4 DRAM} Devices",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "31:1--31:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2656340",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High-density DRAM devices spend significant time
                 refreshing the DRAM cells, leading to performance drop.
                 The JEDEC DDR4 standard provides a Fine Granularity
                 Refresh (FGR) feature to tackle refresh. Motivated by
                 the observation that in FGR mode, only a few banks are
                 involved, we propose an Enhanced FGR (EFGR) feature
                 that introduces three optimizations to the basic FGR
                 feature and exposes the bank-level parallelism within
                 the rank even during the refresh. The first
                 optimization decouples the nonrefreshing banks. The
                 second and third optimizations determine the maximum
                 number of nonrefreshing banks that can be active during
                 refresh and selectively precharge the banks before
                 refresh, respectively. Our simulation results show that
                 the EFGR feature is able to recover almost 56.6\% of
                 the performance loss incurred due to refresh
                 operations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  remark =       "The first author's surname is Tavva. The citation
                 label retains the earlier misspelling Tawa so that
                 existing citations of this entry continue to resolve.",
}

@article{Yalcin:2014:EEC,
  author =       {Gulay Yalcin and Oguz Ergin and Emrah Islek and Osman
                 Sabri Unsal and Adrian Cristal},
  title =        {Exploiting Existing Comparators for Fine-Grained
                 Low-Cost Error Detection},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {32},
  pages =        {32:1--32:??},
  doi =          {https://doi.org/10.1145/2656341},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Fault tolerance has become a fundamental concern in
                 computer design, in addition to performance and power.
                 Although several error detection schemes have been
                 proposed to discover a faulty core in the system, these
                 proposals could waste the whole core, including many
                 error-free structures in it after error detection.
                 Moreover, many fault-tolerant designs require
                 additional hardware for data replication or for
                 comparing the replicated data. In this study, we
                 provide a low-cost, fine-grained error detection scheme
                 by exploiting already existing comparators and data
                 replications in the several pipeline stages such as
                 issue queue, rename logic, and translation lookaside
                 buffer. We reduce the vulnerability of the source
                 register tags in IQ by 60\%, the vulnerability of
                 instruction TLB by 64\%, the vulnerability of data TLB
                 by 45\%, and the vulnerability of the register tags of
                 rename logic by 20\%.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Ramachandran:2014:HFR,
  author =       {Pradeep Ramachandran and Siva Kumar Sastry Hari and
                 Manlap Li and Sarita V. Adve},
  title =        {Hardware Fault Recovery for {I/O} Intensive
                 Applications},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {33},
  pages =        {33:1--33:??},
  doi =          {https://doi.org/10.1145/2656342},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {With continued process scaling, the rate of hardware
                 failures in commodity systems is increasing. Because
                 these commodity systems are highly sensitive to cost,
                 traditional solutions that employ heavy redundancy to
                 handle such failures are no longer acceptable owing to
                 their high associated costs. Detecting such faults by
                 identifying anomalous software execution and recovering
                 through checkpoint-and-replay is emerging as a viable
                 low-cost alternative for future commodity systems. An
                 important but commonly ignored aspect of such solutions
                 is ensuring that external outputs to the system are
                 fault-free. The outputs must be delayed until the
                 detectors guarantee this, influencing fault-free
                 performance. The overheads for resiliency must thus be
                 evaluated while taking these delays into consideration;
                 prior work has largely ignored this relationship. This
                 article concerns recovery for I/O intensive
                 applications from in-core faults. We present a strategy
                 to buffer external outputs using dedicated hardware and
                 show that checkpoint intervals previously considered as
                 acceptable incur exorbitant overheads when hardware
                 buffering is considered. We then present two techniques
                 to reduce the checkpoint interval and demonstrate a
                 practical solution that provides high resiliency while
                 incurring low overheads.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Eyerman:2014:MTM,
  author =       {Stijn Eyerman and Pierre Michaud and Wouter Rogiest},
  title =        {Multiprogram Throughput Metrics: a Systematic
                 Approach},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2014},
  month =        oct,
  volume =       {11},
  number =       {3},
  articleno =    {34},
  pages =        {34:1--34:??},
  doi =          {https://doi.org/10.1145/2663346},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Running multiple programs on a processor aims at
                 increasing the throughput of that processor. However,
                 defining meaningful throughput metrics in a simulation
                 environment is not as straightforward as reporting
                 execution time. This has led to an ongoing debate on
                 what forms a meaningful throughput metric for
                 multiprogram workloads. We present a method to
                 construct throughput metrics in a systematic way: we
                 start by expressing assumptions on job size, job
                 distribution, scheduling, and so forth that together
                 define a theoretical throughput experiment. The
                 throughput metric is then the average throughput of
                 this experiment. Different assumptions lead to
                 different metrics, so one should be aware of these
                 assumptions when making conclusions based on results
                 using a specific metric. Throughput metrics should
                 always be defined from explicit assumptions, because
                 this leads to a better understanding of the
                 implications and limits of the results obtained with
                 that metric. We elaborate multiple metrics based on
                 different assumptions. In particular, we identify the
                 assumptions that lead to the commonly used weighted
                 speedup and harmonic mean of speedups. Our study
                 clarifies that they are actual throughput metrics,
                 which was recently questioned. We also propose some new
                 throughput metrics, which cannot always be expressed as
                 a closed formula. We use real experimental data to
                 characterize metrics and show how they relate to each
                 other.},
  bibdate =      {Mon Oct 27 17:02:20 MDT 2014},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Nugteren:2015:BAS,
  author =       {Cedric Nugteren and Henk Corporaal},
  title =        {{Bones}: an Automatic Skeleton-Based {C-to-CUDA}
                 Compiler for {GPUs}},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2015},
  month =        jan,
  volume =       {11},
  number =       {4},
  articleno =    {35},
  pages =        {35:1--35:??},
  doi =          {https://doi.org/10.1145/2665079},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {The shift toward parallel processor architectures has
                 made programming and code generation increasingly
                 challenging. To address this programmability challenge,
                 this article presents a technique to fully
                 automatically generate efficient and readable code for
                 parallel processors (with a focus on GPUs). This is
                 made possible by combining algorithmic skeletons,
                 traditional compilation, and ``algorithmic species,'' a
                 classification of program code. Compilation starts by
                 automatically annotating C code with class information
                 (the algorithmic species). This code is then fed into
                 the skeleton-based source-to-source compiler bones to
                 generate CUDA code. To generate efficient code, bones
                 also performs optimizations including host-accelerator
                 transfer optimization and kernel fusion. This results
                 in a unique approach, integrating a skeleton-based
                 compiler for the first time into an automated flow. The
                 benefits are demonstrated experimentally for PolyBench
                 GPU kernels, showing geometric mean speed-ups of 1.4$
                 \times $ and 2.4$ \times $ compared to ppcg and
                 Par4All, and for five Rodinia GPU benchmarks, showing a
                 gap of only 1.2$ \times $ compared to hand-optimized
                 code.},
  bibdate =      {Mon Jan 12 11:38:56 MST 2015},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Wang:2015:BOM,
  author =       {Jue Wang and Xiangyu Dong and Yuan Xie},
  title =        {Building and Optimizing {MRAM}-Based Commodity
                 Memories},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2015},
  month =        jan,
  volume =       {11},
  number =       {4},
  articleno =    {36},
  pages =        {36:1--36:??},
  doi =          {https://doi.org/10.1145/2667105},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Emerging non-volatile memory technologies such as MRAM
                 are promising design solutions for energy-efficient
                 memory architecture, especially for mobile systems.
                 However, building commodity MRAM by reusing DRAM
                 designs is not straightforward. The existing memory
                 interfaces are incompatible with MRAM small page size,
                 and they fail to leverage MRAM unique properties,
                 causing unnecessary performance and energy overhead. In
                 this article, we propose four techniques to enable and
                 optimize an LPDDRx-compatible MRAM solution: ComboAS to
                 solve the pin incompatibility; DynLat to avoid
                 unnecessary access latencies; and EarlyPA and BufW to
                 further improve performance by exploiting the MRAM
                 unique features of non-destructive read and independent
                 write path. Combining all these techniques together, we
                 boost the MRAM performance by 17\% and provide a
                 DRAM-compatible MRAM solution consuming 21\% less
                 energy.},
  bibdate =      {Mon Jan 12 11:38:56 MST 2015},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Komuravelli:2015:RCH,
  author =       {Rakesh Komuravelli and Sarita V. Adve and Ching-Tsun
                 Chou},
  title =        {Revisiting the Complexity of Hardware Cache Coherence
                 and Some Implications},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  year =         {2015},
  month =        jan,
  volume =       {11},
  number =       {4},
  articleno =    {37},
  pages =        {37:1--37:??},
  doi =          {https://doi.org/10.1145/2663345},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract =     {Cache coherence is an integral part of shared-memory
                 systems but is also widely considered to be one of the
                 most complex parts of such systems. Much prior work has
                 addressed this complexity and the verification
                 techniques to prove the correctness of hardware
                 coherence. Given the new multicore era with increasing
                 number of cores, there is a renewed debate about
                 whether the complexity of hardware coherence has been
                 tamed or whether it should be abandoned in favor of
                 software coherence. This article revisits the
                 complexity of hardware cache coherence by verifying a
                 publicly available, state-of-the-art implementation of
                 the widely used MESI protocol, using the Mur$ \varphi $
                 model checking tool. To our surprise, we found six bugs
                 in this protocol, most of which were hard to analyze
                 and took several days to fix. To compare the
                 complexity, we also verified the recently proposed
                 DeNovo protocol, which exploits disciplined software
                 programming models. We found three relatively easy to
                 fix bugs in this less mature protocol. After fixing
                 these bugs, our verification experiments showed that,
                 compared to DeNovo, MESI had 15X more reachable states
                 leading to a 20X increase in verification (model
                 checking) time. Although we were eventually successful
                 in verifying the protocols, the tool required making
                 several simplifying assumptions (e.g., two cores, one
                 address). Our results have several implications: (1)
                 they indicate that hardware coherence protocols remain
                 complex; (2) they reinforce the need for protocol
                 designers to embrace formal verification tools to
                 demonstrate correctness of new protocols and
                 extensions; (3) they reinforce the need for formal
                 verification tools that are both scalable and usable by
                 non-expert; and (4) they show that a system based on
                 hardware-software co-design can offer a simpler
                 approach for cache coherence, thus reducing the overall
                 verification effort and allowing verification of more
                 detailed models and protocol extensions that are
                 otherwise limited by computing resources.},
  bibdate =      {Mon Jan 12 11:38:56 MST 2015},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rodriguez:2015:VSR,
  author =       "Gabriel Rodr{\'\i}guez and Juan Touri{\~n}o and Mahmut
                 T. Kandemir",
  title =        "Volatile {STT-RAM} Scratchpad Design and Data
                 Allocation for Low Energy",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "38:1--38:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2669556",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "On-chip power consumption is one of the fundamental
                 challenges of current technology scaling. Cache
                 memories consume a sizable part of this power,
                 particularly due to leakage energy. STT-RAM is one of
                 several new memory technologies that have been proposed
                 in order to improve power while preserving performance.
                 It features high density and low leakage, but at the
                 expense of write energy and performance. This article
                 explores the use of STT-RAM--based scratchpad memories
                 that trade nonvolatility in exchange for faster and
                 less energetically expensive accesses, making them
                 feasible for on-chip implementation in embedded
                 systems. A novel multiretention scratchpad partitioning
                 is proposed, featuring multiple storage spaces with
                 different retention, energy, and performance
                 characteristics. A customized compiler-based allocation
                 algorithm suitable for use with such a scratchpad
                 organization is described. Our experiments indicate
                 that a multiretention STT-RAM scratchpad can provide
                 energy savings of 53\% with respect to an iso-area,
                 hardware-managed SRAM cache.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Camarero:2015:TCH,
  author = {Camarero, Crist{\'o}bal and Vallejo, Enrique and
    Beivide, Ram{\'o}n},
  title = {Topological Characterization of {Hamming} and Dragonfly
    Networks and Its Implications on Routing},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {39},
  pages = {39:1--39:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2677038},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {Current High-Performance Computing (HPC) and data
    center networks rely on large-radix routers. Hamming
    graphs (Cartesian products of complete graphs) and
    dragonflies (two-level direct networks with nodes
    organized in groups) are some direct topologies
    proposed for such networks. The original definition of
    the dragonfly topology is very loose, with several
    degrees of freedom, such as the inter- and intragroup
    topology, the specific global connectivity, and the
    number of parallel links between groups (or trunking
    level). This work provides a comprehensive analysis of
    the topological properties of the dragonfly network,
    providing balancing conditions for network
    dimensioning, as well as introducing and classifying
    several alternatives for the global connectivity and
    trunking level. From a topological study of the
    network, it is noted that a Hamming graph can be seen
    as a canonical dragonfly topology with a high level of
    trunking. Based on this observation and by carefully
    selecting the global connectivity, the Dimension Order
    Routing (DOR) mechanism safely used in Hamming graphs
    is adapted to dragonfly networks with trunking. The
    resulting routing algorithms approximate the
    performance of minimal, nonminimal, and adaptive
    routings typically used in dragonflies but without
    requiring virtual channels to avoid packet deadlock,
    thus allowing for lower cost router implementations.
    This is obtained by properly selecting the link to
    route between groups based on a graph coloring of
    network routers. Evaluations show that the proposed
    mechanisms are competitive with traditional solutions
    when using the same number of virtual channels and
    enable for simpler implementations with lower cost.
    Finally, multilevel dragonflies are discussed,
    considering how the proposed mechanisms could be
    adapted to them.},
}

@article{Yoon:2015:EDM,
  author = {Yoon, Hanbin and Meza, Justin and Muralimanohar, Naveen
    and Jouppi, Norman P. and Mutlu, Onur},
  title = {Efficient Data Mapping and Buffering Techniques for
    Multilevel Cell Phase-Change Memories},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {40},
  pages = {40:1--40:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2669365},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {New phase-change memory (PCM) devices have low-access
    latencies (like DRAM) and high capacities (i.e., low
    cost per bit, like Flash). In addition to being able to
    scale to smaller cell sizes than DRAM, a PCM cell can
    also store multiple bits per cell (referred to as
    multilevel cell, or MLC), enabling even greater
    capacity per bit. However, reading and writing the
    different bits of data from and to an MLC PCM cell
    requires different amounts of time: one bit is read or
    written first, followed by another. Due to this
    asymmetric access process, the bits in an MLC PCM cell
    have different access latency and energy depending on
    which bit in the cell is being read or written. We
    leverage this observation to design a new way to store
    and buffer data in MLC PCM devices. While traditional
    devices couple the bits in each cell next to one
    another in the address space, our key idea is to
    logically decouple the bits in each cell into two
    separate regions depending on their read/write
    characteristics: fast-read/slow-write bits and
    slow-read/fast-write bits. We propose a low-overhead
    hardware/software technique to predict and map data
    that would benefit from being in each region at
    runtime. In addition, we show how MLC bit decoupling
    provides more flexibility in the way data is buffered
    in the device, enabling more efficient use of existing
    device buffer space. Our evaluations for a multicore
    system show that MLC bit decoupling improves system
    performance by 19.2\%, memory energy efficiency by
    14.4\%, and thread fairness by 19.3\% over a
    state-of-the-art MLC PCM system that couples the bits
    in its cells. We show that our results are consistent
    across a variety of workloads and system
    configurations.},
}

@article{Premillieu:2015:EOE,
  author = {Pr{\'e}millieu, Nathanael and Seznec, Andr{\'e}},
  title = {Efficient Out-of-Order Execution of Guarded {ISAs}},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {41},
  pages = {41:1--41:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2677037},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {ARM ISA-based processors are no longer low-cost,
    low-power processors. Nowadays, ARM ISA-based processor
    manufacturers are striving to implement medium-end to
    high-end processor cores, which implies implementing a
    state-of-the-art out-of-order execution engine.
    Unfortunately, providing efficient out-of-order
    execution on legacy ARM codes may be quite challenging
    due to guarded instructions. Predicting the guarded
    instructions addresses the main serialization impact
    associated with guarded instructions execution and the
    multiple definition problem. Moreover, guard prediction
    allows one to use a global branch-and-guard history
    predictor to predict both branches and guards, often
    improving branch prediction accuracy. Unfortunately,
    such a global branch-and-guard history predictor
    requires the systematic use of guard predictions. In
    that case, poor guard prediction accuracy would lead to
    poor overall performance on some applications. Building
    on top of recent advances in branch prediction and
    confidence estimation, we propose a hybrid
    branch-and-guard predictor, combining a global branch
    history component and global branch-and-guard history
    component. The potential gain or loss due to the
    systematic use of guard prediction is dynamically
    evaluated at runtime. Two computing modes are enabled:
    systematic guard prediction use and
    high-confidence-only guard prediction use. Our
    experiments show that on most applications, an
    overwhelming majority of guarded instructions are
    predicted. Therefore, a simple but relatively
    inefficient hardware solution can be used to execute
    the few unpredicted guarded instructions. Significant
    performance benefits are observed on most applications,
    while applications with poorly predictable guards do
    not suffer from performance loss.},
}

@article{Wang:2015:APM,
  author = {Wang, Zheng and Grewe, Dominik and
    O'Boyle, Michael F. P.},
  title = {Automatic and Portable Mapping of Data Parallel Programs
    to {OpenCL} for {GPU}-Based Heterogeneous Systems},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {42},
  pages = {42:1--42:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2677036},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {General-purpose GPU-based systems are highly
    attractive, as they give potentially massive
    performance at little cost. Realizing such potential is
    challenging due to the complexity of programming. This
    article presents a compiler-based approach to
    automatically generate optimized OpenCL code from data
    parallel OpenMP programs for GPUs. A key feature of our
    scheme is that it leverages existing transformations,
    especially data transformations, to improve performance
    on GPU architectures and uses automatic machine
    learning to build a predictive model to determine if it
    is worthwhile running the OpenCL code on the GPU or
    OpenMP code on the multicore host. We applied our
    approach to the entire NAS parallel benchmark suite and
    evaluated it on distinct GPU-based systems. We achieved
    average (up to) speedups of $ 4.51 \times $ and $ 4.20
    \times $ ($ 143 \times $ and $ 67 \times $) on Core
    i7/NVIDIA GeForce GTX580 and Core i7/AMD Radeon 7970
    platforms, respectively, over a sequential baseline.
    Our approach achieves, on average, greater than $ 10
    \times $ speedups over two state-of-the-art automatic
    GPU code generators.},
}

@article{He:2015:IHF,
  author = {He, Dan and Wang, Fang and Jiang, Hong and Feng, Dan and
    Liu, Jing Ning and Tong, Wei and Zhang, Zheng},
  title = {Improving Hybrid {FTL} by Fully Exploiting Internal {SSD}
    Parallelism with Virtual Blocks},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {43},
  pages = {43:1--43:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2677160},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {Compared with either block or page-mapping Flash
    Translation Layer (FTL), hybrid-mapping FTL for flash
    Solid State Disks (SSDs), such as Fully Associative
    Section Translation (FAST), has relatively high space
    efficiency because of its smaller mapping table than
    the latter and higher flexibility than the former. As a
    result, hybrid-mapping FTL has become the most commonly
    used scheme in SSDs. But the hybrid-mapping FTL incurs
    a large number of costly full-merge operations. Thus, a
    critical challenge to hybrid-mapping FTL is how to
    reduce the cost of full-merge operations and improve
    partial merge operations and switch operations. In this
    article, we propose a novel FTL scheme, called Virtual
    Block-based Parallel FAST (VBP-FAST), that divides
    flash area into Virtual Blocks (VBlocks) and Physical
    Blocks (PBlocks) where VBlocks are used to fully
    exploit channel-level, die-level, and plane-level
    parallelism of flash. Leveraging these three levels of
    parallelism, the cost of full merge in VBP-FAST is
    significantly reduced from that of FAST. In the
    meantime, VBP-FAST uses PBlocks to retain the
    advantages of partial merge and switch operations. Our
    extensive trace-driven simulation results show that
    VBP-FAST speeds up FAST by a factor of 5.3--8.4 for
    random workloads and of 1.7 for sequential workloads
    with channel-level, die-level, and plane-level
    parallelism of 8, 2, and 2 (i.e., eight channels, two
    dies, and two planes).},
}

@article{Rubin:2015:MOM,
  author = {Rubin, Eri and Levy, Ely and Barak, Amnon and
    Ben-Nun, Tal},
  title = {{MAPS}: Optimizing Massively Parallel Applications Using
    Device-Level Memory Abstraction},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {44},
  pages = {44:1--44:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2680544},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {GPUs play an increasingly important role in
    high-performance computing. While developing naive code
    is straightforward, optimizing massively parallel
    applications requires deep understanding of the
    underlying architecture. The developer must struggle
    with complex index calculations and manual memory
    transfers. This article classifies memory access
    patterns used in most parallel algorithms, based on
    Berkeley's Parallel ``Dwarfs.'' It then proposes the
    MAPS framework, a device-level memory abstraction that
    facilitates memory access on GPUs, alleviating complex
    indexing using on-device containers and iterators. This
    article presents an implementation of MAPS and shows
    that its performance is comparable to carefully
    optimized implementations of real-world applications.},
}

@article{Cilardo:2015:IMM,
  author = {Cilardo, Alessandro and Gallo, Luca},
  title = {Improving Multibank Memory Access Parallelism with
    Lattice-Based Partitioning},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {45},
  pages = {45:1--45:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2675359},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {Emerging architectures, such as reconfigurable
    hardware platforms, provide the unprecedented
    opportunity of customizing the memory infrastructure
    based on application access patterns. This work
    addresses the problem of automated memory partitioning
    for such architectures, taking into account potentially
    parallel data accesses to physically independent banks.
    Targeted at affine static control parts (SCoPs), the
    technique relies on the Z-polyhedral model for program
    analysis and adopts a partitioning scheme based on
    integer lattices. The approach enables the definition
    of a solution space including previous works as
    particular cases. The problem of minimizing the total
    amount of memory required across the partitioned banks,
    referred to as storage minimization throughout the
    article, is tackled by an optimal approach yielding
    asymptotically zero memory waste or, as an alternative,
    an efficient approach ensuring arbitrarily small waste.
    The article also presents a prototype toolchain and a
    detailed step-by-step case study demonstrating the
    impact of the proposed technique along with extensive
    comparisons with alternative approaches in the
    literature.},
}

@article{Martinsen:2015:EPT,
  author = {Martinsen, Jan Kasper and Grahn, H{\aa}kan and
    Isberg, Anders},
  title = {The Effects of Parameter Tuning in Software Thread-Level
    Speculation in {JavaScript} Engines},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {46},
  pages = {46:1--46:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2686036},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {JavaScript is a sequential programming language that
    has a large potential for parallel execution in Web
    applications. Thread-level speculation can take
    advantage of this, but it has a large memory overhead.
    In this article, we evaluate the effects of adjusting
    various parameters for thread-level speculation. Our
    results clearly show that thread-level speculation is a
    useful technique for taking advantage of multicore
    architectures for JavaScript in Web applications, that
    nested speculation is required in thread-level
    speculation, and that the execution characteristics of
    Web applications significantly reduce the needed
    memory, the number of threads, and the depth of our
    speculation.},
}

@article{Colombet:2015:SOS,
  author = {Colombet, Quentin and Brandner, Florian and
    Darte, Alain},
  title = {Studying Optimal Spilling in the Light of {SSA}},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  ajournal = {ACM Trans. Archit. Code Optim.},
  volume = {11},
  number = {4},
  articleno = {47},
  pages = {47:1--47:??},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  doi = {https://doi.org/10.1145/2685392},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate = {Mon Jan 12 11:38:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract = {Recent developments in register allocation, mostly
    linked to static single assignment (SSA) form, have
    shown the benefits of decoupling the problem in two
    phases: a first spilling phase places load and store
    instructions so that the register pressure at all
    program points is small enough, and a second assignment
    and coalescing phase maps the variables to physical
    registers and reduces the number of move instructions
    among registers. This article focuses on the first
    phase, for which many open questions remain: in
    particular, we study the notion of optimal spilling
    (what can be expressed?) and the impact of SSA form
    (does it help?). To identify the important features for
    optimal spilling on load-store architectures, we
    develop a new integer linear programming formulation,
    more accurate and expressive than previous approaches.
    Among other features, we can express SSA $ \phi
    $-functions, memory-to-memory copies, and the fact that
    a value can be stored simultaneously in a register and
    in memory. Based on this formulation, we present a
    thorough analysis of the results obtained for the
    SPECINT 2000 and EEMBC 1.1 benchmarks, from which we
    draw, among others, the following conclusions: (1)
    rematerialization is extremely important; (2) SSA
    complicates the formulation of optimal spilling,
    especially because of memory coalescing when the code
    is not in conventional SSA (CSSA); (3)
    microarchitectural features are significant and thus
    have to be accounted for; and (4) significant savings
    can be obtained in terms of static spill costs, cache
    miss rates, and dynamic instruction counts.},
}

@Article{Haj-Yihia:2015:CDP,
  author =       "Jawad Haj-Yihia and Yosi {Ben Asher} and Efraim Rotem
                 and Ahmad Yasin and Ran Ginosar",
  title =        "Compiler-Directed Power Management for Superscalars",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "48:1--48:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2685393",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern superscalar CPUs contain large complex
                 structures and diverse execution units, consuming wide
                 dynamic power range. Building a power delivery network
                 for the worst-case power consumption is not energy
                 efficient and often is impossible to fit in small
                 systems. Instantaneous power excursions can cause
                 voltage droops. Power management algorithms are too
                 slow to respond to instantaneous events. In this
                 article, we propose a novel compiler-directed framework
                 to address this problem. The framework is validated on
                 a 4th Generation Intel\reg{} Core\texttrademark{}
                 processor and with simulator on output trace. Up to
                 16\% performance speedup is measured over baseline for
                 the SPEC CPU2006 benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Trinh:2015:EDE,
  author =       "Hong-Phuc Trinh and Marc Duranton and Michel
                 Paindavoine",
  title =        "Efficient Data Encoding for Convolutional Neural
                 Network application",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "49:1--49:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2685394",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents an approximate data encoding
                 scheme called Significant Position Encoding (SPE). The
                 encoding allows efficient implementation of the recall
                 phase (forward propagation pass) of Convolutional
                 Neural Networks (CNN)---a typical Feed-Forward Neural
                 Network. This implementation uses only 7 bits data
                 representation and achieves almost the same
                 classification performance compared with the initial
                 network: on MNIST handwriting recognition task, using
                 this data encoding scheme losses only 0.03\% in terms
                 of recognition rate (99.27\% vs. 99.3\%). In terms of
                 storage, we achieve a 12.5\% gain compared with an 8
                 bits fixed-point implementation of the same CNN.
                 Moreover, this data encoding allows efficient
                 implementation of processing unit thanks to the
                 simplicity of scalar product operation---the principal
                 operation in a Feed-Forward Neural Network.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Breugh:2015:MAM,
  author =       "Maximilien B. Breugh and Stijn Eyerman and Lieven
                 Eeckhout",
  title =        "Mechanistic Analytical Modeling of Superscalar
                 In-Order Processor Performance",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "50:1--50:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2678277",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Superscalar in-order processors form an interesting
                 alternative to out-of-order processors because of their
                 energy efficiency and lower design complexity. However,
                 despite the reduced design complexity, it is nontrivial
                 to get performance estimates or insight in the
                 application--microarchitecture interaction without
                 running slow, detailed cycle-level simulations, because
                 performance highly depends on the order of instructions
                 within the application's dynamic instruction stream, as
                 in-order processors stall on interinstruction
                 dependences and functional unit contention. To limit
                 the number of detailed cycle-level simulations needed
                 during design space exploration, we propose a
                 mechanistic analytical performance model that is built
                 from understanding the internal mechanisms of the
                 processor. The mechanistic performance model for
                 superscalar in-order processors is shown to be accurate
                 with an average performance prediction error of 3.2\%
                 compared to detailed cycle-accurate simulation using
                 gem5. We also validate the model against hardware,
                 using the ARM Cortex-A8 processor and show that it is
                 accurate within 10\% on average. We further demonstrate
                 the usefulness of the model through three case studies:
                 (1) design space exploration, identifying the optimum
                 number of functional units for achieving a given
                 performance target; (2) program--machine interactions,
                 providing insight into microarchitecture bottlenecks;
                 and (3) compiler--architecture interactions,
                 visualizing the impact of compiler optimizations on
                 performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Seshadri:2015:MPC,
  author =       "Vivek Seshadri and Samihan Yedkar and Hongyi Xin and
                 Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch
                 and Todd C. Mowry",
  title =        "Mitigating Prefetcher-Caused Pollution Using Informed
                 Caching Policies for Prefetched Blocks",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "51:1--51:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2677956",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many modern high-performance processors prefetch
                 blocks into the on-chip cache. Prefetched blocks can
                 potentially pollute the cache by evicting more useful
                 blocks. In this work, we observe that both accurate and
                 inaccurate prefetches lead to cache pollution, and
                 propose a comprehensive mechanism to mitigate
                 prefetcher-caused cache pollution. First, we observe
                 that over 95\% of useful prefetches in a wide variety
                 of applications are not reused after the first demand
                 hit (in secondary caches). Based on this observation,
                 our first mechanism simply demotes a prefetched block
                 to the lowest priority on a demand hit. Second, to
                 address pollution caused by inaccurate prefetches, we
                 propose a self-tuning prefetch accuracy predictor to
                 predict if a prefetch is accurate or inaccurate. Only
                 predicted-accurate prefetches are inserted into the
                 cache with a high priority. Evaluations show that our
                 final mechanism, which combines these two ideas,
                 significantly improves performance compared to both the
                 baseline LRU policy and two state-of-the-art approaches
                 to mitigating prefetcher-caused cache pollution (up to
                 49\%, and 6\% on average for 157 two-core
                 multiprogrammed workloads). The performance improvement
                 is consistent across a wide variety of system
                 configurations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Matheou:2015:ASD,
  author =       "George Matheou and Paraskevas Evripidou",
  title =        "Architectural Support for Data-Driven Execution",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686874",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The exponential growth of sequential processors has
                 come to an end, and thus, parallel processing is
                 probably the only way to achieve performance growth. We
                 propose the development of parallel architectures based
                 on data-driven scheduling. Data-driven scheduling
                 enforces only a partial ordering as dictated by the
                 true data dependencies, which is the minimum
                 synchronization possible. This is very beneficial for
                 parallel processing because it enables it to exploit
                 the maximum possible parallelism. We provide
                 architectural support for data-driven execution for the
                 Data-Driven Multithreading (DDM) model. In the past,
                 DDM has been evaluated mostly in the form of virtual
                 machines. The main contribution of this work is the
                 development of a highly efficient hardware support for
                 data-driven execution and its integration into a
                 multicore system with eight cores on a Virtex-6 FPGA.
                 The DDM semantics make barriers and cache coherence
                 unnecessary, which reduces the synchronization
                 latencies significantly and makes the cache simpler.
                 The performance evaluation has shown that the support
                 for data-driven execution is very efficient with
                 negligible overheads. Our prototype can support very
                 small problem sizes (matrix $ 16 \times 16$) and
                 ultra-lightweight threads (block of $ 4 \times 4$) that
                 achieve speedups close to linear. Such results cannot
                 be achieved by software-based systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Morad:2015:GSP,
  author =       "Amir Morad and Leonid Yavits and Ran Ginosar",
  title =        "{GP-SIMD} Processing-in-Memory",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686875",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GP-SIMD, a novel hybrid general-purpose SIMD computer
                 architecture, resolves the issue of data
                 synchronization by in-memory computing through
                 combining data storage and massively parallel
                 processing. GP-SIMD employs a two-dimensional access
                 memory with modified SRAM storage cells and a
                 bit-serial processing unit per each memory row. An
                 analytic performance model of the GP-SIMD architecture
                 is presented, comparing it to associative processor and
                 to conventional SIMD architectures. Cycle-accurate
                 simulation of four workloads supports the analytical
                 comparison. Assuming a moderate die area, GP-SIMD
                 architecture outperforms both the associative processor
                 and conventional SIMD coprocessor architectures by
                 almost an order of magnitude while consuming less
                 power.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Schaub:2015:ISW,
  author =       "Thomas Schaub and Simon Moll and Ralf Karrenberg and
                 Sebastian Hack",
  title =        "The Impact of the {SIMD} Width on Control-Flow and
                 Memory Divergence",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687355",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Power consumption is a prevalent issue in current and
                 future computing systems. SIMD processors amortize the
                 power consumption of managing the instruction stream by
                 executing the same instruction in parallel on multiple
                 data. Therefore, in the past years, the SIMD width has
                 steadily increased, and it is not unlikely that it will
                 continue to do so. In this article, we experimentally
                 study the influence of the SIMD width to the execution
                 of data-parallel programs. We investigate how an
                 increasing SIMD width (up to 1024) influences
                 control-flow divergence and memory-access divergence,
                 and how well techniques to mitigate them will work on
                 larger SIMD widths. We perform our study on 76 OpenCL
                 applications and show that a group of programs scales
                 well up to SIMD width 1024, whereas another group of
                 programs increasingly suffers from control-flow
                 divergence. For those programs, thread regrouping
                 techniques may become increasingly important for larger
                 SIMD widths. We show what average speedups can be
                 expected when increasing the SIMD width. For example,
                 when switching from scalar execution to SIMD width 64,
                 one can expect a speedup of 60.11, which increases to
                 62.46 when using thread regrouping. We also analyze the
                 frequency of regular (uniform, consecutive) memory
                 access patterns and observe a monotonic decrease of
                 regular memory accesses from 82.6 at SIMD width 4 to
                 43.1\% at SIMD width 1024.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Fang:2015:MMD,
  author =       "Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and
                 Antonia Zhai and James Greensky and Gautham Beeraka and
                 Binyu Zang",
  title =        "Measuring Microarchitectural Details of Multi- and
                 Many-Core Memory Systems through Microbenchmarking",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687356",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As multicore and many-core architectures evolve, their
                 memory systems are becoming increasingly more complex.
                 To bridge the latency and bandwidth gap between the
                 processor and memory, they often use a mix of
                 multilevel private/shared caches that are either
                 blocking or nonblocking and are connected by high-speed
                 network-on-chip. Moreover, they also incorporate
                 hardware and software prefetching and simultaneous
                 multithreading (SMT) to hide memory latency. On such
                 multi- and many-core systems, to incorporate various
                 memory optimization schemes using compiler
                 optimizations and performance tuning techniques, it is
                 crucial to have microarchitectural details of the
                 target memory system. Unfortunately, such details are
                 often unavailable from vendors, especially for newly
                 released processors. In this article, we propose a
                 novel microbenchmarking methodology based on short
                 elapsed-time events (SETEs) to obtain comprehensive
                 memory microarchitectural details in multi- and
                 many-core processors. This approach requires detailed
                 analysis of potential interfering factors that could
                 affect the intended behavior of such memory systems. We
                 lay out effective guidelines to control and mitigate
                 those interfering factors. Taking the impact of SMT
                 into consideration, our proposed methodology not only
                 can measure traditional cache/memory latency and
                 off-chip bandwidth but also can uncover the details of
                 software and hardware prefetching units not attempted
                 in previous studies. Using the newly released Intel
                 Xeon Phi many-core processor (with in-order cores) as
                 an example, we show how we can use a set of
                 microbenchmarks to determine various microarchitectural
                 features of its memory system (many are undocumented
                 from vendors). To demonstrate the portability and
                 validate the correctness of such a methodology, we use
                 the well-documented Intel Sandy Bridge multicore
                 processor (with out-of-order cores) as another example,
                 where most data are available and can be validated.
                 Moreover, to illustrate the usefulness of the measured
                 data, we do a multistage coordinated data prefetching
                 case study on both Xeon Phi and Sandy Bridge and show
                 that by using the measured data, we can achieve 1.3X
                 and 1.08X performance speedup, respectively, compared
                 to the state-of-the-art Intel ICC compiler. We believe
                 that these measurements also provide useful insights
                 into memory optimization, analysis, and modeling of
                 such multicore and many-core architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chi:2015:LPH,
  author =       "Chi Ching Chi and Mauricio Alvarez-Mesa and Ben
                 Juurlink",
  title =        "Low-Power High-Efficiency Video Decoding using
                 General-Purpose Processors",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "56:1--56:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2685551",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article, we investigate how code optimization
                 techniques and low-power states of general-purpose
                 processors improve the power efficiency of HEVC
                 decoding. The power and performance efficiency of the
                 use of SIMD instructions, multicore architectures, and
                 low-power active and idle states are analyzed in detail
                 for offline video decoding. In addition, the power
                 efficiency of techniques such as ``race to idle'' and
                 ``exploiting slack'' with DVFS are evaluated for
                 real-time video decoding. Results show that
                 ``exploiting slack'' is more power efficient than
                 ``race to idle'' for all evaluated platforms
                 representing smartphone, tablet, laptop, and desktop
                 computing systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Luporini:2015:CLO,
  author =       "Fabio Luporini and Ana Lucia Varbanescu and Florian
                 Rathgeber and Gheorghe-Teodor Bercea and J. Ramanujam
                 and David A. Ham and Paul H. J. Kelly",
  title =        "Cross-Loop Optimization of Arithmetic Intensity for
                 Finite Element Local Assembly",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "57:1--57:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687415",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We study and systematically evaluate a class of
                 composable code transformations that improve arithmetic
                 intensity in local assembly operations, which represent
                 a significant fraction of the execution time in finite
                 element methods. Their performance optimization is
                 indeed a challenging issue. Even though affine loop
                 nests are generally present, the short trip counts and
                 the complexity of mathematical expressions, which vary
                 among different problems, make it hard to determine an
                 optimal sequence of successful transformations. Our
                 investigation has resulted in the implementation of a
                 compiler (called COFFEE) for local assembly kernels,
                 fully integrated with a framework for developing finite
                 element methods. The compiler manipulates abstract
                 syntax trees generated from a domain-specific language
                 by introducing domain-aware optimizations for
                 instruction-level parallelism and register locality.
                 Eventually, it produces C code including vector SIMD
                 intrinsics. Experiments using a range of real-world
                 finite element problems of increasing complexity show
                 that significant performance improvement is achieved.
                 The generality of the approach and the applicability of
                 the proposed code transformations to other domains is
                 also discussed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhou:2015:OPS,
  author =       "Xing Zhou and Mar{\'\i}a J. Garzar{\'a}n and David A.
                 Padua",
  title =        "Optimal Parallelogram Selection for Hierarchical
                 Tiling",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "58:1--58:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687414",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Loop tiling is an effective optimization to improve
                 performance of multiply nested loops, which are the
                 most time-consuming parts in many programs. Most
                 massively parallel systems today are organized
                 hierarchically, and different levels of the hierarchy
                 differ in the organization of parallelism and the
                 memory models they adopt. To make better use of these
                 machines, it is clear that loop nests should be tiled
                 hierarchically to fit the hierarchical organization of
                 the machine; however, it is not so clear what should be
                 the exact form of these hierarchical tiles. In
                 particular, tile shape selection is of critical
                 importance to expose parallelism of the tiled loop
                 nests. Although loop tiling is a well-known
                 optimization, not much is known about tile shape
                 selection. In this article, we study tile shape
                 selection when the shapes are any type of
                 parallelograms and introduce a model to relate the tile
                 shape of the hierarchy to the execution time. Using
                 this model, we implement a system that automatically
                 finds the tile shapes that minimize the execution time
                 in a hierarchical system. Our experimental results show
                 that in several cases, the tiles automatically selected
                 by our system outperform the most intuitive tiling
                 schemes usually adopted by programmers because of their
                 simplicity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Porter:2015:MMS,
  author =       "Leo Porter and Michael A. Laurenzano and Ananta Tiwari
                 and Adam Jundt and William A. {Ward, Jr.} and Roy
                 Campbell and Laura Carrington",
  title =        "Making the Most of {SMT} in {HPC}: System- and
                 Application-Level Perspectives",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "59:1--59:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687651",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This work presents an end-to-end methodology for
                 quantifying the performance and power benefits of
                 simultaneous multithreading (SMT) for HPC centers and
                 applies this methodology to a production system and
                 workload. Ultimately, SMT's value system-wide depends
                 on whether users effectively employ SMT at the
                 application level. However, predicting SMT's benefit
                 for HPC applications is challenging; by doubling the
                 number of threads, the application's characteristics
                 may change. This work proposes statistical modeling
                 techniques to predict the speedup SMT confers to HPC
                 applications. This approach, accurate to within 8\%,
                 uses only lightweight, transparent performance monitors
                 collected during a single run of the application.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tong:2015:OMT,
  author =       "Xin Tong and Toshihiko Koju and Motohiro Kawahito and
                 Andreas Moshovos",
  title =        "Optimizing Memory Translation Emulation in Full System
                 Emulators",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "60:1--60:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686034",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The emulation speed of a full system emulator (FSE)
                 determines its usefulness. This work quantitatively
                 measures where time is spent in QEMU [Bellard 2005], an
                 industrial-strength FSE. The analysis finds that memory
                 emulation is one of the most heavily exercised emulator
                 components. For workloads studied, 38.1\% of the
                 emulation time is spent in memory emulation on average,
                 even though QEMU implements a software translation
                 lookaside buffer (STLB) to accelerate dynamic address
                 translation. Despite the amount of time spent in memory
                 emulation, there has been no study on how to further
                 improve its speed. This work analyzes where time is
                 spent in memory emulation and studies the performance
                 impact of a number of STLB optimizations. Although
                 there are several performance optimization techniques
                 for hardware TLBs, this work finds that the trade-offs
                 with an STLB are quite different compared to those with
                 hardware TLBs. As a result, not all hardware TLB
                 performance optimization techniques are applicable to
                 STLBs and vice versa. The evaluated STLB optimizations
                 target STLB lookups, as well as refills, and result in
                 an average emulator performance improvement of 24.4\%
                 over the baseline.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kong:2015:CRF,
  author =       "Martin Kong and Antoniu Pop and Louis-No{\"e}l Pouchet
                 and R. Govindarajan and Albert Cohen and P.
                 Sadayappan",
  title =        "Compiler\slash Runtime Framework for Dynamic Dataflow
                 Parallelization of Tiled Programs",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "61:1--61:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687652",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Task-parallel languages are increasingly popular. Many
                 of them provide expressive mechanisms for intertask
                 synchronization. For example, OpenMP 4.0 will integrate
                 data-driven execution semantics derived from the StarSs
                 research language. Compared to the more restrictive
                 data-parallel and fork-join concurrency models, the
                 advanced features being introduced into task-parallel
                 models in turn enable improved scalability through load
                 balancing, memory latency hiding, mitigation of the
                 pressure on memory bandwidth, and, as a side effect,
                 reduced power consumption. In this article, we develop
                 a systematic approach to compile loop nests into
                 concurrent, dynamically constructed graphs of dependent
                 tasks. We propose a simple and effective heuristic that
                 selects the most profitable parallelization idiom for
                 every dependence type and communication pattern. This
                 heuristic enables the extraction of interband
                 parallelism (cross-barrier parallelism) in a number of
                 numerical computations that range from linear algebra
                 to structured grids and image processing. The proposed
                 static analysis and code generation alleviates the
                 burden of a full-blown dependence resolver to track the
                 readiness of tasks at runtime. We evaluate our approach
                 and algorithms in the PPCG compiler, targeting
                 OpenStream, a representative dataflow task-parallel
                 language with explicit intertask dependences and a
                 lightweight runtime. Experimental results demonstrate
                 the effectiveness of the approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Melot, Kessler, Keller, Eitschberger: TACO volume 11, number 4,
%%% article 62 (January 2015).  The unquoted values j-TACO, jan, and
%%% ack-nhfb are string-macro references; j-TACO and ack-nhfb are
%%% presumably defined earlier in this file (not visible here; confirm).
%%% The third author's given name carries a braced TeX umlaut accent.
@Article{Melot:2015:FCS,
  author =       "Nicolas Melot and Christoph Kessler and J{\"o}rg
                 Keller and Patrick Eitschberger",
  title =        "Fast Crown Scheduling Heuristics for Energy-Efficient
                 Mapping and Scaling of Moldable Streaming Tasks on
                 Manycore Systems",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "62:1--62:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687653",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Exploiting effectively massively parallel
                 architectures is a major challenge that stream
                 programming can help facilitate. We investigate the
                 problem of generating energy-optimal code for a
                 collection of streaming tasks that include
                 parallelizable or moldable tasks on a generic manycore
                 processor with dynamic discrete frequency scaling.
                 Streaming task collections differ from classical task
                 sets in that all tasks are running concurrently, so
                 that cores typically run several tasks that are
                 scheduled round-robin at user level in a data-driven
                 way. A stream of data flows through the tasks and
                 intermediate results may be forwarded to other tasks,
                 as in a pipelined task graph. In this article, we
                 consider crown scheduling, a novel technique for the
                 combined optimization of resource allocation, mapping,
                 and discrete voltage/frequency scaling for moldable
                 streaming task collections in order to optimize energy
                 efficiency given a throughput constraint. We first
                 present optimal offline algorithms for separate and
                 integrated crown scheduling based on integer linear
                 programming (ILP). We make no restricting assumption
                 about speedup behavior. We introduce the fast heuristic
                 Longest Task, Lowest Group (LTLG) as a generalization
                 of the Longest Processing Time (LPT) algorithm to
                 achieve a load-balanced mapping of parallel tasks, and
                 the Height heuristic for crown frequency scaling. We
                 use them in feedback loop heuristics based on binary
                 search and simulated annealing to optimize crown
                 allocation. Our experimental evaluation of the ILP
                 models for a generic manycore architecture shows that
                 at least for small and medium-sized streaming task
                 collections even the integrated variant of crown
                 scheduling can be solved to optimality by a
                 state-of-the-art ILP solver within a few seconds. Our
                 heuristics produce makespan and energy consumption
                 close to optimality within the limits of the
                 phase-separated crown scheduling technique and the
                 crown structure. Their optimization time is longer than
                 the one of other algorithms we test, but our heuristics
                 consistently produce better solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Ruan, Liu, Spear: TACO volume 11, number 4, article 63 (January
%%% 2015).  The abstract marks quoted text with paired backticks and
%%% apostrophes (TeX quote ligatures); a bare double-quote character at
%%% brace level zero would terminate the quote-delimited field value.
@Article{Ruan:2015:TRM,
  author =       "Wenjia Ruan and Yujie Liu and Michael Spear",
  title =        "Transactional Read-Modify-Write Without Aborts",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "63:1--63:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2688904",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Language-level transactions are said to provide
                 ``atomicity,'' implying that the order of operations
                 within a transaction should be invisible to concurrent
                 transactions and thus that independent operations
                 within a transaction should be safe to execute in any
                 order. In this article, we present a mechanism for
                 dynamically reordering memory operations within a
                 transaction so that read-modify-write operations on
                 highly contended locations can be delayed until the
                 very end of the transaction. When integrated with
                 traditional transactional conflict detection
                 mechanisms, our approach reduces aborts on hot memory
                 locations, such as statistics counters, thereby
                 improving throughput and reducing wasted work. We
                 present three algorithms for delaying highly contended
                 read-modify-write operations within transactions, and
                 we evaluate their impact on throughput for eager and
                 lazy transactional systems across multiple workloads.
                 We also discuss complications that arise from the
                 interaction between our mechanism and the need for
                 strong language-level semantics, and we propose
                 algorithmic extensions that prevent errors from
                 occurring when accesses are aggressively reordered in a
                 transactional memory implementation with weak
                 semantics.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Ul Huda, Jannesari, Wolf: TACO volume 11, number 4, article 64
%%% (January 2015).  The first author's two-word surname is enclosed in
%%% braces so that BibTeX name parsing keeps Ul Huda together as the
%%% last name instead of treating Ul as part of the given name.
@Article{UlHuda:2015:UTM,
  author =       "Zia {Ul Huda} and Ali Jannesari and Felix Wolf",
  title =        "Using Template Matching to Infer Parallel Design
                 Patterns",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "64:1--64:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2688905",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The triumphant spread of multicore processors over the
                 past decade increases the pressure on software
                 developers to exploit the growing amount of parallelism
                 available in the hardware. However, writing parallel
                 programs is generally challenging. For sequential
                 programs, the formulation of design patterns marked a
                 turning point in software development, boosting
                 programmer productivity and leading to more reusable
                 and maintainable code. While the literature is now also
                 reporting a rising number of parallel design patterns,
                 programmers confronted with the task of parallelizing
                 an existing sequential program still struggle with the
                 question of which parallel pattern to apply where in
                 their code. In this article, we show how template
                 matching, a technique traditionally used in the
                 discovery of sequential design patterns, can also be
                 used to support parallelization decisions. After
                 looking for matches in a previously extracted dynamic
                 dependence graph, we classify code blocks of the input
                 program according to the structure of the parallel
                 patterns we find. Based on this information, the
                 programmer can easily implement the detected pattern
                 and create a parallel version of his or her program. We
                 tested our approach with six programs, in which we
                 successfully detected pipeline and do-all patterns.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Litz, Dias, Cheriton: TACO volume 11, number 4, article 65 (January
%%% 2015).  The ISSN field lists both the print and the electronic
%%% number; ISSN-L repeats the print number.  CODEN holds only question
%%% marks, presumably because no CODEN is known for this journal.
@Article{Litz:2015:ECA,
  author =       "Heiner Litz and Ricardo J. Dias and David R.
                 Cheriton",
  title =        "Efficient Correction of Anomalies in Snapshot
                 Isolation Transactions",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "65:1--65:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2693260",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Transactional memory systems providing snapshot
                 isolation enable concurrent access to shared data
                 without incurring aborts on read-write conflicts.
                 Reducing aborts is extremely relevant as it leads to
                 higher concurrency, greater performance, and better
                 predictability. Unfortunately, snapshot isolation does
                 not provide serializability as it allows certain
                 anomalies that can lead to subtle consistency
                 violations. While some mechanisms have been proposed to
                 verify the correctness of a program utilizing snapshot
                 isolation transactions, it remains difficult to repair
                 incorrect applications. To reduce the programmer's
                 burden in this case, we present a technique based on
                 dynamic code and graph dependency analysis that
                 automatically corrects existing snapshot isolation
                 anomalies in transactional memory programs. Our
                 evaluation shows that corrected applications retain the
                 performance benefits characteristic of snapshot
                 isolation over conventional transactional memory
                 systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "65",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Bahmann, Reissmann, Jahre, Meyer: TACO volume 11, number 4, article
%%% 66 (January 2015).  The pages value has the form
%%% article:first--article:last and ends in ??, presumably a placeholder
%%% for a final page number that was not recorded; the article number is
%%% also stored on its own in the articleno field.
@Article{Bahmann:2015:PRC,
  author =       "Helge Bahmann and Nico Reissmann and Magnus Jahre and
                 Jan Christian Meyer",
  title =        "Perfect Reconstructability of Control Flow from Demand
                 Dependence Graphs",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "66:1--66:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2693261",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Demand-based dependence graphs (DDGs), such as the
                 (Regionalized) Value State Dependence Graph ((R)VSDG),
                 are intermediate representations (IRs) well suited for
                 a wide range of program transformations. They
                 explicitly model the flow of data and state, and only
                 implicitly represent a restricted form of control flow.
                 These features make DDGs especially suitable for
                 automatic parallelization and vectorization, but cannot
                 be leveraged by practical compilers without efficient
                 construction and destruction algorithms. Construction
                 algorithms remodel the arbitrarily complex control flow
                 of a procedure to make it amenable to DDG
                 representation, whereas destruction algorithms
                 reestablish control flow for generating efficient
                 object code. Existing literature presents solutions to
                 both problems, but these impose structural constraints
                 on the generatable control flow, and omit qualitative
                 evaluation. The key contribution of this article is to
                 show that there is no intrinsic structural limitation
                 in the control flow directly extractable from RVSDGs.
                 This fundamental result originates from an
                 interpretation of loop repetition and decision
                 predicates as computed continuations, leading to the
                 introduction of the predicate continuation normal form.
                 We provide an algorithm for constructing RVSDGs in
                 predicate continuation form, and propose a novel
                 destruction algorithm for RVSDGs in this form. Our
                 destruction algorithm can generate arbitrarily complex
                 control flow; we show this by proving that the original
                 CFG an RVSDG was derived from can, apart from
                 overspecific detail, be reconstructed perfectly.
                 Additionally, we prove termination and correctness of
                 these algorithms. Furthermore, we empirically evaluate
                 the performance, the representational overhead at
                 compile time, and the reduction in branch instructions
                 compared to existing solutions. In contrast to previous
                 work, our algorithms impose no additional overhead on
                 the control flow of the produced object code. To our
                 knowledge, this is the first scheme that allows the
                 original control flow of a procedure to be recovered
                 from a DDG representation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "66",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Elango and six coauthors: TACO volume 11, number 4, article 67
%%% (January 2015).  Author names are separated by the word and; the
%%% fourth author's hyphenated given name contains a braced TeX
%%% diaeresis accent.  The abstract uses paired backticks and
%%% apostrophes for quotation marks.
@Article{Elango:2015:URM,
  author =       "Venmugil Elango and Naser Sedaghati and Fabrice
                 Rastello and Louis-No{\"e}l Pouchet and J. Ramanujam
                 and Radu Teodorescu and P. Sadayappan",
  title =        "On Using the Roofline Model with Lower Bounds on Data
                 Movement",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "67:1--67:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2693656",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The roofline model is a popular approach for ``bound
                 and bottleneck'' performance analysis. It focuses on
                 the limits to the performance of processors because of
                 limited bandwidth to off-chip memory. It models upper
                 bounds on performance as a function of operational
                 intensity, the ratio of computational operations per
                 byte of data moved from/to memory. While operational
                 intensity can be directly measured for a specific
                 implementation of an algorithm on a particular target
                 platform, it is of interest to obtain broader insights
                 on bottlenecks, where various semantically equivalent
                 implementations of an algorithm are considered, along
                 with analysis for variations in architectural
                 parameters. This is currently very cumbersome and
                 requires performance modeling and analysis of many
                 variants. In this article, we address this problem by
                 using the roofline model in conjunction with upper
                 bounds on the operational intensity of computations as
                 a function of cache capacity, derived from lower bounds
                 on data movement. This enables bottleneck analysis that
                 holds across all dependence-preserving semantically
                 equivalent implementations of an algorithm. We
                 demonstrate the utility of the approach in assessing
                 fundamental limits to performance and energy efficiency
                 for several benchmark algorithms across a design space
                 of architectural variations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "67",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Front/back-matter item: TACO volume 11, number 4, article 68
%%% (January 2015).  The author is given as the literal name Anonymous
%%% and the entry has no abstract field.  The braces in the title keep
%%% the capitalization of ACM TACO 2014 under styles that downcase
%%% titles.
@Article{Anonymous:2015:LDR,
  author =       "Anonymous",
  title =        "List of Distinguished Reviewers {ACM TACO 2014}",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "68:1--68:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2714082",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "68",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Zimmer, Mueller: TACO volume 12, number 1, article 1 (April 2015).
%%% The braces around NoCMsg in the title preserve its mixed case under
%%% styles that downcase titles.  In the abstract, percent signs are
%%% escaped with a backslash for TeX, and the doubled hyphen in
%%% non--flow-controlled typesets as an en dash.
@Article{Zimmer:2015:NSM,
  author =       "Christopher Zimmer and Frank Mueller",
  title =        "{NoCMsg}: a Scalable Message-Passing Abstraction for
                 Network-on-Chips",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2701426",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The number of cores of contemporary processors is
                 constantly increasing and thus continues to deliver
                 ever higher peak performance (following Moore's
                 transistor law). Yet high core counts present a
                 challenge to hardware and software alike. Following
                 this trend, the network-on-chip (NoC) topology has
                 changed from buses over rings and fully connected
                 meshes to 2D meshes. This work contributes NoCMsg, a
                 low-level message-passing abstraction over NoCs, which
                 is specifically designed for large core counts in 2D
                 meshes. NoCMsg ensures deadlock-free messaging for
                 wormhole Manhattan-path routing over the NoC via a
                 polling-based message abstraction and
                 non--flow-controlled communication for selective
                 communication patterns. Experimental results on the
                 TilePro hardware platform show that NoCMsg can
                 significantly reduce communication times by up to 86\%
                 for single packet messages and up to 40\% for larger
                 messages compared to other NoC-based message
                 approaches. On the TilePro platform, NoCMsg outperforms
                 shared memory abstractions by up to 93\% as core counts
                 and interprocess communication increase. Results for
                 fully pipelined double-precision numerical codes show
                 speedups of up to 64\% for message passing over shared
                 memory at 32 cores. Overall, we observe that shared
                 memory scales up to about 16 cores on this platform,
                 whereas message passing performs well beyond that
                 threshold. These results generalize to similar
                 NoC-based platforms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Grigorian, Reinman: TACO volume 12, number 1, article 2 (April
%%% 2015).  The acronym SIMD is braced in the title so that styles which
%%% downcase titles leave it in capitals.  The abstract writes the
%%% multiplication sign in TeX math mode (dollar-delimited \times) and
%%% escapes percent signs with a backslash.
@Article{Grigorian:2015:ADA,
  author =       "Beayna Grigorian and Glenn Reinman",
  title =        "Accelerating Divergent Applications on {SIMD}
                 Architectures Using Neural Networks",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717311",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The purpose of this research is to find a
                 neural-network-based solution to the well-known problem
                 of branch divergence in Single Instruction Multiple
                 Data (SIMD) architectures. Our approach differs from
                 existing techniques that handle branch (or
                 control-flow) divergence, which use costly hardware
                 modifications, low-utilization masking techniques, or
                 static prediction methods. As we examine divergent
                 applications, we characterize the degree of
                 data-dependent control flow seen in each and isolate
                 the code regions (or ``kernels'') that cause the most
                 performance degradation due to branch divergence. We
                 then train neural networks (NNs) offline to approximate
                 these kernels and inject the NN computations directly
                 into the applications as substitutes for the kernels
                 they approximate. This essentially translates control
                 flow into nondivergent computation, trading off
                 precision for performance. As our methodology
                 manipulates application source code directly, it is
                 inherently platform agnostic and can be adopted as a
                 general means for accelerating divergent applications
                 on data-parallel architectures. In this article, we
                 present the Neuralizer, an automated software flow for
                 kernel identification, NN training, and NN integration,
                 as well as supplementary user-controlled optimization
                 techniques. Evaluating our approach on a variety of
                 divergent applications run on a Graphics Processing
                 Unit (GPU), we on average achieve performance gains of
                 13.6 $ \times $ and energy savings of 14.8 $ \times $
                 with 96\% accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Holey, Mekkat, Yew, Zhai: TACO volume 12, number 1, article 3
%%% (April 2015).  The abstract sets the exponent of ED squared in TeX
%%% math mode and escapes percent signs with a backslash.  The fjournal
%%% and ajournal fields carry the full and abbreviated journal names as
%%% literal strings alongside the j-TACO macro used for journal.
@Article{Holey:2015:PEC,
  author =       "Anup Holey and Vineeth Mekkat and Pen-Chung Yew and
                 Antonia Zhai",
  title =        "Performance-Energy Considerations for Shared Cache
                 Management in a Heterogeneous Multicore Processor",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710019",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Heterogeneous multicore processors that integrate CPU
                 cores and data-parallel accelerators such as graphic
                 processing unit (GPU) cores onto the same die raise
                 several new issues for sharing various on-chip
                 resources. The shared last-level cache (LLC) is one of
                 the most important shared resources due to its impact
                 on performance. Accesses to the shared LLC in
                 heterogeneous multicore processors can be dominated by
                 the GPU due to the significantly higher number of
                 concurrent threads supported by the architecture. Under
                 current cache management policies, the CPU
                 applications' share of the LLC can be significantly
                 reduced in the presence of competing GPU applications.
                 For many CPU applications, a reduced share of the LLC
                 could lead to significant performance degradation. On
                 the contrary, GPU applications can tolerate increase in
                 memory access latency when there is sufficient
                 thread-level parallelism (TLP). In addition to the
                 performance challenge, introduction of diverse cores
                 onto the same die changes the energy consumption
                 profile and, in turn, affects the energy efficiency of
                 the processor. In this work, we propose heterogeneous
                 LLC management (HeLM), a novel shared LLC management
                 policy that takes advantage of the GPU's tolerance for
                 memory access latency. HeLM is able to throttle GPU LLC
                 accesses and yield LLC space to cache-sensitive CPU
                 applications. This throttling is achieved by allowing
                 GPU accesses to bypass the LLC when an increase in
                 memory access latency can be tolerated. The latency
                 tolerance of a GPU application is determined by the
                 availability of TLP, which is measured at runtime as
                 the average number of threads that are available for
                 issuing. For a baseline configuration with two CPU
                 cores and four GPU cores, modeled after existing
                 heterogeneous processor designs, HeLM outperforms least
                 recently used (LRU) policy by 10.4\%. Additionally,
                 HeLM also outperforms competing policies. Our
                 evaluations show that HeLM is able to sustain
                 performance with varying core mix. In addition to the
                 performance benefit, bypassing also reduces total
                 accesses to the LLC, leading to a reduction in the
                 energy consumption of the LLC module. However, LLC
                 bypassing has the potential to increase off-chip
                 bandwidth utilization and DRAM energy consumption. Our
                 experiments show that HeLM exhibits better energy
                 efficiency by reducing the ED$^2$ by 18\% over LRU
                 while impacting only a 7\% increase in off-chip
                 bandwidth utilization.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Suh, Huang, Dubois: TACO volume 12, number 1, article 4 (April
%%% 2015).  The acronym MIPS is braced in the title so that styles which
%%% downcase titles leave it in capitals.  In the abstract, the doubled
%%% hyphen in Pentium III--like typesets as an en dash joining the
%%% two-word name to the suffix.
@Article{Suh:2015:DMR,
  author =       "Jinho Suh and Chieh-Ting Huang and Michel Dubois",
  title =        "Dynamic {MIPS} Rate Stabilization for Complex
                 Processors",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2714575",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern microprocessor cores reach their high
                 performance levels with the help of high clock rates,
                 parallel and speculative execution of a large number of
                 instructions, and vast cache hierarchies. Modern cores
                 also have adaptive features to regulate power and
                 temperature and avoid thermal emergencies. All of these
                 features contribute to highly unpredictable execution
                 times. In this article, we demonstrate that the
                 execution time of in-order (IO), out-of-order (OoO),
                 and OoO simultaneous multithreaded processors can be
                 stable and predictable by stabilizing their mega
                 instructions executed per second (MIPS) rate via a
                 proportional, integral, and differential (PID) gain
                 feedback controller and dynamic voltage and frequency
                 scaling (DVFS). Processor cores in idle cycles are
                 continuously consuming power, which is highly
                 undesirable in systems, especially in real-time
                 systems. In addition to meeting deadlines in real-time
                 systems, our MIPS rate stabilization framework can be
                 applied on top of it to reduce power and energy by
                 avoiding idle cycles. If processors are equipped with
                 MIPS rate stabilization, the execution time can be
                 predicted. Because the MIPS rate remains steady, a
                 stabilized processor meets deadlines on time in
                 real-time systems or in systems with quality-of-service
                 execution latency requirements at the lowest possible
                 frequency. To demonstrate and evaluate this capability,
                 we have selected a subset of the MiBench benchmarks
                 with the widest execution rate variations. We stabilize
                 their MIPS rate on a 1GHz Pentium III--like OoO
                 single-thread microarchitecture, a 1.32GHz
                 StrongARM-like IO microarchitecture, and the 1GHz OoO
                 processor augmented with two-way and four-way
                 simultaneous multithreading. Both IO and OoO cores can
                 take advantage of the stabilization framework, but the
                 energy per instruction of the stabilized OoO core is
                 less because it runs at a lower frequency to meet the
                 same deadlines. The MIPS rate stabilization of complex
                 processors using a PID feedback control loop is a
                 general technique applicable to environments in which
                 lower power or energy coupled with steady, predictable
                 performance are desirable, although we target more
                 specifically real-time systems in this article.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(1), April 2015, article 5: the MAGIC hardware attack, which
%%% accelerates NBTI aging in processor cores.  The journal and
%%% acknowledgement values are string macros (j-TACO, ack-nhfb) defined
%%% outside this chunk.  NOTE(review): the "??" in pages presumably
%%% marks an unrecorded final page --- confirm against the publisher.
@Article{Karimi:2015:MMA,
  author =       "Naghmeh Karimi and Arun Karthik Kanuparthi and Xueyang
                 Wang and Ozgur Sinanoglu and Ramesh Karri",
  title =        "{MAGIC}: Malicious Aging in Circuits\slash Cores",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2724718",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The performance of an IC degrades over its lifetime,
                 ultimately resulting in IC failure. In this article, we
                 present a hardware attack (called MAGIC) to maliciously
                 accelerate NBTI aging effects in cores. In this attack,
                 we identify the input patterns that maliciously age the
                 pipestages of a core. We then craft a program that
                 generates these patterns at the inputs of the targeted
                 pipestage. We demonstrate the MAGIC-based attack on the
                 OpenSPARC processor. Executing this program
                 dramatically accelerates the aging process and degrades
                 the processor's performance by 10.92\% in 1 month,
                 bypassing existing aging mitigation and timing-error
                 correction schemes. We also present two low-cost
                 techniques to thwart the proposed attack.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(1), April 2015, article 6: CERE, a framework that extracts
%%% and replays codelets at the compiler IR level.  The braced surname
%%% {De Oliveira Castro} keeps the three words together as one last
%%% name.  The journal and acknowledgement values are string macros
%%% (j-TACO, ack-nhfb) defined outside this chunk.  Multipliers in the
%%% abstract are written entirely inside math mode so that no word
%%% space separates a number from its times sign.
@Article{DeOliveiraCastro:2015:CLB,
  author =       "Pablo {De Oliveira Castro} and Chadi Akel and Eric
                 Petit and Mihail Popov and William Jalby",
  title =        "{CERE}: {LLVM}-Based {Codelet Extractor and REplayer}
                 for Piecewise Benchmarking and Optimization",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2724717",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents Codelet Extractor and REplayer
                 (CERE), an open-source framework for code isolation.
                 CERE finds and extracts the hotspots of an application
                 as isolated fragments of code, called codelets.
                 Codelets can be modified, compiled, run, and measured
                 independently from the original application. Code
                 isolation reduces benchmarking cost and allows
                 piecewise optimization of an application. Unlike
                 previous approaches, CERE isolates codes at the
                 compiler Intermediate Representation (IR) level.
                 Therefore CERE is language agnostic and supports many
                 input languages such as C, C++, Fortran, and D. CERE
                 automatically detects codelets invocations that have
                 the same performance behavior. Then, it selects a
                 reduced set of representative codelets and invocations,
                 much faster to replay, which still captures accurately
                 the original application. In addition, CERE supports
                 recompiling and retargeting the extracted codelets.
                 Therefore, CERE can be used for cross-architecture
                 performance prediction or piecewise code optimization.
                 On the SPEC 2006 FP benchmarks, CERE codelets cover
                 90.9\% and accurately replay 66.3\% of the execution
                 time. We use CERE codelets in a realistic study to
                 evaluate three different architectures on the NAS
                 benchmarks. CERE accurately estimates each architecture
                 performance and is $ 7.3 \times $ to $ 46.6 \times $
                 cheaper than running the full benchmark.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(1), April 2015, article 7: generalizes the
%%% Heterogeneous-Race-Free (HRF) memory models, using OpenCL 2.0 as the
%%% point of reference.  The journal and acknowledgement values are
%%% string macros (j-TACO, ack-nhfb) defined outside this chunk.
%%% NOTE(review): the "??" in pages presumably marks an unrecorded
%%% final page --- confirm against the publisher.
@Article{Gaster:2015:HRA,
  author =       "Benedict R. Gaster and Derek Hower and Lee Howes",
  title =        "{HRF}-Relaxed: Adapting {HRF} to the Complexities of
                 Industrial Heterogeneous Memory Models",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2701618",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory consistency models, or memory models, allow
                 both programmers and program language implementers to
                 reason about concurrent accesses to one or more memory
                 locations. Memory model specifications balance the
                 often conflicting needs for precise semantics,
                 implementation flexibility, and ease of understanding.
                 Toward that end, popular programming languages like
                 Java, C, and C++ have adopted memory models built on
                 the conceptual foundation of Sequential Consistency for
                 Data-Race-Free programs (SC for DRF). These SC for DRF
                 languages were created with general-purpose homogeneous
                 CPU systems in mind, and all assume a single, global
                 memory address space. Such a uniform address space is
                 usually power and performance prohibitive in
                 heterogeneous Systems on Chips (SoCs), and for that
                 reason most heterogeneous languages have adopted split
                 address spaces and operations with nonglobal
                 visibility. There have recently been two attempts to
                 bridge the disconnect between the CPU-centric
                 assumptions of the SC for DRF framework and the
                 realities of heterogeneous SoC architectures. Hower et
                 al. proposed a class of Heterogeneous-Race-Free (HRF)
                 memory models that provide a foundation for
                 understanding many of the issues in heterogeneous
                 memory models. At the same time, the Khronos Group
                 developed the OpenCL 2.0 memory model that builds on
                 the C++ memory model. The OpenCL 2.0 model includes
                 features not addressed by HRF: primarily support for
                 relaxed atomics and a property referred to as scope
                 inclusion. In this article, we generalize HRF to allow
                 formalization of and reasoning about more complicated
                 models using OpenCL 2.0 as a point of reference. With
                 that generalization, we (1) make the OpenCL 2.0 memory
                 model more accessible by introducing a platform for
                 feature comparisons to other models, (2) consider a
                 number of shortcomings in the current OpenCL 2.0 model,
                 and (3) propose changes that could be adopted by future
                 OpenCL 2.0 revisions or by other, related, models.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(1), April 2015, article 8: generalized task
%%% parallelization, which reduces parallel code generation from a
%%% program dependence graph to integer linear programming.  The
%%% journal and acknowledgement values are string macros (j-TACO,
%%% ack-nhfb) defined outside this chunk.  NOTE(review): the "??" in
%%% pages presumably marks an unrecorded final page --- confirm.
@Article{Streit:2015:GTP,
  author =       "Kevin Streit and Johannes Doerfert and Clemens
                 Hammacher and Andreas Zeller and Sebastian Hack",
  title =        "Generalized Task Parallelism",
  journal =      j-TACO,
  volume =       "12",
  number =       "1",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2723164",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Apr 16 18:39:56 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Existing approaches to automatic parallelization
                 produce good results in specific domains. Yet, it is
                 unclear how to integrate their individual strengths to
                 match the demands and opportunities of complex
                 software. This lack of integration has both practical
                 reasons, as integrating those largely differing
                 approaches into one compiler would impose an
                 engineering hell, as well as theoretical reasons, as no
                 joint cost model exists that would drive the choice
                 between parallelization methods. By reducing the
                 problem of generating parallel code from a program
                 dependence graph to integer linear programming, {\em
                 generalized task parallelization\/} integrates central
                 aspects of existing parallelization approaches into a
                 single unified framework. Implemented on top of LLVM,
                 the framework seamlessly integrates enabling
                 technologies such as speculation, privatization, and
                 the realization of reductions. Evaluating our
                 implementation on various C programs from different
                 domains, we demonstrate the effectiveness and
                 generality of generalized task parallelization. On a
                 quad-core machine with hyperthreading we achieve
                 speedups of up to $ 4.6 \times $.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 9: ARRA, a joint software/hardware
%%% scheme that mirrors frequently used registers into passive ones to
%%% reduce register-file vulnerability.  The journal and
%%% acknowledgement values are string macros (j-TACO, ack-nhfb) defined
%%% outside this chunk.  The MWTF multiplier is typeset in math mode
%%% with a times sign rather than an ASCII letter x.
%%% NOTE(review): confirm against the published abstract that the
%%% multiplier is a true times sign.
@Article{Tabkhi:2015:JSH,
  author =       "Hamed Tabkhi and Gunar Schirner",
  title =        "A Joint {SW\slash HW} Approach for Reducing Register
                 File Vulnerability",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2733378",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The Register File (RF) is a particularly vulnerable
                 component within processor core and at the same time a
                 hotspot with high power density. To reduce RF
                 vulnerability, conventional HW-only approaches such as
                 Error Correction Codes (ECCs) or modular redundancies
                 are not suitable due to their significant power
                 overhead. Conversely, SW-only approaches either have
                 limited improvement on RF reliability or require
                 considerable performance overhead. As a result, new
                 approaches are needed that reduce RF vulnerability with
                 minimal power and performance overhead. This article
                 introduces Application-guided Reliability-enhanced
                 Register file Architecture (ARRA), a novel approach to
                 reduce RF vulnerability of embedded processors. Taking
                 advantage of uneven register utilization, ARRA mirrors,
                 guided by a SW instrumentation, frequently used active
                 registers into passive registers. ARRA is particularly
                 suitable for control applications, as they have a high
                 reliability demand with fairly low (uneven) RF
                 utilization. ARRA is a cross-layer joint HW/SW approach
                 based on an ARRA-extended RF microarchitecture, an ISA
                 extension, as well as static binary analysis and
                 instrumentation. We evaluate ARRA benefits using an
                 ARRA-enhanced Blackfin processor executing a set of
                 DSPBench and MiBench benchmarks. We quantify the
                 benefits using RF Vulnerability Factor (RFVF) and Mean
                 Work To Failure (MWTF). ARRA significantly reduces RFVF
                 from 35\% to 6.9\% in cost of 0.5\% performance lost
                 for control applications. With ARRA's register
                 mirroring, it can also correct Multiple Bit Upsets
                 (MBUs) errors, achieving an $ 8 \times $ increase in MWTF.
                 Compared to a partially ECC-protected RF approach, ARRA
                 demonstrates higher efficiency by achieving comparable
                 vulnerability reduction at much lower power
                 consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 10: integrity checkers for multicore
%%% processors that also check one another periodically.  The journal
%%% and acknowledgement values are string macros (j-TACO, ack-nhfb)
%%% defined outside this chunk.  NOTE(review): the "??" in pages
%%% presumably marks an unrecorded final page --- confirm.
@Article{Kanuparthi:2015:RIC,
  author =       "Arun Kanuparthi and Ramesh Karri",
  title =        "Reliable Integrity Checking in Multicore Processors",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738052",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Security and reliability have become important
                 concerns in the design of computer systems. On one
                 hand, microarchitectural enhancements for security
                 (such as for dynamic integrity checking of code at
                 runtime) have been proposed. On the other hand,
                 independently, microarchitectural enhancements for
                 reliability to detect and tolerate natural faults have
                 also been proposed. A fault in these security
                 enhancements due to alpha particles or aging might
                 potentially pass off maliciously modified instructions
                 as safe, rendering the security enhancements useless.
                 Deliberate fault attacks by attackers can be launched
                 to disable the security enhancements and then launch
                 the well-known security attacks that would otherwise
                 have been detected by these enhancements. We report an
                 integrated microarchitecture support for security and
                 reliability in multicore processors. Specifically, we
                 add integrity checkers to protect the code running on
                 the multiple cores in a multicore processor. We then
                 adapt these checkers to check one another periodically
                 to ensure reliable operation. These checkers naturally
                 can check the other parts of the core. The average
                 performance, power, and area costs for these
                 security-reliability enhancements are 6.42\%, 0.73\%,
                 and 0.53\%, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 11: the Memory-Disk Integrated
%%% System (MDIS), which merges main memory and secondary storage on
%%% NVRAM with a hardware performance optimizer.  The journal and
%%% acknowledgement values are string macros (j-TACO, ack-nhfb) defined
%%% outside this chunk.  NOTE(review): the "??" in pages presumably
%%% marks an unrecorded final page --- confirm.
@Article{Lee:2015:NMD,
  author =       "Do-Heon Lee and Su-Kyung Yoon and Jung-Geun Kim and
                 Charles C. Weems and Shin-Dug Kim",
  title =        "A New Memory-Disk Integrated System with {HW}
                 Optimizer",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738053",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Current high-performance computer systems utilize a
                 memory hierarchy of on-chip cache, main memory, and
                 secondary storage due to differences in device
                 characteristics. Limiting the amount of main memory
                 causes page swap operations and duplicates data between
                 the main memory and the storage device. The
                 characteristics of next-generation memory, such as
                 nonvolatility, byte addressability, and scaling to
                 greater capacity, can be used to solve these problems.
                 Simple replacement of secondary storage with new forms
                 of nonvolatile memory in a traditional memory hierarchy
                 still causes typical problems, such as memory
                 bottleneck, page swaps, and write overhead. Thus, we
                 suggest a single architecture that merges the main
                 memory and secondary storage into a system called a
                 Memory-Disk Integrated System (MDIS). The MDIS
                 architecture is composed of a virtually decoupled NVRAM
                 and a nonvolatile memory performance optimizer
                 combining hardware and software to support this system.
                 The virtually decoupled NVRAM module can support
                 conventional main memory and disk storage operations
                 logically without data duplication and can reduce write
                 operations to the NVRAM. To increase the lifetime and
                 optimize the performance of this NVRAM, another
                 hardware module called a Nonvolatile Performance
                 Optimizer (NVPO) is used that is composed of four small
                 buffers. The NVPO exploits spatial and temporal
                 characteristics of static/dynamic data based on program
                 execution characteristics. Enhanced virtual memory
                 management and address translation modules in the
                 operating system can support these hardware components
                 to achieve a seamless memory-storage environment. Our
                 experimental results show that the proposed
                 architecture can improve execution time by about 89\%
                 over a conventional DRAM main memory/HDD storage
                 system, and 77\% over a state-of-the-art PRAM main
                 memory/HDD disk system with DRAM buffer. Also, the
                 lifetime of the virtually decoupled NVRAM is estimated
                 to be 40\% longer than that of a traditional hierarchy
                 based on the same device technology.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 12: a dynamic reuse scheme for shared
%%% scratchpad memory (SPM) in real-time multicore embedded systems,
%%% with the decisions made offline at design time.  The journal and
%%% acknowledgement values are string macros (j-TACO, ack-nhfb) defined
%%% outside this chunk.  NOTE(review): the "??" in pages presumably
%%% marks an unrecorded final page --- confirm.
@Article{Kafshdooz:2015:DSS,
  author =       "Morteza Mohajjel Kafshdooz and Alireza Ejlali",
  title =        "Dynamic Shared {SPM} Reuse for Real-Time Multicore
                 Embedded Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738051",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Allocating the scratchpad memory (SPM) space to tasks
                 is a challenging problem in real-time multicore
                 embedded systems that use shared SPM. Proper SPM space
                 allocation is important, as it considerably influences
                 the application worst-case execution time (WCET), which
                 is of great importance in real-time applications. To
                 address this problem, in this article we present a
                 dynamic SPM reuse scheme, where SPM space can be reused
                 by other tasks during runtime without requiring any
                 static SPM partitioning. Although the proposed scheme
                 is applied dynamically at runtime, the required
                 decision making is fairly complex and hence cannot be
                 performed at runtime. We have developed techniques to
                 perform the decision making offline at design time in
                 the form of optimization problems combined with task
                 scheduling/mapping. The proposed work is unlike
                 previous works that either exploit static schemes for
                 SPM space allocation or perform task scheduling/mapping
                 and SPM space allocation incoherently. The experimental
                 results show that our dynamic SPM reuse scheme can
                 reduce WCET by up to 55\% as compared to recent
                 previous works on SPM allocation in real-time multicore
                 embedded systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 13: Starchart, a design space
%%% partitioning tool that applies regression trees to GPU performance
%%% and power tuning.  The journal and acknowledgement values are
%%% string macros (j-TACO, ack-nhfb) defined outside this chunk.
%%% NOTE(review): the "??" in pages presumably marks an unrecorded
%%% final page --- confirm.
@Article{Jia:2015:GPP,
  author =       "Wenhao Jia and Elba Garza and Kelly A. Shaw and
                 Margaret Martonosi",
  title =        "{GPU} Performance and Power Tuning Using Regression
                 Trees",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2736287",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GPU performance and power tuning is difficult,
                 requiring extensive user expertise and time-consuming
                 trial and error. To accelerate design tuning,
                 statistical design space exploration methods have been
                 proposed. This article presents Starchart, a novel
                 design space partitioning tool that uses regression
                 trees to approach GPU tuning problems. Improving on
                 prior work, Starchart offers more automation in
                 identifying key design trade-offs and models design
                 subspaces with distinctly different behaviors.
                 Starchart achieves good model accuracy using very few
                 random samples: less than 0.3\% of a given design
                 space; iterative sampling can more quickly target
                 subspaces of interest.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 14: an automated code generator for
%%% a class of Lattice-Boltzmann (LBM) computations that enables
%%% polyhedral optimizations and time tiling.  The journal and
%%% acknowledgement values are string macros (j-TACO, ack-nhfb) defined
%%% outside this chunk.  The break after "Palabos" in the abstract is
%%% an em dash (three hyphens), not a hyphenated compound.
@Article{Pananilath:2015:OCG,
  author =       "Irshad Pananilath and Aravind Acharya and Vinay
                 Vasista and Uday Bondhugula",
  title =        "An Optimizing Code Generator for a Class of
                 Lattice-{Boltzmann} Computations",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "14:1--14:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2739047",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The Lattice-Boltzmann method (LBM), a promising new
                 particle-based simulation technique for complex and
                 multiscale fluid flows, has seen tremendous adoption in
                 recent years in computational fluid dynamics. Even with
                 a state-of-the-art LBM solver such as Palabos, a user
                 has to still manually write the program using
                 library-supplied primitives. We propose an automated
                 code generator for a class of LBM computations with the
                 objective to achieve high performance on modern
                 architectures. Few studies have looked at time tiling
                 for LBM codes. We exploit a key similarity between
                 stencils and LBM to enable polyhedral optimizations and
                 in turn time tiling for LBM. We also characterize the
                 performance of LBM with the Roofline performance model.
                 Experimental results for standard LBM simulations like
                 Lid Driven Cavity, Flow Past Cylinder, and Poiseuille
                 Flow show that our scheme consistently outperforms
                 Palabos---on average by up to $ 3 \times $ while running
                 on 16 cores of an Intel Xeon (Sandybridge). We also
                 obtain an improvement of $ 2.47 \times $ on the SPEC
                 LBM benchmark.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), July 2015, article 15: iterative optimization for the
%%% data center (IODC), which spreads compiler-optimization
%%% combinations across workers and collects statistics at a master.
%%% The journal and acknowledgement values are string macros (j-TACO,
%%% ack-nhfb) defined outside this chunk.  NOTE(review): the "??" in
%%% pages presumably marks an unrecorded final page --- confirm.
@Article{Fang:2015:PIO,
  author =       "Shuangde Fang and Wenwen Xu and Yang Chen and Lieven
                 Eeckhout and Olivier Temam and Yunji Chen and Chengyong
                 Wu and Xiaobing Feng",
  title =        "Practical Iterative Optimization for the Data Center",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "15:1--15:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2739048",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iterative optimization is a simple but powerful
                 approach that searches the best possible combination of
                 compiler optimizations for a given workload. However,
                 iterative optimization is plagued by several practical
                 issues that prevent it from being widely used in
                 practice: a large number of runs are required to find
                 the best combination, the optimum combination is
                 dataset dependent, and the exploration process incurs
                 significant overhead that needs to be compensated for
                 by performance benefits. Therefore, although iterative
                 optimization has been shown to have a significant
                 performance potential, it seldom is used in production
                 compilers. In this article, we propose iterative
                 optimization for the data center (IODC): we show that
                 the data center offers a context in which all of the
                 preceding hurdles can be overcome. The basic idea is to
                 spawn different combinations across workers and
                 recollect performance statistics at the master, which
                 then evolves to the optimum combination of compiler
                 optimizations. IODC carefully manages costs and
                 benefits, and it is transparent to the end user. To
                 bring IODC to practice, we evaluate it in the presence
                 of co-runners to better reflect real-life data center
                 operation with multiple applications co-running per
                 server. We enhance IODC with the capability to find
                 compatible co-runners along with a mechanism to
                 dynamically adjust the level of aggressiveness to
                 improve its robustness in the presence of co-running
                 applications. We evaluate IODC using both MapReduce and
                 compute-intensive throughput server applications. To
                 reflect the large number of users interacting with the
                 system, we gather a very large collection of datasets
                 (up to hundreds of millions of unique datasets per
                 program), for a total storage of 16.4TB and 850 days of
                 CPU time. We report an average performance improvement
                 of $ 1.48 \times $ and up to $ 2.08 \times $ for five
                 MapReduce applications, and $ 1.12 \times $ and up to $
                 1.39 \times $ for nine server applications.
                 Furthermore, our experiments demonstrate that IODC is
                 effective in the presence of co-runners, improving
                 performance by greater than 13\% compared to the worst
                 possible co-runner schedule.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 16 (July 2015): Buddy SM, sharing the pipeline
%%% front-end among GPGPU streaming multiprocessors to save energy.
%%% The ending page is recorded only as the "16:??" placeholder.
@Article{Zhang:2015:BSS,
  author =       "Tao Zhang and Naifeng Jing and Kaiming Jiang and Wei
                 Shu and Min-You Wu and Xiaoyao Liang",
  title =        "{Buddy SM}: Sharing Pipeline Front-End for Improved
                 Energy Efficiency in {GPGPUs}",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "16:1--16:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2744202",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A modern general-purpose graphics processing unit
                 (GPGPU) usually consists of multiple streaming
                 multiprocessors (SMs), each having a pipeline that
                 incorporates a group of threads executing a common
                 instruction flow. Although SMs are designed to work
                 independently, we observe that they tend to exhibit
                 very similar behavior for many workloads. If multiple
                 SMs can be grouped and work in the lock-step manner, it
                 is possible to save energy by sharing the front-end
                 units among multiple SMs, including the instruction
                 fetch, decode, and schedule components. However, such
                 sharing brings architectural challenges and sometime
                 causes performance degradation. In this article, we
                 show our design, implementation, and evaluation for
                 such an architecture, which we call Buddy SM.
                 Specifically, multiple SMs can be opportunistically
                 grouped into a buddy cluster. One SM becomes the
                 master, and the rest become the slaves. The front-end
                 unit of the master works actively for itself as well as
                 for the slaves, whereas the front-end logics of the
                 slaves are power gated. For efficient flow control and
                 program correctness, the proposed architecture can
                 identify unfavorable conditions and ungroup the buddy
                 cluster when necessary. We analyze various techniques
                 to improve the performance and energy efficiency of
                 Buddy SM. Detailed experiments manifest that 37.2\%
                 front-end and 7.5\% total GPU energy reduction can be
                 achieved.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 17 (July 2015): EECache, an energy-efficient
%%% design for shared last-level caches in chip multiprocessors.
%%% The ending page is recorded only as the "17:??" placeholder.
@Article{Cheng:2015:ECS,
  author =       "Hsiang-Yun Cheng and Matt Poremba and Narges Shahidi
                 and Ivan Stalev and Mary Jane Irwin and Mahmut Kandemir
                 and Jack Sampson and Yuan Xie",
  title =        "{EECache}: a Comprehensive Study on the Architectural
                 Design for Energy-Efficient Last-Level Caches in Chip
                 Multiprocessors",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "17:1--17:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2756552",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Power management for large last-level caches (LLCs) is
                 important in chip multiprocessors (CMPs), as the
                 leakage power of LLCs accounts for a significant
                 fraction of the limited on-chip power budget. Since not
                 all workloads running on CMPs need the entire cache,
                 portions of a large, shared LLC can be disabled to save
                 energy. In this article, we explore different design
                 choices, from circuit-level cache organization to
                 microarchitectural management policies, to propose a
                 low-overhead runtime mechanism for energy reduction in
                 the large, shared LLC. We first introduce a slice-based
                 cache organization that can shut down parts of the
                 shared LLC with minimal circuit overhead. Based on this
                 slice-based organization, part of the shared LLC can be
                 turned off according to the spatial and temporal cache
                 access behavior captured by low-overhead sampling-based
                 hardware. In order to eliminate the performance
                 penalties caused by flushing data before powering off a
                 cache slice, we propose data migration policies to
                 prevent the loss of useful data in the LLC. Results
                 show that our energy-efficient cache design (EECache)
                 provides 14.1\% energy savings at only 1.2\%
                 performance degradation and consumes negligible
                 hardware overhead compared to prior work.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 18 (July 2015): linker-based function
%%% interception for software memoization, illustrated with the
%%% transcendental functions.  The ending page is recorded only as the
%%% "18:??" placeholder.
@Article{Suresh:2015:IFM,
  author =       "Arjun Suresh and Bharath Narasimha Swamy and Erven
                 Rohou and Andr{\'e} Seznec",
  title =        "Intercepting Functions for Memoization: a Case Study
                 Using Transcendental Functions",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "18:1--18:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2751559",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memoization is the technique of saving the results of
                 executions so that future executions can be omitted
                 when the input set repeats. Memoization has been
                 proposed in previous literature at the instruction,
                 basic block, and function levels using hardware, as
                 well as pure software-level approaches including
                 changes to programming language. In this article, we
                 focus on software memoization for procedural languages
                 such as C and Fortran at the granularity of a function.
                 We propose a simple linker-based technique for enabling
                 software memoization of any dynamically linked pure
                 function by function interception and illustrate our
                 framework using a set of computationally expensive pure
                 functions --- the transcendental functions. Transcendental
                 functions are those that cannot be expressed in terms
                 of a finite sequence of algebraic operations
                 (trigonometric functions, exponential functions, etc.)
                 and hence are computationally expensive. Our technique
                 does not need the availability of source code and thus
                 can even be applied to commercial applications, as well
                 as applications with legacy codes. As far as users are
                 concerned, enabling memoization is as simple as setting
                 an environment variable. Our framework does not make
                 any specific assumptions about the underlying
                 architecture or compiler toolchains and can work with a
                 variety of current architectures. We present
                 experimental results for a x86-64 platform using both
                 gcc and icc compiler toolchains, and an ARM Cortex-A9
                 platform using gcc. Our experiments include a mix of
                 real-world programs and standard benchmark suites: SPEC
                 and Splash2x. On standard benchmark applications that
                 extensively call the transcendental functions, we
                 report memoization benefits of up to 50\% on Intel Ivy
                 Bridge and up to 10\% on ARM Cortex-A9. Memoization was
                 able to regain a performance loss of 76\% in bwaves due
                 to a known performance bug in the GNU implementation of
                 the pow function. The same benchmark on ARM Cortex-A9
                 benefited by more than 200\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 19 (July 2015): SECRET, a selective error
%%% correction framework for reducing DRAM refresh energy.
%%% The ending page is recorded only as the "19:??" placeholder.
@Article{Lin:2015:SSE,
  author =       "Chung-Hsiang Lin and De-Yu Shen and Yi-Jung Chen and
                 Chia-Lin Yang and Cheng-Yuan Michael Wang",
  title =        "{SECRET}: a Selective Error Correction Framework for
                 Refresh Energy Reduction in {DRAMs}",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "19:1--19:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2747876",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "DRAMs are used as the main memory in most computing
                 systems today. Studies show that DRAMs contribute to a
                 significant part of overall system power consumption.
                 One of the main challenges in low-power DRAM design is
                 the inevitable refresh process. Due to process
                 variation, memory cells exhibit retention time
                 variations. Current DRAMs use a single refresh period
                 determined by the cell with the largest leakage. Since
                 prolonging refresh intervals introduces retention
                 errors, a set of previous works adopt conventional
                 error-correcting code (ECC) to correct retention
                 errors. However, these approaches introduce significant
                 area and energy overheads. In this article, we propose
                 a novel error correction framework for retention errors
                 in DRAMs, called SECRET (selective error correction for
                 refresh energy reduction). The key observations we make
                 are that retention errors are hard errors rather than
                 soft errors, and only few DRAM cells have large
                 leakage. Therefore, instead of equipping error
                 correction capability for all memory cells as existing
                 ECC schemes, we only allocate error correction
                 information to leaky cells under a refresh interval.
                 Our SECRET framework contains two parts: an offline
                 phase to identify memory cells with retention errors
                 given a target error rate and a low-overhead error
                 correction mechanism. The experimental results show
                 that among all test cases performed, the proposed
                 SECRET framework can reduce refresh power by 87.2\% and
                 overall DRAM power up to 18.57\% with negligible area
                 and performance overheads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 20 (July 2015): snippets of Java code used to
%%% express low-level semantics in the Graal compiler.  The bibsource
%%% field also names java2010.bib.  The ending page is recorded only as
%%% the "20:??" placeholder.
@Article{Simon:2015:STH,
  author =       "Doug Simon and Christian Wimmer and Bernhard Urban and
                 Gilles Duboscq and Lukas Stadler and Thomas
                 W{\"u}rthinger",
  title =        "Snippets: Taking the High Road to a Low Level",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "20:1--20:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2764907",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "When building a compiler for a high-level language,
                 certain intrinsic features of the language must be
                 expressed in terms of the resulting low-level
                 operations. Complex features are often expressed by
                 explicitly weaving together bits of low-level IR, a
                 process that is tedious, error prone, difficult to
                 read, difficult to reason about, and machine dependent.
                 In the Graal compiler for Java, we take a different
                 approach: we use snippets of Java code to express
                 semantics in a high-level, architecture-independent
                 way. Two important restrictions make snippets feasible
                 in practice: they are compiler specific, and they are
                 explicitly prepared and specialized. Snippets make
                 Graal simpler and more portable while still capable of
                 generating machine code that can compete with other
                 compilers of the Java HotSpot VM.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 21 (July 2015): MIAOW, an open-source RTL
%%% implementation of a GPGPU.  The bibsource field also names pvm.bib.
%%% The ending page is recorded only as the "21:??" placeholder.
@Article{Balasubramanian:2015:EGL,
  author =       "Raghuraman Balasubramanian and Vinay Gangadhar and
                 Ziliang Guo and Chen-Han Ho and Cherin Joseph and
                 Jaikrishnan Menon and Mario Paulo Drumond and Robin
                 Paul and Sharath Prasad and Pradip Valathol and
                 Karthikeyan Sankaralingam",
  title =        "Enabling {GPGPU} Low-Level Hardware Explorations with
                 {MIAOW}: an Open-Source {RTL} Implementation of a
                 {GPGPU}",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2764908",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Graphic processing unit (GPU)-based general-purpose
                 computing is developing as a viable alternative to
                 CPU-based computing in many domains. Today's tools for
                 GPU analysis include simulators like GPGPU-Sim,
                 Multi2Sim, and Barra. While useful for modeling
                 first-order effects, these tools do not provide a
                 detailed view of GPU microarchitecture and physical
                 design. Further, as GPGPU research evolves, design
                 ideas and modifications demand detailed estimates of
                 impact on overall area and power. Fueled by this need,
                 we introduce MIAOW (Many-core Integrated Accelerator Of
                 Wisconsin), an open-source RTL implementation of the
                 AMD Southern Islands GPGPU ISA, capable of running
                 unmodified OpenCL-based applications. We present our
                 design motivated by our goals to create a realistic,
                 flexible, OpenCL-compatible GPGPU, capable of emulating
                 a full system. We first explore if MIAOW is realistic
                 and then use four case studies to show that MIAOW
                 enables the following: physical design perspective to
                 ``traditional'' microarchitecture, new types of
                 research exploration, and validation/calibration of
                 simulator-based characterization of hardware. The
                 findings and ideas are contributions in their own
                 right, in addition to MIAOW's utility as a tool for
                 others' research.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 22 (July 2015): LAWS, a locality-aware
%%% work-stealing scheduler for multisocket multicore architectures.
%%% The ending page is recorded only as the "22:??" placeholder.
@Article{Chen:2015:LAW,
  author =       "Quan Chen and Minyi Guo",
  title =        "Locality-Aware Work Stealing Based on Online Profiling
                 and Auto-Tuning for Multisocket Multicore
                 Architectures",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "22:1--22:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2766450",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern mainstream powerful computers adopt multisocket
                 multicore CPU architecture and NUMA-based memory
                 architecture. While traditional work-stealing
                 schedulers are designed for single-socket
                 architectures, they incur severe shared cache misses
                 and remote memory accesses in these computers. To solve
                 the problem, we propose a locality-aware work-stealing
                 (LAWS) scheduler, which better utilizes both the shared
                 cache and the memory system. In LAWS, a load-balanced
                 task allocator is used to evenly split and store the
                 dataset of a program to all the memory nodes and
                 allocate a task to the socket where the local memory
                 node stores its data for reducing remote memory
                 accesses. Then, an adaptive DAG packer adopts an
                 auto-tuning approach to optimally pack an execution DAG
                 into cache-friendly subtrees. After cache-friendly
                 subtrees are created, every socket executes
                 cache-friendly subtrees sequentially for optimizing
                 shared cache usage. Meanwhile, a triple-level
                 work-stealing scheduler is applied to schedule the
                 subtrees and the tasks in each subtree. Through
                 theoretical analysis, we show that LAWS has comparable
                 time and space bounds compared with traditional
                 work-stealing schedulers. Experimental results show
                 that LAWS can improve the performance of memory-bound
                 programs up to 54.2\% on AMD-based experimental
                 platforms and up to 48.6\% on Intel-based experimental
                 platforms compared with traditional work-stealing
                 schedulers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 23 (July 2015): Section-Based Program Analysis
%%% (SBPA) for reducing load/store instrumentation overhead.  The
%%% bibsource field also names multithreading.bib.  The ending page is
%%% recorded only as the "23:??" placeholder.
@Article{Das:2015:SBP,
  author =       "Madan Das and Gabriel Southern and Jose Renau",
  title =        "Section-Based Program Analysis to Reduce Overhead of
                 Detecting Unsynchronized Thread Communication",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "23:1--23:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2766451",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Most systems that test and verify parallel programs,
                 such as deterministic execution engines, data race
                 detectors, and software transactional memory systems,
                 require instrumenting loads and stores in an
                 application. This can cause a very significant runtime
                 and memory overhead compared to executing
                 uninstrumented code. Multithreaded programming
                 typically allows any thread to perform loads and stores
                 to any location in the process's address space
                 independently, and such tools monitor all these memory
                 accesses. However, many of the addresses in these
                 unsynchronized memory accesses are only used by a
                 single thread and do not affect other executing
                 threads. We propose Section-Based Program Analysis
                 (SBPA), a novel way to decompose the program into
                 disjoint code sections to identify and eliminate
                 instrumenting such loads and stores during program
                 compilation so that the program runtime overhead is
                 significantly reduced. Our analysis includes
                 improvements to pointer analysis and uses a few user
                 directives to increase the effectiveness of SBPA
                 further. We implemented SBPA for a deterministic
                 execution runtime environment and were able to
                 eliminate 51\% of dynamic memory access
                 instrumentations. When combined with directives, such
                 reduction increased to 63\%. We also integrated SBPA
                 with ThreadSanitizer, a state-of-the-art dynamic race
                 detector, and achieved a speedup of 2.43 (2.74 with
                 directives) on a geometric mean basis.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 24 (July 2015): aging-aware compilation for
%%% GP-GPUs (NBTI wear leveling across processing elements).  The
%%% bibsource field also names pvm.bib.  The ending page is recorded
%%% only as the "24:??" placeholder.
@Article{Lotfi:2015:AAC,
  author =       "Atieh Lotfi and Abbas Rahimi and Luca Benini and
                 Rajesh K. Gupta",
  title =        "Aging-Aware Compilation for {GP-GPUs}",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "24:1--24:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2778984",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "General-purpose graphic processing units (GP-GPUs)
                 offer high computational throughput using thousands of
                 integrated processing elements (PEs). These PEs are
                 stressed during workload execution, and negative bias
                 temperature instability (NBTI) adversely affects their
                 reliability by introducing new delay-induced faults.
                 However, the effect of these delay variations is not
                 uniformly spread across the PEs: some are affected more
                 --- hence less reliable --- than others. This variation
                 causes significant reduction in the lifetime of GP-GPU
                 parts. In this article, we address the problem of
                 ``wear leveling'' across processing units to mitigate
                 lifetime uncertainty in GP-GPUs. We propose innovations
                 in the static compiled code that can improve healing in
                 PEs and stream cores (SCs) based on their degradation
                 status. PE healing is a fine-grained very long
                 instruction word (VLIW) slot assignment scheme that
                 balances the stress of instructions across the PEs
                 within an SC. SC healing is a coarse-grained workload
                 allocation scheme that distributes workload across SCs
                 in GP-GPUs. Both schemes share a common property: they
                 adaptively shift workload from less reliable units to
                 more reliable units, either spatially or temporally.
                 These software schemes are based on online calibration
                 with NBTI monitoring that equalizes the expected
                 lifetime of PEs and SCs by regenerating adaptive
                 compiled codes to respond to the specific health state
                 of the GP-GPUs. We evaluate the effectiveness of the
                 proposed schemes for various OpenCL kernels from the
                 AMD APP SDK on Evergreen and Southern Island GPU
                 architectures. The aging-aware healthy kernels
                 generated by the PE (or SC) healing scheme reduce
                 NBTI-induced voltage threshold shift by 30\% (77\% in
                 the case of SCs), with no (moderate) performance
                 penalty compared to the naive kernels.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(2), article 25 (July 2015): Contech, a framework for
%%% generating dynamic task graphs from arbitrary parallel programs.
%%% The ending page is recorded only as the "25:??" placeholder.
@Article{Railing:2015:CEG,
  author =       "Brian P. Railing and Eric R. Hein and Thomas M.
                 Conte",
  title =        "{Contech}: Efficiently Generating Dynamic Task Graphs
                 for Arbitrary Parallel Programs",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "25:1--25:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2776893",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Parallel programs can be characterized by task graphs
                 encoding instructions, memory accesses, and the
                 parallel work's dependencies, while representing any
                 threading library and architecture. This article
                 presents Contech, a high performance framework for
                 generating dynamic task graphs from arbitrary parallel
                 programs, and a novel representation enabling
                 programmers and compiler optimizations to understand
                 and exploit program aspects. The Contech framework
                 supports a variety of languages (including C, C++, and
                 Fortran), parallelization libraries, and ISAs
                 (including x86 and ARM). Running natively for
                 collection speed and minimizing program perturbation,
                 the instrumentation shows $ 4 \times $ improvement over
                 a Pin-based implementation on PARSEC and NAS
                 benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 12(3), article 26 (October 2015): effect of classification
%%% granularity and adaptivity on private/shared data classification
%%% for cache coherence.  The ending page is recorded only as the
%%% "26:??" placeholder.
@Article{Davari:2015:EGA,
  author =       "Mahdad Davari and Alberto Ros and Erik Hagersten and
                 Stefanos Kaxiras",
  title =        "The Effects of Granularity and Adaptivity on
                 Private\slash Shared Classification for Coherence",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "26:1--26:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790301",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Classification of data into private and shared has
                 proven to be a catalyst for techniques to reduce
                 coherence cost, since private data can be taken out of
                 coherence and resources can be concentrated on
                 providing coherence for shared data. In this article,
                 we examine how granularity --- page-level versus
                 cache-line level --- and adaptivity --- going from
                 shared to private --- affect the outcome of
                 classification and its
                 final impact on coherence. We create a classification
                 technique, called Generational Classification, and a
                 coherence protocol called Generational Coherence, which
                 treats data as private or shared based on cache-line
                 generations. We compare two coherence protocols based
                 on self-invalidation/self-downgrade with respect to
                 data classification. Our findings are enlightening: (i)
                 Some programs benefit from finer granularity, some
                 benefit further from adaptivity, but some do not
                 benefit from either. (ii) Reducing the amount of shared
                 data has no perceptible impact on coherence misses
                 caused by self-invalidation of shared data, hence no
                 impact on performance. (iii) In contrast, classifying
                 more data as private has implications for protocols
                 that employ write-through as a means of self-downgrade,
                 resulting in network traffic reduction --- up to 30\%
                 --- by reducing write-through traffic.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Gottscho:2015:DDP,
  author =       "Mark Gottscho and Abbas BanaiyanMofrad and Nikil Dutt
                 and Alex Nicolau and Puneet Gupta",
  title =        "{DPCS}: Dynamic Power\slash Capacity Scaling for
                 {SRAM} Caches in the Nanoscale Era",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "27:1--27:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2792982",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Fault-Tolerant Voltage-Scalable (FTVS) SRAM cache
                 architectures are a promising approach to improve
                 energy efficiency of memories in the presence of
                 nanoscale process variation. Complex FTVS schemes are
                 commonly proposed to achieve very low minimum supply
                 voltages, but these can suffer from high overheads and
                 thus do not always offer the best power/capacity
                 trade-offs. We observe on our 45nm test chips that the
                 ``fault inclusion property'' can enable lightweight
                 fault maps that support multiple runtime supply
                 voltages. Based on this observation, we propose a
                 simple and low-overhead FTVS cache architecture for
                 power/capacity scaling. Our mechanism combines
                 multilevel voltage scaling with optional architectural
                 support for power gating of blocks as they become
                 faulty at low voltages. A static (SPCS) policy sets the
                 runtime cache VDD once such that only a few cache
                 blocks may be faulty in order to minimize the impact on
                 performance. We describe a Static Power/Capacity
                 Scaling (SPCS) policy and two alternate Dynamic
                 Power/Capacity Scaling (DPCS) policies that
                 opportunistically reduce the cache voltage even further
                 for more energy savings. This architecture achieves
                 lower static power for all effective cache capacities
                 than a recent more complex FTVS scheme. This is due to
                 significantly lower overheads, despite the inability of
                 our approach to match the min-VDD of the competing work
                 at a fixed target yield. Over a set of SPEC CPU2006
                 benchmarks on two system configurations, the average
                 total cache (system) energy saved by SPCS is 62\%
                 (22\%), while the two DPCS policies achieve roughly
                 similar energy reduction, around 79\% (26\%). On
                 average, the DPCS approaches incur 2.24\% performance
                 and 6\% area penalties.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Michaud:2015:RCM,
  author =       "Pierre Michaud and Andrea Mondelli and Andr{\'e}
                 Seznec",
  title =        "Revisiting Clustered Microarchitecture for Future
                 Superscalar Cores: a Case for Wide Issue Clusters",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "28:1--28:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2800787",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "During the past 10 years, the clock frequency of
                 high-end superscalar processors has not increased.
                 Performance keeps growing mainly by integrating more
                 cores on the same chip and by introducing new
                 instruction set extensions. However, this benefits only
                 some applications and requires rewriting and/or
                 recompiling these applications. A more general way to
                 accelerate applications is to increase the IPC, the
                 number of instructions executed per cycle. Although the
                 focus of academic microarchitecture research moved away
                 from IPC techniques, the IPC of commercial processors
                 was continuously improved during these years. We argue
                 that some of the benefits of technology scaling should
                 be used to raise the IPC of future superscalar cores
                 further. Starting from microarchitecture parameters
                 similar to recent commercial high-end cores, we show
                 that an effective way to increase the IPC is to allow
                 the out-of-order engine to issue more micro-ops per
                 cycle. But this must be done without impacting the
                 clock cycle. We propose combining two techniques:
                 clustering and register write specialization. Past
                 research on clustered microarchitectures focused on
                 narrow issue clusters, as the emphasis at that time was
                 on allowing high clock frequencies. Instead, in this
                 study, we consider wide issue clusters, with the goal
                 of increasing the IPC under a constant clock frequency.
                 We show that on a wide issue dual cluster, a very
                 simple steering policy that sends 64 consecutive
                 instructions to the same cluster, the next 64
                 instructions to the other cluster, and so forth,
                 permits tolerating an intercluster delay of three
                 cycles. We also propose a method for decreasing the
                 energy cost of sending results from one cluster to the
                 other cluster.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Natarajan:2015:LTE,
  author =       "Ragavendra Natarajan and Antonia Zhai",
  title =        "Leveraging Transactional Execution for Memory
                 Consistency Model Emulation",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "29:1--29:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786980",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "System emulation is widely used in today's computer
                 systems. This technology opens new opportunities for
                 resource sharing as well as enhancing system security
                 and reliability. System emulation across different
                 instruction set architectures (ISA) can enable further
                 opportunities. For example, cross-ISA emulation can
                 enable workload consolidation over a wide range of
                 microprocessors and potentially facilitate the seamless
                 deployment of new processor architectures. As multicore
                 and manycore processors become pervasive, it is
                 important to address the challenges toward supporting
                 system emulation on these platforms. A key challenge in
                 cross-ISA emulation on multicore systems is ensuring
                 the correctness of emulation when the guest and the
                 host memory consistency models differ. Many existing
                 cross-ISA system emulators are sequential, thus they
                 are able to avoid this problem at the cost of
                 significant performance degradation. Recently proposed
                 parallel emulators are able to address the performance
                 limitation; however, they provide limited support for
                 memory consistency model emulation. When the host
                 system has a weaker memory consistency model compared
                 to the guest system, the emulator can insert memory
                 fences at appropriate locations in the translated code
                 to enforce the guest memory ordering constraints. These
                 memory fences can significantly degrade the performance
                 of the translated code. Transactional execution support
                 available on certain recent microprocessors provides an
                 alternative approach. Transactional execution of the
                 translated code enforces sequential consistency (SC) at
                 the coarse-grained transaction level, which in turn
                 ensures that all memory accesses made on the host
                 machine conform to SC. Enforcing SC on the host machine
                 guarantees that the emulated execution will be correct
                 for any guest memory model. In this article, we compare
                 and evaluate the overheads associated with using
                 transactions and fences for memory consistency model
                 emulation on the Intel Haswell processor. Our
                 experience of implementing these two approaches on a
                 state-of-the-art parallel emulator, COREMU,
                 demonstrates that memory consistency model emulation
                 using transactions performs better when the transaction
                 sizes are large enough to amortize the transaction
                 overhead and the transaction conflict rate is low,
                 whereas inserting memory fences is better for
                 applications in which the transaction overhead is high.
                 A hybrid implementation that dynamically determines
                 which approach to invoke can outperform both
                 approaches. Our results, based on the SPLASH-2 and the
                 PARSEC benchmark suites, demonstrate that the proposed
                 hybrid approach is able to outperform the fence
                 insertion mechanism by 4.9\% and the transactional
                 execution approach by 24.9\% for two-thread
                 applications, and outperform them by 4.5\% and 44.7\%,
                 respectively, for four-threaded execution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Panda:2015:CUD,
  author =       "Biswabandan Panda and Shankar Balachandran",
  title =        "{CAFFEINE}: a Utility-Driven Prefetcher Aggressiveness
                 Engine for Multicores",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "30:1--30:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2806891",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Aggressive prefetching improves system performance by
                 hiding and tolerating off-chip memory latency. However,
                 on a multicore system, prefetchers of different cores
                 contend for shared resources and aggressive prefetching
                 can degrade the overall system performance. The role of
                 a prefetcher aggressiveness engine is to select
                 appropriate aggressiveness levels for each prefetcher
                 such that shared resource contention caused by
                 prefetchers is reduced, thereby improving system
                 performance. State-of-the-art prefetcher aggressiveness
                 engines monitor metrics such as prefetch accuracy,
                 bandwidth consumption, and last-level cache pollution.
                 They use carefully tuned thresholds for these metrics,
                 and when the thresholds are crossed, they trigger
                 aggressiveness control measures. These engines have
                 three major shortcomings: (1) thresholds are dependent
                 on the system configuration (cache size, DRAM
                 scheduling policy, and cache replacement policy) and
                 have to be tuned appropriately, (2) there is no single
                 threshold that works well across all the workloads, and
                 (3) thresholds are oblivious to the phase change of
                 applications. To overcome these shortcomings, we
                 propose CAFFEINE, a model-based approach that analyzes
                 the effectiveness of a prefetcher and uses a metric
                 called net utility to control the aggressiveness. Our
                 metric provides net processor cycles saved because of
                 prefetching by approximating the cycles saved across
                 the memory subsystem, from last-level cache to DRAM. We
                 evaluate CAFFEINE across a wide range of workloads and
                 compare it with the state-of-the-art prefetcher
                 aggressiveness engine. Experimental results demonstrate
                 that, on average (geomean), CAFFEINE achieves 9.5\% (as
                 much as 38.29\%) and 11\% (as much as 20.7\%) better
                 performance than the best-performing aggressiveness
                 engine for four-core and eight-core systems,
                 respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhao:2015:BSB,
  author =       "Jishen Zhao and Sheng Li and Jichuan Chang and John L.
                 Byrne and Laura L. Ramirez and Kevin Lim and Yuan Xie
                 and Paolo Faraboschi",
  title =        "{Buri}: Scaling Big-Memory Computing with
                 Hardware-Based Memory Expansion",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "31:1--31:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808233",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Motivated by the challenges of scaling up memory
                 capacity and fully exploiting the benefits of memory
                 compression, we propose Buri, a hardware-based memory
                 compression scheme, which simultaneously achieves cost
                 efficiency, high performance, and ease of adoption.
                 Buri combines (1) a self-contained, ready-to-adopt
                 hardware compression module, which manages metadata
                 compression and memory allocation/relocation
                 operations; (2) a set of hardware optimization
                 mechanisms, which reduce the area and performance
                 overheads in accommodating the address indirection
                 required by memory compression; and (3) lightweight
                 BIOS/OS extensions used to handle exceptions. Our
                 evaluation with large memory workload traces shows that
                 Buri can increase capacity by 70\%, in addition to the
                 compression ratio already provided by database
                 software.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lucas:2015:SSS,
  author =       "Jan Lucas and Michael Andersch and Mauricio
                 Alvarez-Mesa and Ben Juurlink",
  title =        "Spatiotemporal {SIMT} and Scalarization for Improving
                 {GPU} Efficiency",
  journal =      j-TACO,
  volume =       "12",
  number =       "3",
  pages =        "32:1--32:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2811402",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Oct 7 18:51:05 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Temporal SIMT (TSIMT) has been suggested as an
                 alternative to conventional (spatial) SIMT for
                 improving GPU performance on branch-intensive code.
                 Although TSIMT has been briefly mentioned before, it
                 was not evaluated. We present a complete design and
                 evaluation of TSIMT GPUs, along with the inclusion of
                 scalarization and a combination of temporal and spatial
                 SIMT, named Spatiotemporal SIMT (STSIMT). Simulations
                 show that TSIMT alone results in a performance
                 reduction, but a combination of scalarization and
                 STSIMT yields a mean performance enhancement of 19.6\%
                 and improves the energy-delay product by 26.2\%
                 compared to SIMT.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Das:2016:RDB,
  author =       "Subhasis Das and Tor M. Aamodt and William J. Dally",
  title =        "Reuse Distance-Based Probabilistic Cache Replacement",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "33:1--33:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818374",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes Probabilistic Replacement Policy
                 (PRP), a novel replacement policy that evicts the line
                 with minimum estimated hit probability under optimal
                 replacement instead of the line with maximum expected
                 reuse distance. The latter is optimal under the
                 independent reference model of programs, which does not
                 hold for last-level caches (LLC). PRP requires 7\% and
                 2\% metadata overheads in the cache and DRAM
                 respectively. Using a sampling scheme makes DRAM
                 overhead negligible, with minimal performance impact.
                 Including detailed overhead modeling and equal cache
                 areas, PRP outperforms SHiP, a state-of-the-art LLC
                 replacement algorithm, by 4\% for memory-intensive
                 SPEC-CPU2006 benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Deniz:2016:MGM,
  author =       "Etem Deniz and Alper Sen",
  title =        "{MINIME-GPU}: Multicore Benchmark Synthesizer for
                 {GPUs}",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818693",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We introduce MINIME-GPU, a novel automated benchmark
                 synthesis framework for graphics processing units
                 (GPUs) that serves to speed up architectural simulation
                 of modern GPU architectures. Our framework captures
                 important characteristics of original GPU applications
                 and generates synthetic GPU benchmarks using the Open
                 Computing Language (OpenCL) library from those
                 applications. To the best of our knowledge, this is the
                 first time synthetic OpenCL benchmarks for GPUs are
                 generated from existing applications. We use several
                 characteristics, including instruction throughput,
                 compute unit occupancy, and memory efficiency, to
                 compare the similarity of original applications and
                 their corresponding synthetic benchmarks. The
                 experimental results show that our synthetic benchmark
                 generation framework is capable of generating synthetic
                 benchmarks that have similar characteristics with the
                 original applications from which they are generated. On
                 average, the similarity (accuracy) is 96\% and the
                 speedup is 541 $ \times $. In addition, our synthetic
                 benchmarks use the OpenCL library, which allows us to
                 obtain portable human readable benchmarks as opposed to
                 using assembly-level code, and they are faster and
                 smaller than the original applications from which they
                 are generated. We experimentally validated that our
                 synthetic benchmarks preserve the characteristics of
                 the original applications across different
                 architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tan:2016:SEE,
  author =       "Li Tan and Zizhong Chen and Shuaiwen Leon Song",
  title =        "Scalable Energy Efficiency with Resilience for High
                 Performance Computing Systems: a Quantitative
                 Methodology",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "35:1--35:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2822893",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Ever-growing performance of supercomputers nowadays
                 brings demanding requirements of energy efficiency and
                 resilience, due to rapidly expanding size and duration
                 in use of the large-scale computing systems. Many
                 application/architecture-dependent parameters that
                 determine energy efficiency and resilience individually
                 have causal effects with each other, which directly
                 affect the trade-offs among performance, energy
                 efficiency and resilience at scale. To enable
                 high-efficiency management for large-scale
                 High-Performance Computing (HPC) systems nowadays,
                 quantitatively understanding the entangled effects
                 among performance, energy efficiency, and resilience is
                 thus required. While previous work focuses on exploring
                 energy-saving and resilience-enhancing opportunities
                 separately, little has been done to theoretically and
                 empirically investigate the interplay between energy
                 efficiency and resilience at scale. In this article, by
                 extending the Amdahl's Law and the Karp-Flatt Metric,
                 taking resilience into consideration, we quantitatively
                 model the integrated energy efficiency in terms of
                 performance per Watt and showcase the trade-offs among
                 typical HPC parameters, such as number of cores,
                 frequency/voltage, and failure rates. Experimental
                 results for a wide spectrum of HPC benchmarks on two
                 HPC systems show that the proposed models are accurate
                 in extrapolating resilience-aware performance and
                 energy efficiency, and capable of capturing the
                 interplay among various energy-saving and resilience
                 factors. Moreover, the models can help find the optimal
                 HPC configuration for the highest integrated energy
                 efficiency, in the presence of failures and applied
                 resilience techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Pusukuri:2016:TEL,
  author =       "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
                 Bhuyan",
  title =        "{Tumbler}: an Effective Load-Balancing Technique for
                 Multi-{CPU} Multicore Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2827698",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Schedulers used by modern OSs (e.g., Oracle Solaris
                 11{\trademark} and GNU/Linux) balance load by balancing
                 the number of threads in run queues of different cores.
                 While this approach is effective for a single CPU
                 multicore system, we show that it can lead to a
                 significant load imbalance across CPUs of a multi-CPU
                 multicore system. Because different threads of a
                 multithreaded application often exhibit different
                 levels of CPU utilization, load cannot be measured in
                 terms of the number of threads alone. We propose
                 Tumbler that migrates the threads of a multithreaded
                 program across multiple CPUs to balance the load across
                 the CPUs. While Tumbler distributes the threads equally
                 across the CPUs, its assignment of threads to CPUs is
                 aimed at minimizing the variation in utilization of
                 different CPUs to achieve load balance. We evaluated
                 Tumbler using a wide variety of 35 multithreaded
                 applications, and our experimental results show that
                 Tumbler outperforms both Oracle Solaris 11{\trademark}
                 and GNU/Linux.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tomusk:2016:FME,
  author =       "Erik Tomusk and Christophe Dubach and Michael
                 O'Boyle",
  title =        "Four Metrics to Evaluate Heterogeneous Multicores",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "37:1--37:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2829950",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Semiconductor device scaling has made single-ISA
                 heterogeneous processors a reality. Heterogeneous
                 processors contain a number of different CPU cores that
                 all implement the same Instruction Set Architecture
                 (ISA). This enables greater flexibility and
                 specialization, as runtime constraints and workload
                 characteristics can influence which core a given
                 workload is run on. A major roadblock to the further
                 development of heterogeneous processors is the lack of
                 appropriate evaluation metrics. Existing metrics can be
                 used to evaluate individual cores, but to evaluate a
                 heterogeneous processor, the cores must be considered
                 as a collective. Without appropriate metrics, it is
                 impossible to establish design goals for processors,
                 and it is difficult to accurately compare two different
                 heterogeneous processors. We present four new metrics
                 to evaluate user-oriented aspects of sets of
                 heterogeneous cores: localized nonuniformity, gap
                 overhead, set overhead, and generality. The metrics
                 consider sets rather than individual cores. We use
                 examples to demonstrate each metric, and show that the
                 metrics can be used to quantify intuitions about
                 heterogeneous cores.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hoseinzadeh:2016:SSP,
  author =       {Morteza Hoseinzadeh and Mohammad Arjomand and Hamid
                 Sarbazi-Azad},
  title =        {{SPCM}: The Striped Phase Change Memory},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {38},
  pages =        {38:1--38:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2829951},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {Phase Change Memory (PCM) devices are one of the known
                 promising technologies to take the place of DRAM
                 devices with the aim of overcoming the obstacles of
                 reducing feature size and stopping ever growing amounts
                 of leakage power. In exchange for providing high
                 capacity, high density, and nonvolatility, PCM
                 Multilevel Cells (MLCs) impose high write energy and
                 long latency. Many techniques have been proposed to
                 resolve these side effects. However, read performance
                 issues are usually left behind the great importance of
                 write latency, energy, and lifetime. In this article,
                 we focus on read performance and improve the critical
                 path latency of the main memory system. To this end, we
                 exploit striping scheme by which multiple lines are
                 grouped and lie on a single MLC line array. In order to
                 achieve more performance gain, an adaptive ordering
                 mechanism is used to sort lines in a group based on
                 their read frequency. This scheme imposes large energy
                 and lifetime overheads due to its intensive demand for
                 higher write bandwidth. Thus, we equipped our design
                 with a grouping/pairing write queue to synchronize
                 write-back requests such that all updates to an MLC
                 array occur at once. The design is also augmented by a
                 directional write scheme that takes benefits of the
                 uniformity of accesses to the PCM device---caused by
                 the large DRAM cache---to determine the writing mode
                 (striped or nonstriped). This adaptation to write
                 operations relaxes the energy and lifetime overheads.
                 We improve the read latency of a 2-bit MLC PCM memory
                 by more than 24\% (and Instructions Per Cycle (IPC) by
                 about 9\%) and energy-delay product by about 20\% for a
                 small lifetime degradation of 8\%, on average.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Jiang:2016:TLH,
  author =       {Chuntao Jiang and Zhibin Yu and Lieven Eeckhout and
                 Hai Jin and Xiaofei Liao and Chengzhong Xu},
  title =        {Two-Level Hybrid Sampled Simulation of Multithreaded
                 Applications},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {39},
  pages =        {39:1--39:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2818353},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {Sampled microarchitectural simulation of
                 single-threaded applications is mature technology for
                 over a decade now. Sampling multithreaded applications,
                 on the other hand, is much more complicated. Not until
                 very recently have researchers proposed solutions for
                 sampled simulation of multithreaded applications.
                 Time-Based Sampling (TBS) samples multithreaded
                 application execution based on time---not instructions
                 as is typically done for single-threaded
                 applications---yielding estimates for a multithreaded
                 application's execution time. In this article, we
                 revisit and analyze previously proposed TBS approaches
                 (periodic and cantor fractal based sampling), and we
                 obtain a number of novel and surprising insights, such
                 as (i) accurately estimating fast-forwarding IPC, that
                 is, performance in-between sampling units, is more
                 important than accurately estimating sample IPC, that
                 is, performance within the sampling units; (ii)
                 fast-forwarding IPC estimation accuracy is determined
                 by both the sampling unit distribution and how to use
                 the sampling units to predict fast-forwarding IPC; and
                 (iii) cantor sampling is more accurate at small
                 sampling unit sizes, whereas periodic is more accurate
                 at large sampling unit sizes. These insights lead to
                 the development of Two-level Hybrid Sampling (THS), a
                 novel sampling methodology for multithreaded
                 applications that combines periodic sampling's accuracy
                 at large time scales (i.e., uniformly selecting
                 coarse-grain sampling units across the entire program
                 execution) with cantor sampling's accuracy at small
                 time scales (i.e., the ability to accurately predict
                 fast-forwarding IPC in-between small sampling units).
                 The clustered occurrence of small sampling units under
                 cantor sampling also enables shortened warmup and thus
                 enhanced simulation speed. Overall, THS achieves an
                 average absolute execution time prediction error of 4\%
                 while yielding an average simulation speedup of 40 $
                 \times $ compared to detailed simulation, which is both
                 more accurate and faster than the current
                 state-of-the-art. Case studies illustrate THS' ability
                 to accurately predict relative performance differences
                 across the design space.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dsouza:2016:IMS,
  author =       {Sandeep D'souza and Soumya J. and Santanu
                 Chattopadhyay},
  title =        {Integrated Mapping and Synthesis Techniques for
                 Network-on-Chip Topologies with Express Channels},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {40},
  pages =        {40:1--40:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2831233},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {The addition of express channels to a traditional mesh
                 network-on-chip (NoC) has emerged as a viable solution
                 to solve the problem of high latency. In this article,
                 we address the problem of integrated mapping and
                 synthesis for express channel--based mesh NoC
                 topologies. An integer linear programming--based
                 formulation has been presented for the mapping problem
                 followed by a constructive heuristic for simultaneous
                 application mapping and synthesis for an express
                 channel--based NoC. The static and dynamic simulation
                 results indicate that the obtained mappings lead to
                 significant reduction in both average packet delay and
                 network energy consumption. The obtained synthesized
                 topologies were also found to be much more power
                 efficient compared to conventional express channel
                 topologies.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Chasapis:2016:PEI,
  author =       {Dimitrios Chasapis and Marc Casas and Miquel
                 Moret{\'o} and Raul Vidal and Eduard Ayguad{\'e} and
                 Jes{\'u}s Labarta and Mateo Valero},
  title =        {{PARSECSs}: Evaluating the Impact of Task Parallelism
                 in the {PARSEC} Benchmark Suite},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {41},
  pages =        {41:1--41:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2829952},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {In this work, we show how parallel applications can be
                 implemented efficiently using task parallelism. We also
                 evaluate the benefits of such parallel paradigm with
                 respect to other approaches. We use the PARSEC
                 benchmark suite as our test bed, which includes
                 applications representative of a wide range of domains
                 from HPC to desktop and server applications. We adopt
                 different parallelization techniques, tailored to the
                 needs of each application, to fully exploit the
                 task-based model. Our evaluation shows that task
                 parallelism achieves better performance than
                 thread-based parallelization models, such as Pthreads.
                 Our experimental results show that we can obtain
                 scalability improvements up to 42\% on a 16-core system
                 and code size reductions up to 81\%. Such reductions
                 are achieved by removing from the source code
                 application specific schedulers or thread pooling
                 systems and transferring these responsibilities to the
                 runtime system software.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gaspar:2016:FAG,
  author =       {Francisco Gaspar and Luis Tani{\c{c}}a and Pedro
                 Tom{\'a}s and Aleksandar Ilic and Leonel Sousa},
  title =        {A Framework for Application-Guided Task Management on
                 Heterogeneous Embedded Systems},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {42},
  pages =        {42:1--42:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2835177},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {In this article, we propose a general framework for
                 fine-grain application-aware task management in
                 heterogeneous embedded platforms, which allows
                 integration of different mechanisms for an efficient
                 resource utilization, frequency scaling, and task
                 migration. The proposed framework incorporates several
                 components for accurate runtime monitoring by relying
                 on the OS facilities and performance self-reporting for
                 parallel and iterative applications. The framework
                 efficiency is experimentally evaluated on a real
                 hardware platform, where significant power and energy
                 savings are attained for SPEC CPU2006 and PARSEC
                 benchmarks, by guiding frequency scaling and
                 intercluster migrations according to the runtime
                 application behavior and predefined performance
                 targets.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ardestani:2016:MMV,
  author =       "Ehsan K. Ardestani and Rafael Trapani Possignolo and
                 Jose Luis Briz and Jose Renau",
  title =        "Managing Mismatches in Voltage Stacking with
                 {CoreUnfolding}",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2835178",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Five percent to 25\% of power could be wasted before
                 it is delivered to the computational resources on a
                 die, due to inefficiencies of voltage regulators and
                 resistive loss. The power delivery could benefit if, at
                 the same power, the delivered voltage increases and the
                 current decreases. This article presents CoreUnfolding,
                 a technique that leverages voltage Stacking to improve
                 power delivery efficiency. Our experiments show that
                 about 10\% system-wide power can be saved, the voltage
                 regulator area can be reduced by 30\%, di / dt improves
                 49\%, and the power pin count is reduced by 40\%
                 ($ \approx $ 20\% reduction in packaging costs), with
                 negligible performance degradation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nair:2016:FFC,
  author =       {Prashant J. Nair and David A. Roberts and Moinuddin K.
                 Qureshi},
  title =        {{FaultSim}: a Fast, Configurable Memory-Reliability
                 Simulator for Conventional and {$3$D}-Stacked Systems},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {44},
  pages =        {44:1--44:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2831234},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {As memory systems scale, maintaining their Reliability
                 Availability and Serviceability (RAS) is becoming more
                 complex. To make matters worse, recent studies of DRAM
                 failures in data centers and supercomputer environments
                 have highlighted that large-granularity failures are
                 common in DRAM chips. Furthermore, the move toward
                 3D-stacked memories can make the system vulnerable to
                 newer failure modes, such as those occurring from
                 faults in Through-Silicon Vias (TSVs). To architect
                 future systems and to use emerging technology, system
                 designers will need to employ strong error correction
                 and repair techniques. Unfortunately, evaluating the
                 relative effectiveness of these reliability mechanisms
                 is often difficult and is traditionally done with
                 analytical models, which are both error prone and
                 time-consuming to develop. To this end, this article
                 proposes FaultSim, a fast configurable
                 memory-reliability simulation tool for 2D and
                 3D-stacked memory systems. FaultSim employs Monte Carlo
                 simulations, which are driven by real-world failure
                 statistics. We discuss the novel algorithms and data
                 structures used in FaultSim to accelerate the
                 evaluation of different resilience schemes. We
                 implement BCH-1 (SECDED) and ChipKill codes using
                 FaultSim and validate against an analytical model.
                 FaultSim implements BCH-1 and ChipKill codes with a
                 deviation of only 0.032\% and 8.41\% from the
                 analytical model. FaultSim can simulate 1 million Monte
                 Carlo trials (each for a period of 7 years) of BCH-1
                 and ChipKill codes in only 34 seconds and 33 seconds,
                 respectively.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lee:2016:ACS,
  author =       {Byeongcheol Lee},
  title =        {Adaptive Correction of Sampling Bias in Dynamic Call
                 Graphs},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {45},
  pages =        {45:1--45:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2840806},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {This article introduces a practical low-overhead
                 adaptive technique of correcting sampling bias in
                 profiling dynamic call graphs. Timer-based sampling
                 keeps the overhead low but sampling bias lowers the
                 accuracy when either observable call events or sampling
                 actions are not equally spaced in time. To mitigate
                 sampling bias, our adaptive correction technique
                 weights each sample by monitoring time-varying spacing
                 of call events and sampling actions. We implemented and
                 evaluated our adaptive correction technique in Jikes
                 RVM, a high-performance virtual machine. In our
                 empirical evaluation, our technique significantly
                 improved the sampling accuracy without measurable
                 overhead and resulted in effective feedback directed
                 inlining.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Mcpherson:2016:FPL,
  author =       {Andrew J. Mcpherson and Vijay Nagarajan and Susmit
                 Sarkar and Marcelo Cintra},
  title =        {Fence Placement for Legacy Data-Race-Free Programs via
                 Synchronization Read Detection},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {46},
  pages =        {46:1--46:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2835179},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {Shared-memory programmers traditionally assumed
                 Sequential Consistency (SC), but modern systems have
                 relaxed memory consistency. Here, the trend in
                 languages is toward Data-Race-Free (DRF) models, where,
                 assuming annotated synchronizations and the program
                 being well-synchronized by those synchronizations, the
                 hardware and compiler guarantee SC. However, legacy
                 programs lack annotations, so even well-synchronized
                 (legacy DRF) programs aren't recognized. For legacy DRF
                 programs, we can significantly prune the set of memory
                 orderings determined by automated fence placement by
                 automatically identifying synchronization reads. We
                 prove our rules for identifying them conservatively,
                 implement them within LLVM, and observe a 30\% average
                 performance improvement over previous techniques.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hong:2016:OCT,
  author =       {Ding-Yong Hong and Chun-Chen Hsu and Cheng-Yi Chou and
                 Wei-Chung Hsu and Pangfeng Liu and Jan-Jan Wu},
  title =        {Optimizing Control Transfer and Memory Virtualization
                 in Full System Emulators},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {47},
  pages =        {47:1--47:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2837027},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {Full system emulators provide virtual platforms for
                 several important applications, such as kernel and
                 system software development, co-verification with cycle
                 accurate CPU simulators, or application development for
                 hardware still in development. Full system emulators
                 usually use dynamic binary translation to obtain
                 reasonable performance. This paper focuses on
                 optimizing the performance of full system emulators.
                 First, we optimize performance by enabling classic
                 control transfer optimizations of dynamic binary
                 translation in full system emulation, such as indirect
                 branch target caching and block chaining. Second, we
                 improve the performance of memory virtualization of
                 cross-ISA virtual machines by improving the efficiency
                 of the software translation lookaside buffer (software
                 TLB). We implement our optimizations on QEMU, an
                 industrial-strength full system emulator, along with
                 the Android emulator. Experimental results show that
                 our optimizations achieve an average speedup of 1.98X
                 for ARM-to-X86-64 QEMU running SPEC CINT2006 benchmarks
                 with train inputs. Our optimizations also achieve an
                 average speedup of 1.44X and 1.40X for IA32-to-X86-64
                 QEMU and AArch64-to-X86-64 QEMU on SPEC CINT2006. We
                 use a set of real applications downloaded from Google
                 Play as benchmarks for the Android emulator.
                 Experimental results show that our optimizations
                 achieve an average speedup of 1.43X for the Android
                 emulator running these applications.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sukumaran-Rajam:2016:PMN,
  author =       {Aravind Sukumaran-Rajam and Philippe Clauss},
  title =        {The Polyhedral Model of Nonlinear Loops},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {48},
  pages =        {48:1--48:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2838734},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {Runtime code optimization and speculative execution
                 are becoming increasingly prominent to leverage
                 performance in the current multi- and many-core era.
                 However, a wider and more efficient use of such
                 techniques is mainly hampered by the prohibitive time
                 overhead induced by centralized data race detection,
                 dynamic code behavior modeling, and code generation.
                 Most of the existing Thread Level Speculation (TLS)
                 systems rely on naively slicing the target loops into
                 chunks and trying to execute the chunks in parallel
                 with the help of a centralized performance-penalizing
                 verification module that takes care of data races. Due
                 to the lack of a data dependence model, these
                 speculative systems are not capable of doing advanced
                 transformations, and, more importantly, the chances of
                 rollback are high. The polyhedral model is a well-known
                 mathematical model to analyze and optimize loop nests.
                 The current state-of-art tools limit the application of
                 the polyhedral model to static control codes. Thus,
                 none of these tools can generally handle codes with
                 while loops, indirect memory accesses, or pointers.
                 Apollo (Automatic POLyhedral Loop Optimizer) is a
                 framework that goes one step beyond and applies the
                 polyhedral model dynamically by using TLS. Apollo can
                 predict, at runtime, whether the codes are behaving
                 linearly or not, and it applies polyhedral
                 transformations on-the-fly. This article presents a
                 novel system that enables Apollo to handle codes whose
                 memory accesses and loop bounds are not necessarily
                 linear. More generally, this approach expands the
                 applicability of the polyhedral model at runtime to a
                 wider class of codes. Plugging together both linear and
                 nonlinear accesses to the dependence prediction model
                 enables the application of polyhedral loop optimizing
                 transformations even for nonlinear code kernels while
                 also allowing a low-cost speculation verification.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nair:2016:CEP,
  author =       {Prashant J. Nair and David A. Roberts and Moinuddin K.
                 Qureshi},
  title =        {Citadel: Efficiently Protecting Stacked Memory from
                 {TSV} and Large Granularity Failures},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {12},
  number =       {4},
  articleno =    {49},
  pages =        {49:1--49:??},
  month =        jan,
  year =         {2016},
  DOI =          {https://doi.org/10.1145/2840807},
  CODEN =        {????},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  abstract =     {Stacked memory modules are likely to be tightly
                 integrated with the processor. It is vital that these
                 memory modules operate reliably, as memory failure can
                 require the replacement of the entire socket. To make
                 matters worse, stacked memory designs are susceptible
                 to newer failure modes (e.g., due to faulty
                 through-silicon vias, or TSVs) that can cause large
                 portions of memory, such as a bank, to become faulty.
                 To avoid data loss from large-granularity failures, the
                 memory system may use symbol-based codes that stripe
                 the data for a cache line across several banks (or
                 channels). Unfortunately, such data-striping reduces
                 memory-level parallelism, causing significant slowdown
                 and higher power consumption. This article proposes
                 Citadel, a robust memory architecture that allows the
                 memory system to retain each cache line within one
                 bank. By retaining cache lines within banks, Citadel
                 enables a high-performance and low-power memory system
                 and also efficiently protects the stacked memory system
                 from large-granularity failures. Citadel consists of
                 three components; TSV-Swap, which can tolerate both
                 faulty data-TSVs and faulty address-TSVs;
                 Tri-Dimensional Parity (3DP), which can tolerate column
                 failures, row failures, and bank failures; and Dynamic
                 Dual-Granularity Sparing (DDS), which can mitigate
                 permanent faults by dynamically sparing faulty memory
                 regions either at a row granularity or at a bank
                 granularity. Our evaluations with real-world data for
                 DRAM failures show that Citadel provides performance
                 and power similar to maintaining the entire cache line
                 in the same bank, and yet provides 700 $ \times $
                 higher reliability than ChipKill-like ECC codes.},
  bibdate =      {Tue Feb 16 15:36:38 MST 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Anderson:2016:AVI,
  author =       "Andrew Anderson and Avinash Malik and David Gregg",
  title =        "Automatic Vectorization of Interleaved Data
                 Revisited",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "50:1--50:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2838735",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Automatically exploiting short vector instruction
                 sets (SSE, AVX, NEON) is a critically important task
                 for optimizing compilers. Vector instructions typically
                 work best on data that is contiguous in memory, and
                 operating on non-contiguous data requires additional
                 work to gather and scatter the data. There are several
                 varieties of non-contiguous access, including
                 interleaved data access. An existing approach used by
                 GCC generates extremely efficient code for loops with
                 power-of-2 interleaving factors (strides). In this
                 paper we propose a generalization of this approach that
                 produces similar code for any compile-time constant
                 interleaving factor. In addition, we propose several
                 novel program transformations, which were made possible
                 by our generalized representation of the problem.
                 Experiments show that our approach achieves significant
                 speedups for both power-of-2 and non--power-of-2
                 interleaving factors. Our vectorization approach
                 results in mean speedups over scalar code of 1.77x on
                 Intel SSE and 2.53x on Intel AVX2 in real-world
                 benchmarking on a selection of BLAS Level 1 routines.
                 On the same benchmark programs, GCC 5.0 achieves mean
                 improvements of 1.43x on Intel SSE and 1.30x on Intel
                 AVX2. In synthetic benchmarking on Intel SSE, our
                 maximum improvement on data movement is over 4x for
                 gathering operations and over 6x for scattering
                 operations versus scalar code.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhao:2016:FMR,
  author =       "Lihang Zhao and Lizhong Chen and Woojin Choi and
                 Jeffrey Draper",
  title =        "A Filtering Mechanism to Reduce Network Bandwidth
                 Utilization of Transaction Execution",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "51:1--51:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2837028",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Hardware Transactional Memory (HTM) relies heavily on
                 the on-chip network for intertransaction communication.
                 However, the network bandwidth utilization of
                 transactions has been largely neglected in HTM designs.
                 In this work, we propose a cost model to analyze
                 network bandwidth in transaction execution. The cost
                 model identifies a set of key factors that can be
                 optimized through system design to reduce the
                 communication cost of HTM. Based on the model and
                 network traffic characterization of a representative
                 HTM design, we identify a huge source of superfluous
                 traffic due to failed requests in transaction
                 conflicts. As observed in a spectrum of workloads, 39\%
                 of the transactional requests fail due to conflicts,
                 which renders 58\% of the transactional network traffic
                 futile. To combat this pathology, a novel in-network
                 filtering mechanism is proposed. The on-chip router is
                 augmented to predict conflicts among transactions and
                 proactively filter out those requests that have a high
                 probability to fail. Experimental results show the
                 proposed mechanism reduces total network traffic by
                 24\% on average for a set of high-contention TM
                 applications, thereby reducing energy consumption by an
                 average of 24\%. Meanwhile, the contention in the
                 coherence directory is reduced by 68\%, on average.
                 These improvements are achieved with only 5\% area
                 added to a conventional on-chip router design.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Serres:2016:EPP,
  author =       "Olivier Serres and Abdullah Kayi and Ahmad Anbar and
                 Tarek El-Ghazawi",
  title =        "Enabling {PGAS} Productivity with Hardware Support for
                 Shared Address Mapping: a {UPC} Case Study",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842686",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Due to its rich memory model, the partitioned global
                 address space (PGAS) parallel programming model strikes
                 a balance between locality-awareness and the ease of
                 use of the global address space model. Although
                 locality-awareness can lead to high performance,
                 supporting the PGAS memory model is associated with
                 penalties that can hinder PGAS's potential for
                 scalability and speed of execution. This is because
                 mapping the PGAS memory model to the underlying system
                 requires a mapping process that is done in software,
                 thereby introducing substantial overhead for shared
                 accesses even when they are local. Compiler
                 optimizations have not been sufficient to offset this
                 overhead. On the other hand, manual code optimizations
                 can help, but this eliminates the productivity edge of
                 PGAS. This article proposes a processor
                 microarchitecture extension that can perform such
                 address mapping in hardware with nearly no performance
                 overhead. These extensions are then availed to
                 compilers through extensions to the processor
                 instructions. Thus, the need for manual optimizations
                 is eliminated and the productivity of PGAS languages is
                 unleashed. Using Unified Parallel C (UPC), a PGAS
                 language, we present a case study of a prototype
                 compiler and architecture support. Two different
                 implementations of the system were realized. The first
                 uses a full-system simulator, gem5, which evaluates the
                 overall performance gain of the new hardware support.
                 The second uses an FPGA Leon3 soft-core processor to
                 verify implementation feasibility and to parameterize
                 the cost of the new hardware. The new instructions show
                 promising results on all tested codes, including the
                 NAS Parallel Benchmark kernels in UPC. Performance
                 improvements of up to 5.5 $ \times $ for unmodified
                 codes, sometimes surpassing hand-optimized performance,
                 were demonstrated. We also show that our four-core FPGA
                 prototype requires less than 2.4\% of the overall
                 chip's area.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cattaneo:2016:HAI,
  author =       "Riccardo Cattaneo and Giuseppe Natale and Carlo
                 Sicignano and Donatella Sciuto and Marco Domenico
                 Santambrogio",
  title =        "On How to Accelerate Iterative Stencil Loops: a
                 Scalable Streaming-Based Approach",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842615",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In high-performance systems, stencil computations play
                 a crucial role as they appear in a variety of different
                 fields of application, ranging from partial
                 differential equation solving, to computer simulation
                 of particles' interaction, to image processing and
                 computer vision. The computationally intensive nature
                 of those algorithms created the need for solutions to
                 efficiently implement them in order to save both
                 execution time and energy. This, in combination with
                 their regular structure, has justified their widespread
                 study and the proposal of largely different approaches
                 to their optimization. However, most of these works are
                 focused on aggressive compile time optimization, cache
                 locality optimization, and parallelism extraction for
                 the multicore/multiprocessor domain, while fewer works
                 are focused on the exploitation of custom architectures
                 to further exploit the regular structure of Iterative
                 Stencil Loops (ISLs), specifically with the goal of
                 improving power efficiency. This work introduces a
                 methodology to systematically design power-efficient
                 hardware accelerators for the optimal execution of ISL
                 algorithms on Field-programmable Gate Arrays (FPGAs).
                 As part of the methodology, we introduce the notion of
                 Streaming Stencil Time-step (SST), a streaming-based
                 architecture capable of achieving both low resource
                 usage and efficient data reuse thanks to an optimal
                 data buffering strategy, and we introduce a technique
                 called SSTs queuing that is capable of delivering a
                 pseudolinear execution time speedup with constant
                 bandwidth. The methodology has been validated on
                 significant benchmarks on a Virtex-7 FPGA using the
                 Xilinx Vivado suite. Results demonstrate how the
                 efficient usage of the on-chip memory resources
                 realized by an SST allows one to treat problem sizes
                 whose implementation would otherwise not be possible
                 via direct synthesis of the original, unmanipulated
                 code via High-Level Synthesis (HLS). We also show how
                 the SSTs queuing effectively ensures a pseudolinear
                 throughput speedup while consuming constant off-chip
                 bandwidth.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{C:2016:FGM,
  author =       "Unnikrishnan C and Rupesh Nasre and Y. N. Srikant",
  title =        "{Falcon}: a Graph Manipulation Language for
                 Heterogeneous Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842618",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Graph algorithms have been shown to possess enough
                 parallelism to keep several computing resources
                 busy---even hundreds of cores on a GPU. Unfortunately,
                 tuning their implementation for efficient execution on
                 a particular hardware configuration of heterogeneous
                 systems consisting of multicore CPUs and GPUs is
                 challenging, time consuming, and error prone. To
                 address these issues, we propose a domain-specific
                 language (DSL), Falcon, for implementing graph
                 algorithms that (i) abstracts the hardware, (ii)
                 provides constructs to write explicitly parallel
                 programs at a higher level, and (iii) can work with
                 general algorithms that may change the graph structure
                 (morph algorithms). We illustrate the usage of our DSL
                 to implement local computation algorithms (that do not
                 change the graph structure) and morph algorithms such
                 as Delaunay mesh refinement, survey propagation, and
                 dynamic SSSP on GPU and multicore CPUs. Using a set of
                 benchmark graphs, we illustrate that the generated code
                 performs close to the state-of-the-art hand-tuned
                 implementations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  remark =       "Yes, the first author name is correct as given:
                 Unnikrishnan C.",
}

@Article{Kalayappan:2016:FRT,
  author =       "Rajshekar Kalayappan and Smruti R. Sarangi",
  title =        "{FluidCheck}: a Redundant Threading-Based Approach for
                 Reliable Execution in Manycore Processors",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842620",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Soft errors have become a serious cause of concern
                 with reducing feature sizes. The ability to accommodate
                 complex, Simultaneous Multithreading (SMT) cores on a
                 single chip presents a unique opportunity to achieve
                 reliable execution, safe from soft errors, with low
                 performance penalties. In this context, we present
                 FluidCheck, a checker architecture that allows highly
                 flexible assignment and migration of checking duties
                 across cores. In this article, we present a mechanism
                 to dynamically use the resources of SMT cores for
                 checking the results of other threads, and propose a
                 variety of heuristics for migration of such checker
                 threads across cores. Secondly, to make the process of
                 checking more efficient, we propose a set of
                 architectural enhancements that reduce power
                 consumption, decrease the length of the critical path,
                 and reduce the load on the Network-on-Chip (NoC). Based
                 on our observations, we design a 16-core system for
                 running SPEC2006-based bag-of-tasks applications. Our
                 experiments demonstrate that fully reliable execution
                 can be attained with a mere 27\% slowdown, surpassing
                 traditional redundant threading-based techniques by
                 roughly 42\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Elwell:2016:RMP,
  author =       "Jesse Elwell and Ryan Riley and Nael Abu-Ghazaleh and
                 Dmitry Ponomarev and Iliano Cervesato",
  title =        "Rethinking Memory Permissions for Protection Against
                 Cross-Layer Attacks",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "56:1--56:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842621",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The inclusive permissions structure (e.g., the Intel
                 ring model) of modern commodity CPUs provides
                 privileged system software layers with arbitrary
                 permissions to access and modify client processes,
                 allowing them to manage these clients and the system
                 resources efficiently. Unfortunately, these inclusive
                 permissions allow a compromised high-privileged
                 software layer to perform arbitrary malicious
                 activities. In this article, our goal is to prevent
                 attacks that cross system layers while maintaining the
                 abilities of system software to manage the system and
                 allocate resources. In particular, we present a
                 hardware-supported page permission framework for
                 physical pages that is based on the concept of
                 noninclusive sets of memory permissions for different
                 layers of system software (such as hypervisors,
                 operating systems, and user-level applications).
                 Instead of viewing privilege levels as an ordered
                 hierarchy with each successive level being more
                 privileged, we view them as distinct levels each with
                 its own set of permissions. In order to enable system
                 software to manage client processes, we define a set of
                 legal permission transitions that support resource
                 allocation but preserve security. We show that the
                 model prevents a range of recent attacks. We also show
                 that it can be implemented with negligible performance
                 overhead (both at load time and at runtime), low
                 hardware complexity, and minimal changes to the
                 commodity OS and hypervisor code.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Morad:2016:RGS,
  author =       "Amir Morad and Leonid Yavits and Shahar Kvatinsky and
                 Ran Ginosar",
  title =        "Resistive {GP-SIMD} Processing-In-Memory",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "57:1--57:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2845084",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GP-SIMD, a novel hybrid general-purpose SIMD
                 architecture, addresses the challenge of data
                 synchronization by in-memory computing, through
                 combining data storage and massive parallel processing.
                 In this article, we explore a resistive implementation
                 of the GP-SIMD architecture. In resistive GP-SIMD, a
                 novel resistive row and column addressable 4F$^2$
                 crossbar is utilized, replacing the modified CMOS
                 190F$^2$ SRAM storage previously proposed for GP-SIMD
                 architecture. The use of the resistive crossbar allows
                 scaling the GP-SIMD from a few million to a few hundred
                 million processing units on a single silicon die.
                 The performance, power consumption and power efficiency
                 of a resistive GP-SIMD are compared with the CMOS
                 version. We find that PiM architectures and,
                 specifically, GP-SIMD benefit more than other many-core
                 architectures from using resistive memory. A framework
                 for in-place arithmetic operation on a single
                 multivalued resistive cell is explored, demonstrating a
                 potential to become a building block for
                 next-generation PiM architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2016:IIB,
  author =       "Yaohua Wang and Dong Wang and Shuming Chen and Zonglin
                 Liu and Shenggang Chen and Xiaowen Chen and Xu Zhou",
  title =        "Iteration Interleaving--Based {SIMD} Lane Partition",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "58:1--58:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2847253",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The efficacy of single instruction, multiple data
                 (SIMD) architectures is limited when handling divergent
                 control flows. This circumstance results in SIMD
                 fragments using only a subset of the available lanes.
                 We propose an iteration interleaving--based SIMD lane
                 partition (IISLP) architecture that interleaves the
                 execution of consecutive iterations and dynamically
                 partitions SIMD lanes into branch paths with comparable
                 execution time. The benefits are twofold: SIMD
                 fragments under divergent branches can execute in
                 parallel, and the pathology of fragment starvation can
                 also be well eliminated. Our experiments show that
                 IISLP doubles the performance of a baseline mechanism
                 and provides a speedup of 28\% versus instruction
                 shuffle.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Aijo:2016:ILP,
  author =       "Tomi {\"A}ij{\"o} and Pekka J{\"a}{\"a}skel{\"a}inen
                 and Tapio Elomaa and Heikki Kultala and Jarmo Takala",
  title =        "Integer Linear Programming-Based Scheduling for
                 Transport Triggered Architectures",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "59:1--59:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2845082",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Static multi-issue machines, such as traditional Very
                 Long Instruction Word (VLIW) architectures, move
                 complexity from the hardware to the compiler. This is
                 motivated by the ability to support high degrees of
                 instruction-level parallelism without requiring
                 complicated scheduling logic in the processor hardware.
                 The simpler-control hardware results in reduced area
                 and power consumption, but leads to a challenge of
                 engineering a compiler with good code-generation
                 quality. Transport triggered architectures (TTA), and
                 other so-called exposed datapath architectures, take
                 the compiler-oriented philosophy even further by
                 pushing more details of the datapath under software
                 control. The main benefit of this is the reduced
                 register file pressure, with a drawback of adding even
                 more complexity to the compiler side. In this article,
                 we propose an Integer Linear Programming (ILP)-based
                 instruction scheduling model for TTAs. The model
                 describes the architecture characteristics, the
                 particular processor resource constraints, and the
                 operation dependencies of the scheduled program. The
                 model is validated and measured by compiling
                 application kernels to various TTAs with a different
                 number of datapath components and connectivity. In the
                 best case, the cycle count is reduced to 52\% when
                 compared to a heuristic scheduler. In addition to
                 producing shorter schedules, the number of register
                 accesses in the compiled programs is generally notably
                 less than those with the heuristic scheduler; in the
                 best case, the ILP scheduler reduced the number of
                 register file reads to 33\% of the heuristic results
                 and register file writes to 18\%. On the other hand, as
                 expected, the ILP-based scheduler uses distinctly more
                 time to produce a schedule than the heuristic
                 scheduler, but the compilation time is within tolerable
                 limits for production-code generation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Liu:2016:SEA,
  author =       "Qixiao Liu and Miquel Moret{\'o} and Jaume Abella and
                 Francisco J. Cazorla and Daniel A. Jim{\'e}nez and
                 Mateo Valero",
  title =        "Sensible Energy Accounting with Abstract Metering for
                 Multicore Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "60:1--60:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842616",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Chip multicore processors (CMPs) are the preferred
                 processing platform across different domains such as
                 data centers, real-time systems, and mobile devices. In
                 all those domains, energy is arguably the most
                 expensive resource in a computing system. Accurately
                 quantifying energy usage in a multicore environment
                 presents a challenge as well as an opportunity for
                 optimization. Standard metering approaches are not
                 capable of delivering consistent results with shared
                 resources, since the same task with the same inputs may
                 have different energy consumption based on the mix of
                 co-running tasks. However, it is reasonable for
                 data-center operators to charge on the basis of
                 estimated energy usage rather than time since energy is
                 more correlated with their actual cost. This article
                 introduces the concept of Sensible Energy Accounting
                 (SEA). For a task running in a multicore system, SEA
                 accurately estimates the energy the task would have
                 consumed running in isolation with a given fraction of
                 the CMP shared resources. We explain the potential
                 benefits of SEA in different domains and describe two
                 hardware techniques to implement it for a shared
                 last-level cache and on-core resources in SMT
                 processors. Moreover, with SEA, an energy-aware
                 scheduler can find a highly efficient on-chip resource
                 assignment, reducing by up to 39\% the total processor
                 energy for a 4-core system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 12, number 4, article 61 (January 2016).
%%% journal and acknowledgement hold the string macros j-TACO and
%%% ack-nhfb, which must be defined outside this entry.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Zhou:2016:SAC,
  author =       "Miao Zhou and Yu Du and Bruce Childers and Daniel
                 Mosse and Rami Melhem",
  title =        "Symmetry-Agnostic Coordinated Management of the Memory
                 Hierarchy in Multicore Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "61:1--61:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2847254",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In a multicore system, many applications share the
                 last-level cache (LLC) and memory bandwidth. These
                 resources need to be carefully managed in a coordinated
                 way to maximize performance. DRAM is still the
                 technology of choice in most systems. However, as
                 traditional DRAM technology faces energy, reliability,
                 and scalability challenges, nonvolatile memory (NVM)
                 technologies are gaining traction. While DRAM is
                 read/write symmetric (a read operation has comparable
                 latency and energy consumption as a write operation),
                 many NVM technologies (such as Phase-Change Memory,
                 PCM) experience read/write asymmetry: write operations
                 are typically much slower and more power hungry than
                 read operations. Whether the memory's characteristics
                 are symmetric or asymmetric influences the way shared
                 resources are managed. We propose two symmetry-agnostic
                 schemes to manage a shared LLC through way partitioning
                 and memory through bandwidth allocation. The proposals
                 work well for both symmetric and asymmetric memory.
                 First, an exhaustive search is proposed to find the
                 best combination of a cache way partition and bandwidth
                 allocation. Second, an approximate scheme, derived from
                 a theoretical model, is proposed without the overhead
                 of exhaustive search. Simulation results show that the
                 approximate scheme improves weighted speedup by at
                 least 14\% on average (regardless of the memory
                 symmetry) over a state-of-the-art way partitioning and
                 memory bandwidth allocation. Simulation results also
                 show that the approximate scheme achieves comparable
                 weighted speedup as a state-of-the-art multiple
                 resource management scheme, XChange, for symmetric
                 memory, and outperforms it by an average of 10\% for
                 asymmetric memory.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 12, number 4, article 62 (January 2016).
%%% The braces around RFVP in the title keep the acronym in upper case
%%% under bibliography styles that downcase titles.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Yazdanbakhsh:2016:RRF,
  author =       "Amir Yazdanbakhsh and Gennady Pekhimenko and Bradley
                 Thwaites and Hadi Esmaeilzadeh and Onur Mutlu and Todd
                 C. Mowry",
  title =        "{RFVP}: Rollback-Free Value Prediction with
                 Safe-to-Approximate Loads",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "62:1--62:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2836168",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article aims to tackle two fundamental memory
                 bottlenecks: limited off-chip bandwidth (bandwidth
                 wall) and long access latency (memory wall). To achieve
                 this goal, our approach exploits the inherent error
                 resilience of a wide range of applications. We
                 introduce an approximation technique, called
                 Rollback-Free Value Prediction (RFVP). When certain
                 safe-to-approximate load operations miss in the cache,
                 RFVP predicts the requested values. However, RFVP does
                 not check for or recover from load-value
                 mispredictions, hence, avoiding the high cost of
                 pipeline flushes and re-executions. RFVP mitigates the
                 memory wall by enabling the execution to continue
                 without stalling for long-latency memory accesses. To
                 mitigate the bandwidth wall, RFVP drops a fraction of
                 load requests that miss in the cache after predicting
                 their values. Dropping requests reduces memory
                 bandwidth contention by removing them from the system.
                 The drop rate is a knob to control the trade-off
                 between performance/energy efficiency and output
                 quality. Our extensive evaluations show that RFVP, when
                 used in GPUs, yields significant performance
                 improvement and energy reduction for a wide range of
                 quality-loss levels. We also evaluate RFVP's latency
                 benefits for a single core CPU. The results show
                 performance improvement and energy reduction for a wide
                 variety of applications with less than 1\% loss in
                 quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 12, number 4, article 63 (January 2016).
%%% In the title, {$3$D} sets the digit in math mode and braces the
%%% group so that bibliography styles leave the capital D alone.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Lee:2016:SML,
  author =       "Donghyuk Lee and Saugata Ghose and Gennady Pekhimenko
                 and Samira Khan and Onur Mutlu",
  title =        "Simultaneous Multi-Layer Access: Improving
                 {$3$D}-Stacked Memory Bandwidth at Low Cost",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "63:1--63:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2832911",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "3D-stacked DRAM alleviates the limited memory
                 bandwidth bottleneck that exists in modern systems by
                 leveraging through silicon vias (TSVs) to deliver
                 higher external memory channel bandwidth. Today's
                 systems, however, cannot fully utilize the higher
                 bandwidth offered by TSVs, due to the limited internal
                 bandwidth within each layer of the 3D-stacked DRAM. We
                 identify that the bottleneck to enabling higher
                 bandwidth in 3D-stacked DRAM is now the global bitline
                 interface, the connection between the DRAM row buffer
                 and the peripheral IO circuits. The global bitline
                 interface consists of a limited and expensive set of
                 wires and structures, called global bitlines and global
                 sense amplifiers, whose high cost makes it difficult to
                 simply scale up the bandwidth of the interface within a
                 single DRAM layer in the 3D stack. We alleviate this
                 bandwidth bottleneck by exploiting the observation that
                 several global bitline interfaces already exist across
                 the multiple DRAM layers in current 3D-stacked designs,
                 but only a fraction of them are enabled at the same
                 time. We propose a new 3D-stacked DRAM architecture,
                 called Simultaneous Multi-Layer Access (SMLA), which
                 increases the internal DRAM bandwidth by accessing
                 multiple DRAM layers concurrently, thus making much
                 greater use of the bandwidth that the TSVs offer. To
                 avoid channel contention, the DRAM layers must
                 coordinate with each other when simultaneously
                 transferring data. We propose two approaches to
                 coordination, both of which deliver four times the
                 bandwidth for a four-layer DRAM, over a baseline that
                 accesses only one layer at a time. Our first approach,
                 Dedicated-IO, statically partitions the TSVs by
                 assigning each layer to a dedicated set of TSVs that
                 operate at a higher frequency. Unfortunately,
                 Dedicated-IO requires a nonuniform design for each
                 layer (increasing manufacturing costs), and its DRAM
                 energy consumption scales linearly with the number of
                 layers. Our second approach, Cascaded-IO, solves both
                 issues by instead time multiplexing all of the TSVs
                 across layers. Cascaded-IO reduces DRAM energy
                 consumption by lowering the operating frequency of
                 higher layers. Our evaluations show that SMLA provides
                 significant performance improvement and energy
                 reduction across a variety of workloads (55\%/18\% on
                 average for multiprogrammed workloads, respectively)
                 over a baseline 3D-stacked DRAM, with low overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 12, number 4, article 64 (January 2016).
%%% The braces around JavaScript and HTML5 in the title protect their
%%% capitalization under bibliography styles that downcase titles.
%%% bibsource also names java2010.bib, so this record is presumably
%%% shared with that bibliography; keep the two in step when editing.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Na:2016:JPC,
  author =       "Yeoul Na and Seon Wook Kim and Youngsun Han",
  title =        "{JavaScript} Parallelizing Compiler for Exploiting
                 Parallelism from Data-Parallel {HTML5} Applications",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "64:1--64:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2846098",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the advent of the HTML5 standard, JavaScript is
                 increasingly processing computationally intensive,
                 data-parallel workloads. Thus, the enhancement of
                 JavaScript performance has been emphasized because the
                 performance gap between JavaScript and native
                 applications is still substantial. Despite this
                 urgency, conventional JavaScript compilers do not
                 exploit much of parallelism even from data-parallel
                 JavaScript applications, despite contemporary mobile
                 devices being equipped with expensive parallel hardware
                 platforms, such as multicore processors and GPGPUs. In
                 this article, we propose an automatically parallelizing
                 JavaScript compiler that targets emerging,
                 data-parallel HTML5 applications by leveraging the
                 mature affine loop analysis of conventional static
                 compilers. We identify that the most critical issues
                 when parallelizing JavaScript with a conventional
                 static analysis are ensuring correct parallelization,
                 minimizing compilation overhead, and conducting
                 low-cost recovery when there is a speculation failure
                 during parallel execution. We propose a mechanism for
                 safely handling the failure at a low cost, based on
                 compiler techniques and the property of idempotence.
                 Our experiment shows that the proposed JavaScript
                 parallelizing compiler detects most affine parallel
                 loops. Also, we achieved a maximum speedup of 3.22
                 times on a quad-core system, while incurring negligible
                 compilation and recovery overheads with various sets of
                 data-parallel HTML5 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 12, number 4, article 65 (January 2016).
%%% The braces around DASH in the title keep the acronym in upper case
%%% under bibliography styles that downcase titles.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Usui:2016:DDA,
  author =       "Hiroyuki Usui and Lavanya Subramanian and Kevin
                 Kai-Wei Chang and Onur Mutlu",
  title =        "{DASH}: Deadline-Aware High-Performance Memory
                 Scheduler for Heterogeneous Systems with Hardware
                 Accelerators",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "65:1--65:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2847255",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern SoCs integrate multiple CPU cores and hardware
                 accelerators (HWAs) that share the same main memory
                 system, causing interference among memory requests from
                 different agents. The result of this interference, if
                 it is not controlled well, is missed deadlines for HWAs
                 and low CPU performance. Few previous works have
                 tackled this problem. State-of-the-art mechanisms
                 designed for CPU-GPU systems strive to meet a target
                 frame rate for GPUs by prioritizing the GPU close to
                 the time when it has to complete a frame. We observe
                 two major problems when such an approach is adapted to
                 a heterogeneous CPU-HWA system. First, HWAs miss
                 deadlines because they are prioritized only when close
                 to their deadlines. Second, such an approach does not
                 consider the diverse memory access characteristics of
                 different applications running on CPUs and HWAs,
                 leading to low performance for latency-sensitive CPU
                 applications and deadline misses for some HWAs,
                 including GPUs. In this article, we propose a
                 Deadline-Aware memory Scheduler for Heterogeneous
                 systems (DASH), which overcomes these problems using
                 three key ideas, with the goal of meeting HWAs'
                 deadlines while providing high CPU performance. First,
                 DASH prioritizes an HWA when it is not on track to meet
                 its deadline any time during a deadline period, instead
                 of prioritizing it only when close to a deadline.
                 Second, DASH prioritizes HWAs over memory-intensive CPU
                 applications based on the observation that
                 memory-intensive applications' performance is not
                 sensitive to memory latency. Third, DASH treats
                 short-deadline HWAs differently as they are more likely
                 to miss their deadlines and schedules their requests
                 based on worst-case memory access time estimates.
                 Extensive evaluations across a wide variety of
                 different workloads and systems show that DASH achieves
                 significantly better CPU performance than the best
                 previous scheduler while always meeting the deadlines
                 for all HWAs, including GPUs, thereby largely improving
                 frame rates.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "65",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 12, number 4, article 66 (January 2016); the pages
%%% field records a complete article-relative range, 66:1 to 66:25.
%%% The braces around WCET in the title keep the acronym in upper case
%%% under bibliography styles that downcase titles.
%%% journal and acknowledgement hold the string macros j-TACO and
%%% ack-nhfb, which must be defined outside this entry.
@Article{Kafshdooz:2016:CTO,
  author =       "Morteza Mohajjel Kafshdooz and Mohammadkazem Taram and
                 Sepehr Assadi and Alireza Ejlali",
  title =        "A Compile-Time Optimization Method for {WCET}
                 Reduction in Real-Time Embedded Systems through Block
                 Formation",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "66:1--66:25",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2845083",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compile-time optimizations play an important role in
                 the efficient design of real-time embedded systems.
                 Usually, compile-time optimizations are designed to
                 reduce average-case execution time (ACET). While ACET
                 is a main concern in high-performance computing
                 systems, in real-time embedded systems, concerns are
                 different and worst-case execution time (WCET) is much
                 more important than ACET. Therefore, WCET reduction is
                 more desirable than ACET reduction in many real-time
                 embedded systems. In this article, we propose a
                 compile-time optimization method aimed at reducing WCET
                 in real-time embedded systems. In the proposed method,
                 based on the predicated execution capability of
                 embedded processors, program code blocks that are in
                 the worst-case paths of the program are merged to
                 increase instruction-level parallelism and opportunity
                 for WCET reduction. The use of predicated execution
                 enables merging code blocks from different worst-case
                 paths that can be very effective in WCET reduction. The
                 experimental results show that the proposed method can
                 reduce WCET by up to 45\% as compared to previous
                 compile-time block formation methods. It is noteworthy
                 that compared to previous works, while the proposed
                 method usually achieves more WCET reduction, it has
                 considerably less negative impact on ACET and code
                 size.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "66",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 1, article 1 (April 2016); the pages field
%%% records a complete article-relative range, 1:1 to 1:22.
%%% The braced phrase in the title keeps its capitalization under
%%% bibliography styles that downcase titles.
%%% In the abstract, EDP denotes the energy-delay product (an
%%% energy/performance metric), not electronic data processing.
@Article{Koukos:2016:BHU,
  author =       "Konstantinos Koukos and Alberto Ros and Erik Hagersten
                 and Stefanos Kaxiras",
  title =        "Building Heterogeneous {Unified Virtual Memories
                 (UVMs)} without the Overhead",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "1:1--1:22",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2889488",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This work proposes a novel scheme to facilitate
                 heterogeneous systems with unified virtual memory.
                 Research proposals implement coherence protocols for
                 sequential consistency (SC) between central processing
                 unit (CPU) cores and between devices. Such mechanisms
                 introduce severe bottlenecks in the system; therefore,
                 we adopt the heterogeneous-race-free (HRF) memory
                 model. The use of HRF simplifies the coherency protocol
                 and the graphics processing unit (GPU) memory
                 management unit (MMU). Our protocol optimizes CPU and
                 GPU demands separately, with the GPU part being simpler
                 while the CPU is more elaborate and latency aware. We
                 achieve an average 45\% speedup and 45\% energy-delay
                 product (EDP) reduction (20\% energy) over the
                 corresponding SC implementation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 1, article 2 (April 2016).
%%% bibsource also names virtual-machines.bib, so this record is
%%% presumably shared with that bibliography; keep the two in step
%%% when editing.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Wang:2016:DMB,
  author =       "Zhigang Wang and Xiaolin Wang and Fang Hou and Yingwei
                 Luo and Zhenlin Wang",
  title =        "Dynamic Memory Balancing for Virtualization",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2851501",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Allocating memory dynamically for virtual machines
                 (VMs) according to their demands provides significant
                 benefits as well as great challenges. Efficient memory
                 resource management requires knowledge of the memory
                 demands of applications or systems at runtime. A widely
                 proposed approach is to construct a miss ratio curve
                 (MRC) for a VM, which not only summarizes the current
                 working set size (WSS) of the VM but also models the
                 relationship between its performance and the target
                 memory allocation size. Unfortunately, the cost of
                 monitoring and maintaining the MRC structures is
                 nontrivial. This article first introduces a low-cost
                 WSS tracking system with effective optimizations on
                 data structures, as well as an efficient mechanism to
                 decrease the frequency of monitoring. We also propose a
                 Memory Balancer (MEB), which dynamically reallocates
                 guest memory based on the predicted WSS. Our
                 experimental results show that our prediction schemes
                 yield a high accuracy of 95.2\% and low overhead of
                 2\%. Furthermore, the overall system throughput can be
                 significantly improved with MEB, which brings a speedup
                 up to 7.4 for two to four VMs and 4.54 for an
                 overcommitted system with 16 VMs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 1, article 3 (April 2016).
%%% The abstract is TeX markup: it uses ``...'' for quotation marks and
%%% \% for literal percent signs.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Wang:2016:HPC,
  author =       "Xueyang Wang and Sek Chai and Michael Isnardi and
                 Sehoon Lim and Ramesh Karri",
  title =        "Hardware Performance Counter-Based Malware
                 Identification and Detection with Adaptive Compressive
                 Sensing",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857055",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Hardware Performance Counter-based (HPC) runtime
                 checking is an effective way to identify malicious
                 behaviors of malware and detect malicious modifications
                 to a legitimate program's control flow. To reduce the
                 overhead in the monitored system which has limited
                 storage and computing resources, we present a
                 ``sample-locally-analyze-remotely'' technique. The
                 sampled HPC data are sent to a remote server for
                 further analysis. To minimize the I/O bandwidth
                 required for transmission, the fine-grained HPC
                 profiles are compressed into much smaller vectors with
                 Compressive Sensing. The experimental results
                 demonstrate an 80\% I/O bandwidth reduction after
                 applying Compressive Sensing, without compromising the
                 detection and identification capabilities.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 1, article 4 (April 2016).
%%% In the author list, {Van Craeynest} is braced so that BibTeX keeps
%%% the two words together as one surname.
%%% bibsource also names java2010.bib, so this record is presumably
%%% shared with that bibliography; keep the two in step when editing.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Akram:2016:BPG,
  author =       "Shoaib Akram and Jennifer B. Sartor and Kenzo {Van
                 Craeynest} and Wim Heirman and Lieven Eeckhout",
  title =        "Boosting the Priority of Garbage: Scheduling
                 Collection on Heterogeneous Multicore Processors",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2875424",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "While hardware is evolving toward heterogeneous
                 multicore architectures, modern software applications
                 are increasingly written in managed languages.
                 Heterogeneity was born of a need to improve energy
                 efficiency; however, we want the performance of our
                 applications not to suffer from limited resources. How
                 best to schedule managed language applications on a mix
                 of big, out-of-order cores and small, in-order cores is
                 an open question, complicated by the host of service
                 threads that perform key tasks such as memory
                 management. These service threads compete with the
                 application for core and memory resources, and garbage
                 collection (GC) must sometimes suspend the application
                 if there is not enough memory available for allocation.
                 In this article, we explore concurrent garbage
                 collection's behavior, particularly when it becomes
                 critical, and how to schedule it on a heterogeneous
                 system to optimize application performance. While some
                 applications see no difference in performance when GC
                 threads are run on big versus small cores, others ---
                 those with GC criticality --- see up to an 18\%
                 performance improvement. We develop a new, adaptive
                 scheduling algorithm that responds to GC criticality
                 signals from the managed runtime, giving more big-core
                 cycles to the concurrent collector when it is under
                 pressure and in danger of suspending the application.
                 Our experimental results show that our
                 GC-criticality-aware scheduler is robust across a range
                 of heterogeneous architectures with different core
                 counts and frequency scaling and across heap sizes. Our
                 algorithm is performance and energy neutral for
                 GC-uncritical Java applications and significantly
                 speeds up GC-critical applications by 16\%, on average,
                 while being 20\% more energy efficient for a
                 heterogeneous multicore with three big cores and one
                 small core.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 1, article 5 (April 2016).
%%% Accented letters in the author names are written as brace-wrapped
%%% TeX accent commands ({\'\i}, {\'a}, {\c{c}}) so that BibTeX treats
%%% each one as a single letter.
%%% pages ends in `??': the final page number is not recorded here
%%% (TODO: fill in from the publisher's record).
@Article{Yilmaz:2016:ARS,
  author =       "Buse Yilmaz and Baris Aktemur and Mar{\'\i}a J.
                 Garzar{\'a}n and Sam Kamin and Furkan Kira{\c{c}}",
  title =        "Autotuning Runtime Specialization for Sparse
                 Matrix-Vector Multiplication",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2851500",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Runtime specialization is used for optimizing programs
                 based on partial information available only at runtime.
                 In this paper we apply autotuning on runtime
                 specialization of Sparse Matrix-Vector Multiplication
                 to predict a best specialization method among several.
                 In 91\% to 96\% of the predictions, either the best or
                 the second-best method is chosen. Predictions achieve
                 average speedups that are very close to the speedups
                 achievable when only the best methods are used. By
                 using an efficient code generator and a carefully
                 designed set of matrix features, we show the runtime
                 costs can be amortized to bring performance benefits
                 for many real-world cases.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 6 (April 2016): how sampling errors in collected
%%% profiles affect feedback-driven optimization (FDO), and a profile
%%% rectification scheme.  Final page number is not recorded.
@Article{Zhou:2016:ERI,
  author =       "Mingzhou Zhou and Bo Wu and Xipeng Shen and Yaoqing
                 Gao and Graham Yiu",
  title =        "Examining and Reducing the Influence of Sampling
                 Errors on Feedback-Driven Optimizations",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2851502",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Feedback-driven optimization (FDO) is an important
                 component in mainstream compilers. By allowing the
                 compiler to reoptimize the program based on some
                 profiles of the program's dynamic behaviors, it often
                 enhances the quality of the generated code
                 substantially. A barrier for using FDO is that it often
                 requires many training runs to collect enough profiles
                 to amortize the sensitivity of program optimizations to
                 program input changes. Various sampling techniques have
                 been explored to alleviate this time-consuming process.
                 However, the lowered profile accuracy caused by
                 sampling often hurts the benefits of FDO. This article
                 gives the first systematic study in how sampling rates
                 affect the accuracy of collected profiles and how the
                 accuracy correlates with the usefulness of the profile
                 for modern FDO. Studying basic block and edge profiles
                 for FDO in two mature compilers reveals several
                 counterintuitive observations, one of which is that
                 profiling accuracy does not strongly correlate with the
                 benefits of the FDO. A detailed analysis identifies
                 three types of sampling-caused errors that critically
                 impair the quality of the profiles for FDO. It then
                 introduces a simple way to rectify profiles based on
                 the findings. Experiments demonstrate that the simple
                 rectification fixes most of those critical errors in
                 sampled profiles and significantly enhances the
                 effectiveness of FDO.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 7 (April 2016): indirect-branch translation in
%%% the MAMBO-X64 dynamic binary translator.  Cross-listed in hash.bib
%%% (see bibsource).  Final page number is not recorded.
%%% NOTE(review): the first author's surname is stored as D'antras;
%%% confirm the preferred capitalization against the publisher record
%%% before changing it, since the citation key is derived from it.
@Article{Dantras:2016:OIB,
  author =       "Amanieu D'antras and Cosmin Gorgovan and Jim Garside
                 and Mikel Luj{\'a}n",
  title =        "Optimizing Indirect Branches in Dynamic Binary
                 Translators",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2866573",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic binary translation is a technology for
                 transparently translating and modifying a program at
                 the machine code level as it is running. A significant
                 factor in the performance of a dynamic binary
                 translator is its handling of indirect branches. Unlike
                 direct branches, which have a known target at
                 translation time, an indirect branch requires
                 translating a source program counter address to a
                 translated program counter address every time the
                 branch is executed. This translation can impose a
                 serious runtime penalty if it is not handled
                 efficiently. MAMBO-X64, a dynamic binary translator
                 that translates 32-bit ARM (AArch32) code to 64-bit ARM
                 (AArch64) code, uses three novel techniques to improve
                 the performance of indirect branch translation.
                 Together, these techniques allow MAMBO-X64 to achieve a
                 very low performance overhead of only 10\% on average
                 compared to native execution of 32-bit programs.
                 Hardware-assisted function returns use a software
                 return address stack to predict the targets of function
                 returns, making use of several novel optimizations
                 while also exploiting hardware return address
                 prediction. This technique has a significant impact on
                 most benchmarks, reducing binary translation overhead
                 compared to native execution by 40\% on average and by
                 90\% on some benchmarks. Branch table inference, an
                 algorithm for detecting and translating branch tables,
                 can reduce the overhead of translated code by up to
                 40\% on some SPEC CPU2006 benchmarks. The remaining
                 indirect branches are handled using a fast atomic hash
                 table, which is optimized to work with multiple
                 threads. This last technique translates indirect
                 branches using a single shared hash table while
                 avoiding expensive synchronization in
                 performance-critical lookup code. This allows the
                 performance to be on par with thread-private hash
                 tables while having superior memory scalability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 8 (April 2016): clustering-based design space
%%% exploration (DSE) of compiler optimization sequences.  The abstract
%%% writes speedup factors as inline TeX math ($ ... \times $).  Final
%%% page number is not recorded.
@Article{Martins:2016:CBS,
  author =       "Luiz G. A. Martins and Ricardo Nobre and Jo{\~a}o M.
                 P. Cardoso and Alexandre C. B. Delbem and Eduardo
                 Marques",
  title =        "Clustering-Based Selection for the Exploration of
                 Compiler Optimization Sequences",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2883614",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A large number of compiler optimizations are nowadays
                 available to users. These optimizations interact with
                 each other and with the input code in several and
                 complex ways. The sequence of application of
                 optimization passes can have a significant impact on
                 the performance achieved. The effect of the
                 optimizations is both platform and application
                 dependent. The exhaustive exploration of all viable
                 sequences of compiler optimizations for a given code
                 fragment is not feasible. As this exploration is a
                 complex and time-consuming task, several researchers
                 have focused on Design Space Exploration (DSE)
                 strategies both to select optimization sequences to
                 improve the performance of each function of the
                 application and to reduce the exploration time. In this
                 article, we present a DSE scheme based on a clustering
                 approach for grouping functions with similarities and
                 exploration of a reduced search space resulting from
                 the combination of optimizations previously suggested
                 for the functions in each group. The identification of
                 similarities between functions uses a data mining
                 method that is applied to a symbolic code
                 representation. The data mining process combines three
                 algorithms to generate clusters: the Normalized
                 Compression Distance, the Neighbor Joining, and a new
                 ambiguity-based clustering algorithm. Our experiments
                 for evaluating the effectiveness of the proposed
                 approach address the exploration of optimization
                 sequences in the context of the ReflectC compiler,
                 considering 49 compilation passes while targeting a
                 Xilinx MicroBlaze processor, and aiming at performance
                 improvements for 51 functions and four applications.
                 Experimental results reveal that the use of our
                 clustering-based DSE approach achieves a significant
                 reduction in the total exploration time of the search
                 space ($ 20 \times $ over a Genetic Algorithm approach)
                 at the same time that considerable performance speedups
                 (41\% over the baseline) were obtained using the
                 optimized codes. Additional experiments were performed
                 considering the LLVM compiler, considering 124
                 compilation passes, and targeting a LEON3 processor.
                 The results show that our approach achieved geometric
                 mean speedups of $ 1.49 \times $, $ 1.32 \times $, and
                 $ 1.24 \times $ for the best 10, 20, and 30 functions,
                 respectively, and a global improvement of 7\% over the
                 performance obtained when compiling with -O2.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 9 (April 2016): Dynamic Transaction Issue (DTI),
%%% a power-saving scheme for hardware transactional memory.  Final page
%%% number is not recorded.
@Article{Do:2016:PEH,
  author =       "Sang Wook Stephen Do and Michel Dubois",
  title =        "Power Efficient Hardware Transactional Memory: Dynamic
                 Issue of Transactions",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "9:1--9:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2875425",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Transactional Memory (TM) is no longer just an
                 academic interest as industry has started to adopt the
                 idea in its commercial products. In this paper, we
                 propose Dynamic Transaction Issue (DTI), a new scheme
                 that can be easily implemented on top of existing
                 Hardware TM (HTM) systems, provided additional
                 messages. Instead of wasting power and energy in
                 transaction aborts, Dynamic Transaction Issue puts a
                 processor core into a low-power state when there is a
                 reasonable suspicion that the current transaction
                 running on it will be aborted soon in the future. We
                 have implemented Dynamic Transaction Issue on a
                 cycle-accurate simulator of a multicore processor
                 system with out-of-order superscalar cores, augmented
                 with a power package and a TM package which add
                 accurate dynamic power estimates and a TM framework to
                 the simulator. Our simulation results show that Dynamic
                 Transaction Issue can achieve energy savings up to 37\%
                 from the energy consumption of a base machine with no
                 mechanism to suppress useless aborts. We also compare
                 Dynamic Transaction Issue with various alternative
                 hardware TM mechanisms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 10 (April 2016): covert channels through branch
%%% predictors and a software-only mitigation.  Cross-listed in
%%% cryptography2010.bib (see bibsource).  Final page number is not
%%% recorded.
@Article{Evtyushkin:2016:UMC,
  author =       "Dmitry Evtyushkin and Dmitry Ponomarev and Nael
                 Abu-Ghazaleh",
  title =        "Understanding and Mitigating Covert Channels Through
                 Branch Predictors",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2870636",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Covert channels through shared processor resources
                 provide secret communication between two malicious
                 processes: the trojan and the spy. In this article, we
                 classify, analyze, and compare covert channels through
                 dynamic branch prediction units in modern processors.
                 Through experiments on a real hardware platform, we
                 compare contention-based channel and the channel that
                 is based on exploiting the branch predictor's residual
                 state. We analyze these channels in SMT and
                 single-threaded environments under both clean and noisy
                 conditions. Our results show that the residual
                 state-based channel provides a cleaner signal and is
                 effective even in noisy execution environments with
                 another application sharing the same physical core with
                 the trojan and the spy. We also estimate the capacity
                 of the branch predictor covert channels and describe a
                 software-only mitigation technique that is based on
                 randomizing the state of the predictor tables on
                 context switches. We show that this protection
                 eliminates all covert channels through the branch
                 prediction unit with minimal impact on performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 11 (April 2016): Paver, an SLP-based compiler
%%% technique for loops with partial SIMD parallelism.  The acronym SIMD
%%% is brace-protected in the title so that styles which downcase titles
%%% keep its capitalization.  Final page number is not recorded.
@Article{Zhou:2016:CAE,
  author =       "Hao Zhou and Jingling Xue",
  title =        "A Compiler Approach for Exploiting Partial {SIMD}
                 Parallelism",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "11:1--11:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2886101",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Existing vectorization techniques are ineffective for
                 loops that exhibit little loop-level parallelism but
                 some limited superword-level parallelism (SLP). We show
                 that effectively vectorizing such loops requires
                 partial vector operations to be executed correctly and
                 efficiently, where the degree of partial SIMD
                 parallelism is smaller than the SIMD datapath width. We
                 present a simple yet effective SLP compiler technique
                 called Paver (PArtial VEctorizeR), formulated and
                 implemented in LLVM as a generalization of the
                 traditional SLP algorithm, to optimize such partially
                 vectorizable loops. The key idea is to maximize SIMD
                 utilization by widening vector instructions used while
                 minimizing the overheads caused by memory access,
                 packing/unpacking, and/or masking operations, without
                 introducing new memory errors or new numeric
                 exceptions. For a set of 9 C/C++/Fortran applications
                 with partial SIMD parallelism, Paver achieves
                 significantly better kernel and whole-program speedups
                 than LLVM on both Intel's AVX and ARM's NEON.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 12 (April 2016): R-GPU, a reconfigurable GPU
%%% architecture with communicating cores.  The surname is written as
%%% {Van Den Braak} so that BibTeX's name parser treats the three words
%%% as one unit.  Final page number is not recorded.
@Article{VanDenBraak:2016:RGR,
  author =       "Gert-Jan {Van Den Braak} and Henk Corporaal",
  title =        "{R-GPU}: a Reconfigurable {GPU} Architecture",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "12:1--12:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890506",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Over the last decade, Graphics Processing Unit (GPU)
                 architectures have evolved from a fixed-function
                 graphics pipeline to a programmable, energy-efficient
                 compute accelerator for massively parallel
                 applications. The compute power arises from the GPU's
                 Single Instruction/Multiple Threads architecture:
                 concurrently running many threads and executing them as
                 Single Instruction/Multiple Data--style vectors.
                 However, compute power is still lost due to cycles
                 spent on data movement and control instructions instead
                 of data computations. Even more cycles are lost on
                 pipeline stalls resulting from long latency (memory)
                 operations. To improve not only performance but also
                 energy efficiency, we introduce R-GPU: a reconfigurable
                 GPU architecture with communicating cores. R-GPU is an
                 addition to a GPU, which can still be used as such, but
                 also has the ability to reorganize the cores of a GPU
                 in a reconfigurable network. In R-GPU data movement and
                 control is implicit in the configuration of the
                 network. Each core executes a fixed instruction,
                 reducing instruction decode count and increasing energy
                 efficiency. On a number of benchmarks we show an
                 average performance improvement of $ 2.1 \times $ over
                 the same GPU without modifications. We further make a
                 conservative power estimation of R-GPU which shows that
                 power consumption can be reduced by 6\%, leading to an
                 energy consumption reduction of 55\%, while area only
                 increases by a mere 4\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 13 (April 2016): thread-aware adaptive data
%%% prefetching for multithreaded workloads on multicore systems.
%%% Cross-listed in multithreading.bib (see bibsource).  Final page
%%% number is not recorded.
@Article{Liu:2016:TAA,
  author =       "Peng Liu and Jiyang Yu and Michael C. Huang",
  title =        "Thread-Aware Adaptive Prefetcher on Multicore Systems:
                 Improving the Performance for Multithreaded Workloads",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890505",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Most processors employ hardware data prefetching
                 techniques to hide memory access latencies. However,
                 the prefetching requests from different threads on a
                 multicore processor can cause severe interference with
                 prefetching and/or demand requests of others. The data
                 prefetching can lead to significant performance
                 degradation due to shared resource contention on shared
                 memory multicore systems. This article proposes a
                 thread-aware data prefetching mechanism based on
                 low-overhead runtime information to tune prefetching
                 modes and aggressiveness, mitigating the resource
                 contention in the memory system. Our solution has three
                 new components: (1) a self-tuning prefetcher that uses
                 runtime feedback to dynamically adjust data prefetching
                 modes and arguments of each thread, (2) a filtering
                 mechanism that informs the hardware about which
                 prefetching request can cause shared data invalidation
                 and should be discarded, and (3) a limiter thread
                 acceleration mechanism to estimate and accelerate the
                 critical thread which has the longest completion time
                 in the parallel region of execution. On a set of
                 multithreaded parallel benchmarks, our thread-aware
                 data prefetching mechanism improves the overall
                 performance of 64-core system by 13\% over a multimode
                 prefetch baseline system with two-level cache
                 organization and conventional modified, exclusive,
                 shared, and invalid-based directory coherence protocol.
                 We compare our approach with the feedback directed
                 prefetching technique and find that it provides 9\%
                 performance improvement on multicore systems, while
                 saving the memory bandwidth consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 14 (April 2016): MAMBO, a dynamic binary
%%% modification (DBM) tool for ARM.  MAMBO and ARM are brace-protected
%%% in the title to survive title downcasing.  Final page number is not
%%% recorded.
%%% NOTE(review): the second author's surname is stored as D'antras;
%%% confirm the preferred capitalization against the publisher record.
@Article{Gorgovan:2016:MLO,
  author =       "Cosmin Gorgovan and Amanieu D'antras and Mikel
                 Luj{\'a}n",
  title =        "{MAMBO}: a Low-Overhead Dynamic Binary Modification
                 Tool for {ARM}",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "14:1--14:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2896451",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As the ARM architecture expands beyond its traditional
                 embedded domain, there is a growing interest in dynamic
                 binary modification (DBM) tools for general-purpose
                 multicore processors that are part of the ARM family.
                 Existing DBM tools for ARM suffer from introducing
                 large overheads in the execution of applications. The
                 specific questions that this article addresses are (i)
                 how to develop such DBM tools for the ARM architecture
                 and (ii) whether new optimisations are plausible and
                 needed. We describe the general design of MAMBO, a new
                 DBM tool for ARM, which we release together with this
                 publication, and introduce novel optimisations to
                 handle indirect branches. In addition, we explore
                 scenarios in which it may be possible to relax the
                 transparency offered by DBM tools to allow extra
                 optimisations to be applied. These scenarios arise from
                 analysing the most typical usages: for example,
                 application binaries without handcrafted assembly. The
                 performance evaluation shows that MAMBO introduces
                 small overheads for SPEC CPU2006 and PARSEC 3.0 when
                 comparing with the execution times of the unmodified
                 programs: a geometric mean overhead of 28\% on a
                 Cortex-A9 and of 34\% on a Cortex-A15 for CPU2006, and
                 between 27\% and 32\%, depending on the number of
                 threads, for PARSEC on a Cortex-A15.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(2), article 15 (June 2016): the Bimodal Modulo Scheduler
%%% (BMS) for coarse-grained reconfigurable arrays (CGRAs).  The surname
%%% is written as {De Sutter} so that BibTeX's name parser treats both
%%% words as one unit.  Final page number is not recorded.
@Article{Theocharis:2016:BSC,
  author =       "Panagiotis Theocharis and Bjorn {De Sutter}",
  title =        "A Bimodal Scheduler for Coarse-Grained Reconfigurable
                 Arrays",
  journal =      j-TACO,
  volume =       "13",
  number =       "2",
  pages =        "15:1--15:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2893475",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 27 16:18:10 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compilers for Coarse-Grained Reconfigurable Array
                 (CGRA) architectures suffer from long compilation times
                 and code quality levels far below the theoretical upper
                 bounds. This article presents a new scheduler, called
                 the Bimodal Modulo Scheduler (BMS), to map inner loops
                 onto (heterogeneous) CGRAs of the Architecture for
                 Dynamically Reconfigurable Embedded Systems (ADRES)
                 family. BMS significantly outperforms existing
                 schedulers for similar architectures in terms of
                 generated code quality and compilation time. This is
                 achieved by combining new schemes for backtracking with
                 extended and adapted forms of priority functions and
                 cost functions, as described in the article. BMS is
                 evaluated by mapping multimedia and software-defined
                 radio benchmarks onto tuned ADRES instances.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(2), article 16 (June 2016): PHLAME, an execution model for
%%% exploiting hierarchical locality in deep parallel architectures.
%%% Final page number is not recorded.
@Article{Anbar:2016:EHL,
  author =       "Ahmad Anbar and Olivier Serres and Engin Kayraklioglu
                 and Abdel-Hameed A. Badawy and Tarek El-Ghazawi",
  title =        "Exploiting Hierarchical Locality in Deep Parallel
                 Architectures",
  journal =      j-TACO,
  volume =       "13",
  number =       "2",
  pages =        "16:1--16:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2897783",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 27 16:18:10 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Parallel computers are becoming deeply hierarchical.
                 Locality-aware programming models allow programmers to
                 control locality at one level through establishing
                 affinity between data and executing activities. This,
                 however, does not enable locality exploitation at other
                 levels. Therefore, we must conceive an efficient
                 abstraction of hierarchical locality and develop
                 techniques to exploit it. Techniques applied directly
                 by programmers, beyond the first level, burden the
                 programmer and hinder productivity. In this article, we
                 propose the Parallel Hierarchical Locality Abstraction
                 Model for Execution (PHLAME). PHLAME is an execution
                 model to abstract and exploit machine hierarchical
                 properties through locality-aware programming and a
                 runtime that takes into account machine
                 characteristics, as well as a data sharing and
                 communication profile of the underlying application.
                 This article presents and experiments with concepts and
                 techniques that can drive such runtime system in
                 support of PHLAME. Our experiments show that our
                 techniques scale up and achieve performance gains of up
                 to 88\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 2 (June 2016), article 17.
%%% The first author's second surname is capitalized (A-acute) in the
%%% author field.  The citation key deliberately keeps its existing
%%% lower-case spelling so that existing \cite commands still resolve.
@Article{Gonzalez-alvarez:2016:MEF,
  author =       "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B.
                 Sartor and Carlos {\'A}lvarez and Daniel
                 Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout",
  title =        "{MInGLE}: an Efficient Framework for Domain
                 Acceleration Using Low-Power Specialized Functional
                 Units",
  journal =      j-TACO,
  volume =       "13",
  number =       "2",
  pages =        "17:1--17:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898356",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 27 16:18:10 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The end of Dennard scaling leads to new research
                 directions that try to cope with the utilization wall
                 in modern chips, such as the design of specialized
                 architectures. Processor customization utilizes
                 transistors more efficiently, optimizing not only for
                 performance but also for power. However, hardware
                 specialization for each application is costly and
                 impractical due to time-to-market constraints.
                 Domain-specific specialization is an alternative that
                 can increase hardware reutilization across applications
                 that share similar computations. This article explores
                 the specialization of low-power processors with custom
                 instructions (CIs) that run on a specialized functional
                 unit. We are the first, to our knowledge, to design CIs
                 for an application domain and across basic blocks,
                 selecting CIs that maximize both performance and energy
                 efficiency improvements. We present the Merged
                 Instructions Generator for Large Efficiency (MInGLE),
                 an automated framework that identifies and selects CIs.
                 Our framework analyzes large sequences of code (across
                 basic blocks) to maximize acceleration potential while
                 also performing partial matching across applications to
                 optimize for reuse of the specialized hardware. To do
                 this, we convert the code into a new canonical
                 representation, the Merging Diagram, which represents
                 the code's functionality instead of its structure. This
                 is key to being able to find similarities across such
                 large code sequences from different applications with
                 different coding styles. Groups of potential CIs are
                 clustered depending on their similarity score to
                 effectively reduce the search space. Additionally, we
                 create new CIs that cover not only whole-body loops but
                 also fragments of the code to optimize hardware
                 reutilization further. For a set of 11 applications
                 from the media domain, our framework generates CIs that
                 significantly improve the energy-delay product (EDP)
                 and performance speedup. CIs with the highest
                 utilization opportunities achieve an average EDP
                 improvement of 3.8 $ \times $ compared to a baseline
                 processor modeled after an Intel Atom. We demonstrate
                 that we can efficiently accelerate a domain with
                 partially matched CIs, and that their design time, from
                 identification to selection, stays within tractable
                 bounds.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 2 (June 2016), article 18.
%%% The acronyms GPGPU and CPU in the abstract are written in upper
%%% case; they were lower-cased in the imported text (presumably a
%%% small-caps conversion artifact --- confirm against the publisher's
%%% abstract).
@Article{Andreetta:2016:FPF,
  author =       "Christian Andreetta and Vivien B{\'e}got and Jost
                 Berthold and Martin Elsman and Fritz Henglein and
                 Troels Henriksen and Maj-Britt Nordfang and Cosmin E.
                 Oancea",
  title =        "{FinPar}: a Parallel Financial Benchmark",
  journal =      j-TACO,
  volume =       "13",
  number =       "2",
  pages =        "18:1--18:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898354",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 27 16:18:10 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Commodity many-core hardware is now mainstream, but
                 parallel programming models are still lagging behind in
                 efficiently utilizing the application parallelism.
                 There are (at least) two principal reasons for this.
                 First, real-world programs often take the form of a
                 deeply nested composition of parallel operators, but
                 mapping the available parallelism to the hardware
                 requires a set of transformations that are tedious to
                 do by hand and beyond the capability of the common
                 user. Second, the best optimization strategy, such as
                 what to parallelize and what to efficiently
                 sequentialize, is often sensitive to the input dataset
                 and therefore requires multiple code versions that are
                 optimized differently, which also raises
                 maintainability problems. This article presents three
                 array-based applications from the financial domain that
                 are suitable for GPGPU execution. Common
                 benchmark-design practice has been to provide the same
                 code for the sequential and parallel versions that are
                 optimized for only one class of datasets. In
                 comparison, we document (1) all available parallelism
                 via nested map-reduce functional combinators, in a
                 simple Haskell implementation that closely resembles
                 the original code structure, (2) the invariants and
                 code transformations that govern the main trade-offs of
                 a data-sensitive optimization space, and (3) report
                 target CPU and multiversion GPGPU code together with an
                 evaluation that demonstrates optimization trade-offs
                 and other difficulties. We believe that this work
                 provides useful insight into the language constructs
                 and compiler infrastructure capable of expressing and
                 optimizing such applications, and we report in-progress
                 work in this direction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 2 (June 2016), article 19.
%%% The acronyms SDR, LLVM, FIFO, and 3GPP LTE-Advanced in the abstract
%%% are written in upper case, matching the "LTE-Advanced" spelling
%%% already used elsewhere in the same abstract; the stray blank inside
%%% "( sdr)" is removed.  The lower-case forms were presumably a
%%% small-caps conversion artifact --- confirm against the publisher's
%%% abstract.
@Article{Dardaillon:2016:NCF,
  author =       "Micka{\"e}l Dardaillon and Kevin Marquet and Tanguy
                 Risset and J{\'e}r{\^o}me Martin and Henri-Pierre
                 Charles",
  title =        "A New Compilation Flow for Software-Defined Radio
                 Applications on Heterogeneous {MPSoCs}",
  journal =      j-TACO,
  volume =       "13",
  number =       "2",
  pages =        "19:1--19:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2910583",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 27 16:18:10 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The advent of portable software-defined radio (SDR)
                 technology is tightly linked to the resolution of a
                 difficult problem: efficient compilation of signal
                 processing applications on embedded computing devices.
                 Modern wireless communication protocols use packet
                 processing rather than infinite stream processing and
                 also introduce dependencies between data value and
                 computation behavior leading to dynamic dataflow
                 behavior. Recently, parametric dataflow has been
                 proposed to support dynamicity while maintaining the
                 high level of analyzability needed for efficient
                 real-life implementations of signal processing
                 computations. This article presents a new compilation
                 flow that is able to compile parametric dataflow
                 graphs. Built on the LLVM compiler infrastructure, the
                 compiler offers an actor-based C++ programming model to
                 describe parametric graphs, a compilation front end for
                 graph analysis, and a back end that currently matches
                 the Magali platform: a prototype heterogeneous MPSoC
                 dedicated to LTE-Advanced. We also introduce an
                 innovative scheduling technique, called
                 microscheduling, allowing one to adapt the mapping of
                 parametric dataflow programs to the specificities of
                 the different possible MPSoCs targeted. A specific
                 focus on FIFO sizing on the target architecture is
                 presented. The experimental results show compilation of
                 3GPP LTE-Advanced demodulation on Magali with tight
                 memory size constraints. The compiled programs achieve
                 performance similar to handwritten code.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 2 (June 2016), article 20.
@Article{Liao:2016:DPM,
  author          = {Jianwei Liao and Fran{\c{c}}ois Trahay and Guoqiang Xiao},
  title           = {Dynamic Process Migration Based on Block Access Patterns
                     Occurring in Storage Servers},
  journal         = j-TACO,
  volume          = {13},
  number          = {2},
  pages           = {20:1--20:??},
  month           = jun,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2899002},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jun 27 16:18:10 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        =
    {An emerging trend in developing large and complex applications on
     today's high-performance computers is to couple independent
     components into a comprehensive application. The components may
     employ the global file system to exchange their data when executing
     the application. In order to reduce the time required for
     input/output (I/O) data exchange and data transfer in the coupled
     systems or other applications, this article proposes a dynamic
     process migration mechanism on the basis of block access pattern
     similarity for utilizing the local file cache to exchange the data.
     We first introduce the scheme of the block access counting diagram
     to profile the process access pattern during a time period on the
     storage server. Next, we propose an algorithm that compares the
     access patterns of processes running on different computing nodes.
     Last, processes are migrated in order to group processes with
     similar access patterns. Consequently, the processes on the
     computing node can exchange their data by accessing the local file
     cache, instead of the global file system. The experimental results
     show that the proposed process migration mechanism can reduce the
     execution time required by the application because of the shorter
     I/O time, as well as yield attractive I/O throughput. In summary,
     this dynamic process migration technique can work fairly well for
     distributed applications whose data dependency rely on distributed
     file systems.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO volume 13, number 2 (June 2016), article 21.
%%% In the abstract, commas that follow inline math are set flush
%%% against the closing dollar sign, so no blank is typeset before the
%%% comma.
@Article{Ashouri:2016:CCA,
  author =       "Amir Hossein Ashouri and Giovanni Mariani and Gianluca
                 Palermo and Eunjung Park and John Cavazos and Cristina
                 Silvano",
  title =        "{COBAYN}: Compiler Autotuning Framework Using
                 {Bayesian} Networks",
  journal =      j-TACO,
  volume =       "13",
  number =       "2",
  pages =        "21:1--21:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2928270",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 27 16:18:10 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The variety of today's architectures forces
                 programmers to spend a great deal of time porting and
                 tuning application codes across different platforms.
                 Compilers themselves need additional tuning, which has
                 considerable complexity as the standard optimization
                 levels, usually designed for the average case and the
                 specific target architecture, often fail to bring the
                 best results. This article proposes COBAYN: Compiler
                 autotuning framework using BAYesian Networks, an
                 approach for a compiler autotuning methodology using
                 machine learning to speed up application performance
                 and to reduce the cost of the compiler optimization
                 phases. The proposed framework is based on the
                 application characterization done dynamically by using
                 independent microarchitecture features and Bayesian
                 networks. The article also presents an evaluation based
                 on using static analysis and hybrid feature collection
                 approaches. In addition, the article compares Bayesian
                 networks with respect to several state-of-the-art
                 machine-learning models. Experiments were carried out
                 on an ARM embedded platform and GCC compiler by
                 considering two benchmark suites with 39 applications.
                 The set of compiler configurations, selected by the
                 model (less than 7\% of the search space), demonstrated
                 an application performance speedup of up to 4.6 $
                 \times $ on Polybench (1.85 $ \times $ on average) and
                 3.1 $ \times $ on cBench (1.54 $ \times $ on average)
                 with respect to standard optimization levels. Moreover,
                 the comparison of the proposed technique with (i)
                 random iterative compilation, (ii) machine
                 learning--based iterative compilation, and (iii)
                 noniterative predictive modeling techniques shows, on
                 average, 1.2 $ \times $, 1.37 $ \times $, and 1.48 $
                 \times $ speedup, respectively. Finally, the proposed
                 method demonstrates 4 $ \times $ and 3 $ \times $
                 speedup, respectively, on cBench and Polybench in terms
                 of exploration efficiency given the same quality of the
                 solutions generated by the random iterative compilation
                 model.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 2 (June 2016), article 22.
@Article{Chrysanthou:2016:ORT,
  author          = {Kypros Chrysanthou and Panayiotis Englezakis and
                     Andreas Prodromou and Andreas Panteli and
                     Chrysostomos Nicopoulos and Yiannakis Sazeides and
                     Giorgos Dimitrakopoulos},
  title           = {An Online and Real-Time Fault Detection and
                     Localization Mechanism for Network-on-Chip
                     Architectures},
  journal         = j-TACO,
  volume          = {13},
  number          = {2},
  pages           = {22:1--22:??},
  month           = jun,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2930670},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jun 27 16:18:10 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        =
    {Networks-on-Chip (NoC) are becoming increasingly susceptible to
     emerging reliability threats. The need to detect and localize the
     occurrence of faults at runtime is steadily becoming imperative. In
     this work, we propose NoCAlert, a comprehensive online and real-time
     fault detection and localization mechanism that demonstrates 0\%
     false negatives within the interconnect for the fault models and
     stimulus set used in this study. Based on the concept of invariance
     checking, NoCAlert employs a group of lightweight microchecker
     modules that collectively implement real-time hardware assertions.
     The checkers operate concurrently with normal NoC operation, thus
     eliminating the need for periodic, or triggered-based, self-testing.
     Based on the pattern/signature of asserted checkers, NoCAlert can
     pinpoint the location of the fault at various granularity levels.
     Most important, 97\% of the transient and 90\% of the permanent
     faults are detected instantaneously, within a single clock cycle
     upon fault manifestation. The fault localization accuracy ranges
     from 90\% to 100\%, depending on the desired localization
     granularity. Extensive cycle-accurate simulations in a 64-node CMP
     and analysis at the RTL netlist-level demonstrate the efficacy of
     the proposed technique.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO volume 13, number 3 (September 2016), article 23.
@Article{Mehta:2016:VL,
  author          = {Sanyam Mehta and Pen-Chung Yew},
  title           = {Variable Liberalization},
  journal         = j-TACO,
  volume          = {13},
  number          = {3},
  pages           = {23:1--23:??},
  month           = sep,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2963101},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Sep 17 16:20:58 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        =
    {In the wake of the current trend of increasing the number of cores
     on a chip, compiler optimizations for improving the memory
     performance have assumed increased importance. Loop fusion is one
     such key optimization that can alleviate memory and bandwidth wall
     and thus improve parallel performance. However, we find that loop
     fusion in interesting memory-intensive applications is prevented by
     the existence of dependences between temporary variables that appear
     in different loop nests. Furthermore, known techniques of allowing
     useful transformations in the presence of temporary variables, such
     as privatization and expansion, prove insufficient in such cases. In
     this work, we introduce variable liberalization, a technique that
     selectively removes dependences on temporary variables in different
     loop nests to achieve loop fusion while preserving the semantical
     correctness of the optimized program. This removal of
     extra-stringent dependences effectively amounts to variable
     expansion, thus achieving the benefit of an increased degree of
     freedom for program transformation but without an actual expansion.
     Hence, there is no corresponding increase in the memory footprint
     incurred. We implement liberalization in the Pluto polyhedral
     compiler and evaluate its performance on nine hot regions in five
     real applications. Results demonstrate parallel performance
     improvement of 1.92 $ \times $ over the Intel compiler, averaged
     over the nine hot regions, and an overall improvement of as much as
     2.17 $ \times $ for an entire application, on an eight-core Intel
     Xeon processor.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {23},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO volume 13, number 3 (September 2016), article 24.
%%% The reliability factor in the abstract is typeset entirely in math
%%% mode with \times, matching the "$ \times $" form used by other
%%% entries in this file, instead of a literal letter "x".
@Article{Chen:2016:RER,
  author =       "Hsing-Min Chen and Carole-Jean Wu and Trevor Mudge and
                 Chaitali Chakrabarti",
  title =        "{RATT-ECC}: Rate Adaptive Two-Tiered Error Correction
                 Codes for Reliable {$3$D} Die-Stacked Memory",
  journal =      j-TACO,
  volume =       "13",
  number =       "3",
  pages =        "24:1--24:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957758",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Sep 17 16:20:58 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes a rate-adaptive, two-tiered
                 error-correction scheme (RATT-ECC) that provides strong
                 reliability ($ 10^{10} \times $ reduction in raw FIT rate)
                 for an HBM-like 3D DRAM system. The tier-1 code is a strong
                 symbol-based code that can correct errors due to small
                 granularity faults and detect errors caused by large
                 granularity faults; the tier-2 code is an XOR-based
                 code that corrects errors detected by the tier-1 code.
                 The rate-adaptive feature of RATT-ECC enables permanent
                 bank failures to be handled through sparing. It can
                 also be used to significantly reduce the refresh power
                 consumption without decreasing reliability and timing
                 performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 3 (September 2016), article 25.
@Article{Chen:2016:IDO,
  author          = {Wenjie Chen and Zhibin Wang and Qin Wu and Jiuzhen Liang
                     and Zhilei Chai},
  title           = {Implementing Dense Optical Flow Computation on a
                     Heterogeneous {FPGA SoC} in {C}},
  journal         = j-TACO,
  volume          = {13},
  number          = {3},
  pages           = {25:1--25:??},
  month           = sep,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2948976},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Sep 17 16:20:58 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        =
    {High-quality optical flow computation algorithms are computationally
     intensive. The low computational speed of such algorithms causes
     difficulties for real-world applications. In this article, we
     propose an optimized implementation of the classical
     Combine-Brightness-Gradient (CBG) model on the Xilinx ZYNQ FPGA-SoC,
     by taking advantage of the inherent algorithmic parallelism and ZYNQ
     architecture. The execution time decreases to 0.82 second with a
     lower power consumption (1.881W). It is better than software
     implementation on PC (Intel i7-3520M, 2.9GHz), which costs 2.635
     seconds and 35W. We use C rather than HDLs to describe the algorithm
     for rapid prototyping.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO volume 13, number 3 (September 2016), article 26.
%%% The parenthetical list in the first sentence of the abstract is set
%%% off with em dashes (---); single hyphens there fused unrelated
%%% words ("problems-memory", "combination-as").
@Article{Vaish:2016:OMT,
  author =       "Nilay Vaish and Michael C. Ferris and David A. Wood",
  title =        "Optimization Models for Three On-Chip Network
                 Problems",
  journal =      j-TACO,
  volume =       "13",
  number =       "3",
  pages =        "26:1--26:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2943781",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Sep 17 16:20:58 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We model three on-chip network design problems---memory
                 controller placement, resource allocation in
                 heterogeneous on-chip networks, and their
                 combination---as mathematical optimization problems. We
                 model the first two problems as mixed integer linear
                 programs. We model the third problem as a mixed integer
                 nonlinear program, which we then linearize exactly.
                 Sophisticated optimization algorithms enable solutions
                 to be obtained much more efficiently. Detailed
                 simulations using synthetic traffic and benchmark
                 applications validate that our designs provide better
                 performance than solutions proposed previously. Our
                 work provides further evidence toward suitability of
                 optimization models in searching/pruning architectural
                 design space.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO volume 13, number 3 (September 2016), article 27.
%%% The second author's given name carries its acute accent.  In the
%%% abstract, the parenthetical naming DCC and SCC is set off with em
%%% dashes (---) rather than single hyphens, and "state-of-the-art" is
%%% fully hyphenated.
@Article{Sardashti:2016:YAC,
  author =       "Somayeh Sardashti and Andr{\'e} Seznec and David A. Wood",
  title =        "Yet Another Compressed Cache: a Low-Cost Yet Effective
                 Compressed Cache",
  journal =      j-TACO,
  volume =       "13",
  number =       "3",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2976740",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Sep 17 16:20:58 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Cache memories play a critical role in bridging the
                 latency, bandwidth, and energy gaps between cores and
                 off-chip memory. However, caches frequently consume a
                 significant fraction of a multicore chip's area and
                 thus account for a significant fraction of its cost.
                 Compression has the potential to improve the effective
                 capacity of a cache, providing the performance and
                 energy benefits of a larger cache while using less
                 area. The design of a compressed cache must address two
                 important issues: (i) a low-latency, low-overhead
                 compression algorithm that can represent a fixed-size
                 cache block using fewer bits and (ii) a cache
                 organization that can efficiently store the resulting
                 variable-size compressed blocks. This article focuses
                 on the latter issue. Here, we propose Yet Another
                 Compressed Cache (YACC), a new compressed cache design
                 that targets improving effective cache capacity with a
                 simple design. YACC uses super-blocks to reduce tag
                 overheads while packing variable-size compressed blocks
                 to reduce internal fragmentation. YACC achieves the
                 benefits of two state-of-the-art compressed
                 caches---Decoupled Compressed Cache (DCC) [Sardashti and
                 Wood 2013a, 2013b] and Skewed Compressed Cache (SCC)
                 [Sardashti et al. 2014]---with a more practical and
                 simpler design. YACC's cache layout is similar to
                 conventional caches, with a largely unmodified tag
                 array and unmodified data array. Compared to DCC and
                 SCC, YACC requires neither the significant extra
                 metadata (i.e., back pointers) needed by DCC to track
                 blocks nor the complexity and overhead of skewed
                 associativity (i.e., indexing ways differently) needed
                 by SCC. An additional advantage over previous work is
                 that YACC enables modern replacement mechanisms, such
                 as RRIP. For our benchmark set, compared to a
                 conventional uncompressed 8MB LLC, YACC improves
                 performance by 8\% on average and up to 26\%, and
                 reduces total energy by 6\% on average and up to 20\%.
                 An 8MB YACC achieves approximately the same performance
                 and energy improvements as a 16MB conventional cache at
                 a much smaller silicon footprint, with only 1.6\%
                 greater area than an 8MB conventional cache. YACC
                 performs comparably to DCC and SCC but is much simpler
                 to implement.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Cruz:2016:HAT,
  author          = {Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio
                     L. Pilla and Philippe O. A. Navaux},
  title           = {Hardware-Assisted Thread and Data Mapping in
                     Hierarchical Multicore Architectures},
  journal         = j-TACO,
  year            = {2016},
  month           = sep,
  volume          = {13},
  number          = {3},
  pages           = {28:1--28:??},
  articleno       = {28},
  DOI             = {https://doi.org/10.1145/2975587},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {The performance and energy efficiency of modern
                     architectures depend on memory locality, which can be
                     improved by thread and data mappings considering the
                     memory access behavior of parallel applications. In
                     this article, we propose intense pages mapping, a
                     mechanism that analyzes the memory access behavior
                     using information about the time the entry of each page
                     resides in the translation lookaside buffer. It
                     provides accurate information with a very low overhead.
                     We present experimental results with simulation and
                     real machines, with average performance improvements of
                     13.7\% and energy savings of 4.4\%, which come from
                     reductions in cache misses and interconnection
                     traffic.},
  bibdate         = {Sat Sep 17 16:20:58 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Adileh:2016:MHP,
  author          = {Almutaz Adileh and Stijn Eyerman and Aamer Jaleel and
                     Lieven Eeckhout},
  title           = {Maximizing Heterogeneous Processor Performance Under
                     Power Constraints},
  journal         = j-TACO,
  year            = {2016},
  month           = sep,
  volume          = {13},
  number          = {3},
  pages           = {29:1--29:??},
  articleno       = {29},
  DOI             = {https://doi.org/10.1145/2976739},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {Heterogeneous processors (e.g., ARM's big.LITTLE)
                     improve performance in power-constrained environments
                     by executing applications on the `little' low-power
                     core and move them to the `big' high-performance core
                     when there is available power budget. The total time
                     spent on the big core depends on the rate at which the
                     application dissipates the available power budget. When
                     applications with different big-core power consumption
                     characteristics concurrently execute on a heterogeneous
                     processor, it is best to give a larger share of the
                     power budget to applications that can run longer on the
                     big core, and a smaller share to applications that run
                     for a very short duration on the big core. This article
                     investigates mechanisms to manage the available power
                     budget on power-constrained heterogeneous processors.
                     We show that existing proposals that schedule
                     applications onto a big core based on various
                     performance metrics are not high performing, as these
                     strategies do not optimize over an entire power period
                     and are unaware of the applications' power/performance
                     characteristics. We use linear programming to design
                     the DPDP power management technique, which guarantees
                     optimal performance on heterogeneous processors. We
                     mathematically derive a metric (Delta Performance by
                     Delta Power) that takes into account the
                     power/performance characteristics of each running
                     application and allows our power-management technique
                     to decide how best to distribute the available power
                     budget among the co-running applications at minimal
                     overhead. Our evaluations with a 4-core heterogeneous
                     processor consisting of big.LITTLE pairs show that DPDP
                     improves performance by 16\% on average and up to 40\%
                     compared to a strategy that globally and greedily
                     optimizes the power budget. We also show that DPDP
                     outperforms existing heterogeneous scheduling policies
                     that use performance metrics to decide how best to
                     schedule applications on the big core.},
  bibdate         = {Sat Sep 17 16:20:58 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Wibowo:2016:ACL,
  author          = {Bagus Wibowo and Abhinav Agrawal and Thomas Stanton
                     and James Tuck},
  title           = {An Accurate Cross-Layer Approach for Online
                     Architectural Vulnerability Estimation},
  journal         = j-TACO,
  year            = {2016},
  month           = sep,
  volume          = {13},
  number          = {3},
  pages           = {30:1--30:??},
  articleno       = {30},
  DOI             = {https://doi.org/10.1145/2975588},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {Processor soft-error rates are projected to increase
                     as feature sizes scale down, necessitating the adoption
                     of reliability-enhancing techniques, but power and
                     performance overhead remain a concern of such
                     techniques. Dynamic cross-layer techniques are a
                     promising way to improve the cost-effectiveness of
                     resilient systems. As a foundation for making such a
                     system, we propose a cross-layer approach for
                     estimating the architectural vulnerability of a
                     processor core online that works by combining
                     information from software, compiler, and
                     microarchitectural layers at runtime. The hardware
                     layer combines the metadata from software and compiler
                     layers with microarchitectural measurements to estimate
                     architectural vulnerability online. We describe our
                     design and evaluate it in detail on a set of SPEC CPU
                     2006 applications. We find that our online AVF estimate
                     is highly accurate with respect to a postmortem AVF
                     analysis, with only 0.46\% average absolute error.
                     Also, our design incurs negligible performance impact
                     for SPEC2006 applications and about 1.2\% for a Monte
                     Carlo application, requires approximately 1.4\% area
                     overhead, and costs about 3.3\% more power on average.
                     We compare our technique against two prior online AVF
                     estimation techniques, one using a linear regression to
                     estimate AVF and another based on PVF-HVF; our
                     evaluation finds that our approach, on average, is more
                     accurate. Our case study of a Monte Carlo simulation
                     shows that our AVF estimate can adapt to the inherent
                     resiliency of the algorithm. Finally, we demonstrate
                     the effectiveness of our approach using a dynamic
                     protection scheme that limits vulnerability to soft
                     errors while reducing the energy consumption by an
                     average of 4.8\%, and with a target normalized SER of
                     10\%, compared to enabling a simple parity+ECC
                     protection at all times.},
  bibdate         = {Sat Sep 17 16:20:58 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Acacio:2016:LDR,
  author          = {Manuel Acacio},
  title           = {List of Distinguished Reviewers {ACM TACO 2014}},
  journal         = j-TACO,
  year            = {2016},
  month           = sep,
  volume          = {13},
  number          = {3},
  pages           = {31:1--31:??},
  articleno       = {31},
  DOI             = {https://doi.org/10.1145/2989990},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  bibdate         = {Sat Sep 17 16:20:58 MDT 2016},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Vora:2016:SAE,
  author =       "Keval Vora and Rajiv Gupta and Guoqing Xu",
  title =        "Synergistic Analysis of Evolving Graphs",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2992784",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Evolving graph processing involves repeating analyses,
                 which are often iterative, over multiple snapshots of
                 the graph corresponding to different points in time.
                 Since the snapshots of an evolving graph share a great
                 number of vertices and edges, traditional approaches
                 that process these snapshots one at a time without
                 exploiting this overlap contain much wasted effort on
                 both data loading and computation, making them
                 extremely inefficient. In this article, we identify
                 major sources of inefficiencies and present two
                 optimization techniques to address them. First, we
                 propose a technique for amortizing the fetch cost by
                 merging fetching of values for different snapshots of
                 the same vertex. Second, we propose a technique for
                 amortizing the processing cost by feeding values
                 computed by earlier snapshots into later snapshots. We
                 have implemented these optimizations in two distributed
                 graph processing systems, namely, GraphLab and ASPIRE.
                 Our experiments with multiple real evolving graphs and
                 algorithms show that, on average fetch amortization
                 speeds up execution of GraphLab and ASPIRE by 5.2$
                 \times $ and 4.1$ \times $, respectively. Amortizing
                 the processing cost yields additional average speedups
                 of 2$ \times $ and 7.9$ \times $, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Zhang:2016:CPS,
  author          = {Yunquan Zhang and Shigang Li and Shengen Yan and
                     Huiyang Zhou},
  title           = {A Cross-Platform {SpMV} Framework on Many-Core
                     Architectures},
  journal         = j-TACO,
  year            = {2016},
  month           = dec,
  volume          = {13},
  number          = {4},
  pages           = {33:1--33:??},
  articleno       = {33},
  DOI             = {https://doi.org/10.1145/2994148},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {Sparse Matrix-Vector multiplication (SpMV) is a key
                     operation in engineering and scientific computing.
                     Although the previous work has shown impressive
                     progress in optimizing SpMV on many-core architectures,
                     load imbalance and high memory bandwidth remain the
                     critical performance bottlenecks. We present our novel
                     solutions to these problems, for both GPUs and Intel
                     MIC many-core architectures. First, we devise a new
                     SpMV format, called Blocked Compressed Common
                     Coordinate (BCCOO). BCCOO extends the blocked Common
                     Coordinate (COO) by using bit flags to store the row
                     indices to alleviate the bandwidth problem. We further
                     improve this format by partitioning the matrix into
                     vertical slices for better data locality. Then, to
                     address the load imbalance problem, we propose a highly
                     efficient matrix-based segmented sum/scan algorithm for
                     SpMV, which eliminates global synchronization. At last,
                     we introduce an autotuning framework to choose
                     optimization parameters. Experimental results show that
                     our proposed framework has a significant advantage over
                     the existing SpMV libraries. In single precision, our
                     proposed scheme outperforms clSpMV COCKTAIL format by
                     255\% on average on AMD FirePro W8000, and outperforms
                     CUSPARSE V7.0 by 73.7\% on average and outperforms CSR5
                     by 53.6\% on average on GeForce Titan X; in double
                     precision, our proposed scheme outperforms CUSPARSE
                     V7.0 by 34.0\% on average and outperforms CSR5 by
                     16.2\% on average on Tesla K20, and has equivalent
                     performance compared with CSR5 on Intel MIC.},
  bibdate         = {Wed Dec 28 16:24:46 MST 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Ahn:2016:AEE,
  author          = {Junwhan Ahn and Sungjoo Yoo and Kiyoung Choi},
  title           = {{AIM}: Energy-Efficient Aggregation Inside the Memory
                     Hierarchy},
  journal         = j-TACO,
  year            = {2016},
  month           = dec,
  volume          = {13},
  number          = {4},
  pages           = {34:1--34:??},
  articleno       = {34},
  DOI             = {https://doi.org/10.1145/2994149},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {In this article, we propose Aggregation-in-Memory
                     (AIM), a new processing-in-memory system designed for
                     energy efficiency and near-term adoption. In order to
                     efficiently perform aggregation, we implement simple
                     aggregation operations in main memory and develop a
                     locality-adaptive host architecture for in-memory
                     aggregation, called cache-conscious aggregation.
                     Through this, AIM executes aggregation at the most
                     energy-efficient location among all levels of the
                     memory hierarchy. Moreover, AIM minimally changes
                     existing sequential programming models and provides
                     fully automated compiler toolchain, thereby allowing
                     unmodified legacy software to use AIM. Evaluations show
                     that AIM greatly improves the energy efficiency of main
                     memory and the system performance.},
  bibdate         = {Wed Dec 28 16:24:46 MST 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Ziabari:2016:UHB,
  author          = {Amir Kavyan Ziabari and Yifan Sun and Yenai Ma and
                     Dana Schaa and Jos{\'e} L. Abell{\'a}n and Rafael Ubal
                     and John Kim and Ajay Joshi and David Kaeli},
  title           = {{UMH}: a Hardware-Based Unified Memory Hierarchy for
                     Systems with Multiple Discrete {GPUs}},
  journal         = j-TACO,
  year            = {2016},
  month           = dec,
  volume          = {13},
  number          = {4},
  pages           = {35:1--35:??},
  articleno       = {35},
  DOI             = {https://doi.org/10.1145/2996190},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {In this article, we describe how to ease memory
                     management between a Central Processing Unit (CPU) and
                     one or multiple discrete Graphic Processing Units
                     (GPUs) by architecting a novel hardware-based Unified
                     Memory Hierarchy (UMH). Adopting UMH, a GPU accesses
                     the CPU memory only if it does not find its required
                     data in the directories associated with its
                     high-bandwidth memory, or the NMOESI coherency protocol
                     limits the access to that data. Using UMH with NMOESI
                     improves performance of a CPU-multiGPU system by at
                     least 1.92 $ \times $ in comparison to alternative
                     software-based approaches. It also allows the CPU to
                     access GPUs modified data by at least 13 $ \times $
                     faster.},
  bibdate         = {Wed Dec 28 16:24:46 MST 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Spink:2016:HAC,
  author =       "Tom Spink and Harry Wagstaff and Bj{\"o}rn Franke",
  title =        "Hardware-Accelerated Cross-Architecture Full-System
                 Virtualization",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996798",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Hardware virtualization solutions provide users with
                 benefits ranging from application isolation through
                 server consolidation to improved disaster recovery and
                 faster server provisioning. While hardware assistance
                 for virtualization is supported by all major processor
                 architectures, including Intel, ARM, PowerPC, and MIPS,
                 these extensions are targeted at virtualization of the
                 same architecture, for example, an x86 guest on an x86
                 host system. Existing techniques for cross-architecture
                 virtualization, for example, an ARM guest on an x86
                 host, still incur a substantial overhead for CPU,
                 memory, and I/O virtualization due to the necessity for
                 software emulation of these mismatched system
                 components. In this article, we present a new
                 hardware-accelerated hypervisor called Captive,
                 employing a range of novel techniques that exploit
                 existing hardware virtualization extensions for
                 improving the performance of full-system cross-platform
                 virtualization. We illustrate how (1) guest memory
                 management unit (MMU) events and operations can be
                 mapped onto host memory virtualization extensions,
                 eliminating the need for costly software MMU emulation,
                 (2) a block-based dynamic binary translation engine
                 inside the virtual machine can improve CPU
                 virtualization performance, (3) memory-mapped guest I/O
                 can be efficiently translated to fast I/O specific
                 calls to emulated devices, and (4) the cost for
                 asynchronous guest interrupts can be reduced. For an
                 ARM-based Linux guest system running on an x86 host
                 with Intel VT support, we demonstrate application
                 performance levels, based on SPEC CPU2006 benchmarks,
                 of up to 5.88$ \times $ over state-of-the-art Qemu and
                 2.5$ \times $ on average, achieving a guest dynamic
                 instruction throughput of up to 1280 MIPS (million
                 instructions per second) and 915.52 MIPS, on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Shi:2016:LLA,
  author          = {Qingchuan Shi and George Kurian and Farrukh Hijaz and
                     Srinivas Devadas and Omer Khan},
  title           = {{LDAC}: Locality-Aware Data Access Control for
                     Large-Scale Multicore Cache Hierarchies},
  journal         = j-TACO,
  year            = {2016},
  month           = dec,
  volume          = {13},
  number          = {4},
  pages           = {37:1--37:??},
  articleno       = {37},
  DOI             = {https://doi.org/10.1145/2983632},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {The trend of increasing the number of cores to achieve
                     higher performance has challenged efficient management
                     of on-chip data. Moreover, many emerging applications
                     process massive amounts of data with varying degrees of
                     locality. Therefore, exploiting locality to improve
                     on-chip traffic and resource utilization is of
                     fundamental importance. Conventional multicore cache
                     management schemes either manage the private cache (L1)
                     or the Last-Level Cache (LLC), while ignoring the
                     other. We propose a holistic locality-aware cache
                     hierarchy management protocol for large-scale
                     multicores. The proposed scheme improves on-chip data
                     access latency and energy consumption by intelligently
                     bypassing cache line replication in the L1 caches,
                     and/or intelligently replicating cache lines in the
                     LLC. The approach relies on low overhead yet highly
                     accurate in-hardware runtime classification of data
                     locality at both L1 cache and the LLC. The decision to
                     bypass L1 and/or replicate in LLC is then based on the
                     measured reuse at the fine granularity of cache lines.
                     The locality tracking mechanism is decoupled from the
                     sharer tracking structures that cause scalability
                     concerns in traditional cache coherence protocols.
                     Moreover, the complexity of the protocol is low since
                     no additional coherence states are created. However,
                     the proposed classifier incurs a 5.6 KB per-core
                     storage overhead. On a set of parallel benchmarks, the
                     locality-aware protocol reduces average energy
                     consumption by 26\% and completion time by 16\%, when
                     compared to the state-of-the-art Reactive-NUCA
                     multicore cache management scheme.},
  bibdate         = {Wed Dec 28 16:24:46 MST 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Fernandes:2016:EHO,
  author          = {Fernando Fernandes and Lucas Weigel and Claudio Jung
                     and Philippe Navaux and Luigi Carro and Paolo Rech},
  title           = {Evaluation of Histogram of Oriented Gradients Soft
                     Errors Criticality for Automotive Applications},
  journal         = j-TACO,
  year            = {2016},
  month           = dec,
  volume          = {13},
  number          = {4},
  pages           = {38:1--38:??},
  articleno       = {38},
  DOI             = {https://doi.org/10.1145/2998573},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {Pedestrian detection reliability is a key problem for
                     autonomous or aided driving, and methods that use
                     Histogram of Oriented Gradients (HOG) are very popular.
                     Embedded Graphics Processing Units (GPUs) are exploited
                     to run HOG in a very efficient manner. Unfortunately,
                     GPUs architecture has been shown to be particularly
                     vulnerable to radiation-induced failures. This article
                     presents an experimental evaluation and analytical
                     study of HOG reliability. We aim at quantifying and
                     qualifying the radiation-induced errors on pedestrian
                     detection applications executed in embedded GPUs. We
                     analyze experimental results obtained executing HOG on
                     embedded GPUs from two different vendors, exposed for
                     about 100 hours to a controlled neutron beam at Los
                     Alamos National Laboratory. We consider the number and
                     position of detected objects as well as precision and
                     recall to discriminate critical erroneous computations.
                     The reported analysis shows that, while being
                     intrinsically resilient (65\% to 85\% of output errors
                     only slightly impact detection), HOG experienced some
                     particularly critical errors that could result in
                     undetected pedestrians or unnecessary vehicle stops.
                     Additionally, we perform a fault-injection campaign to
                     identify HOG critical procedures. We observe that
                     Resize and Normalize are the most sensitive and
                     critical phases, as about 20\% of injections generate
                     an output error that significantly impacts HOG
                     detection. With our insights, we are able to find those
                     limited portions of HOG that, if hardened, are more
                     likely to increase reliability without introducing
                     unnecessary overhead.},
  bibdate         = {Wed Dec 28 16:24:46 MST 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Dublish:2016:CCG,
  author          = {Saumay Dublish and Vijay Nagarajan and Nigel Topham},
  title           = {Cooperative Caching for {GPUs}},
  journal         = j-TACO,
  year            = {2016},
  month           = dec,
  volume          = {13},
  number          = {4},
  pages           = {39:1--39:??},
  articleno       = {39},
  DOI             = {https://doi.org/10.1145/3001589},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  abstract        = {The rise of general-purpose computing on GPUs has
                     influenced architectural innovation on them. The
                     introduction of an on-chip cache hierarchy is one such
                     innovation. High L1 miss rates on GPUs, however,
                     indicate inefficient cache usage due to myriad factors,
                     such as cache thrashing and extensive multithreading.
                     Such high L1 miss rates in turn place high demands on
                     the shared L2 bandwidth. Extensive congestion in the L2
                     access path therefore results in high memory access
                     latencies. In memory-intensive applications, these
                     latencies get exposed due to a lack of active compute
                     threads to mask such high latencies. In this article,
                     we aim to reduce the pressure on the shared L2
                     bandwidth, thereby reducing the memory access latencies
                     that lie in the critical path. We identify significant
                     replication of data among private L1 caches, presenting
                     an opportunity to reuse data among L1s. We further show
                     how this reuse can be exploited via an L1 Cooperative
                     Caching Network (CCN), thereby reducing the bandwidth
                     demand on L2. In the proposed architecture, we connect
                     the L1 caches with a lightweight ring network to
                     facilitate intercore communication of shared data. We
                     show that this technique reduces traffic to the L2
                     cache by an average of 29\%, freeing up the bandwidth
                     for other accesses. We also show that the CCN reduces
                     the average memory latency by 24\%, thereby reducing
                     core stall cycles by 26\% on average. This translates
                     into an overall performance improvement of 14.7\% on
                     average (and up to 49\%) for applications that exhibit
                     reuse across L1 caches. In doing so, the CCN incurs a
                     nominal area and energy overhead of 1.3\% and 2.5\%,
                     respectively. Notably, the performance improvement with
                     our proposed CCN compares favorably to the performance
                     improvement achieved by simply doubling the number of
                     L2 banks by up to 34\%.},
  bibdate         = {Wed Dec 28 16:24:46 MST 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Tampouratzis:2016:AIH,
  author =       "Nikolaos Tampouratzis and Pavlos M. Mattheakis and
                 Ioannis Papaefstathiou",
  title =        "Accelerating Intercommunication in Highly Parallel
                 Systems",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005717",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Every HPC system consists of numerous processing nodes
                 interconnected using a number of different inter-process
                 communication protocols such as Message Passing
                 Interface (MPI) and Global Arrays (GA). Traditionally,
                 research has focused on optimizing these protocols and
                 identifying the most suitable ones for each system
                 and/or application. Recently, there has been a proposal
                 to unify the primitive operations of the different
                 inter-processor communication protocols through the
                 Portals library. Portals offer a set of low-level
                 communication routines which can be composed in order
                 to implement the functionality of different
                 intercommunication protocols. However, Portals
                 modularity comes at a performance cost, since it adds
                 one more layer in the actual protocol implementation.
                 This work aims at closing the performance gap between a
                 generic and reusable intercommunication layer, such as
                 Portals, and the several monolithic and highly
                 optimized intercommunication protocols. This is
                 achieved through the development of a novel hardware
                 offload engine efficiently implementing the basic
                 Portals' modules. Our innovative system is up to two
                 orders of magnitude faster than the conventional
                 software implementation of Portals, while the speedup
                 achieved over the conventional monolithic software
                 implementations of MPI and GAs is more than an order of
                 magnitude. The power consumption of our hardware system
                 is less than 1/100th of what a low-power CPU consumes
                 when executing the Portals software while its silicon
                 cost is less than 1/10th of that of a very simple RISC
                 CPU. Moreover, our design process is also innovative
                 since we have first modeled the hardware within an
                 untimed virtual prototype which allowed for rapid
                 design space exploration; then we applied a novel
                 methodology to transform the untimed description into
                 an efficient timed hardware description, which was then
                 transformed into a hardware netlist through a
                 High-Level Synthesis (HLS) tool.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Park:2016:CJP,
  author =       "Hyukwoo Park and Myungsu Cha and Soo-Mook Moon",
  title =        "Concurrent {JavaScript} Parsing for Faster Loading of
                 {Web} Apps",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "41:1--41:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3004281",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "JavaScript is a dynamic language mainly used as a
                 client-side web script. Nowadays, web is evolving into
                 an application platform with its web apps, and
                 JavaScript increasingly undertakes complex computations
                 and interactive user interfaces, requiring a
                 high-performance JavaScript engine. There have been
                 many optimizations for efficient JavaScript engines,
                 but one component that has not been optimized much is
                 JavaScript parsing. A JavaScript function needs to be
                 parsed before being executed, and the parsing overhead
                 takes a substantial portion of JavaScript execution
                 time for web apps, especially during app loading. This
                 article proposes concurrent parsing of JavaScript,
                 which performs the parsing of JavaScript functions in
                 advance on different threads, while the main thread is
                 executing the parsed JavaScript functions. This can
                 hide the parsing overhead from the main execution
                 thread, reducing the JavaScript execution time, thus
                 reducing the overall app loading time. More
                 specifically, we separated JavaScript parsing and made
                 it run on different threads without violating the
                 execution semantics of JavaScript. We also designed an
                 efficient multi-threaded parsing architecture, which
                 reduces the synchronization overhead and schedules the
                 parsing requests appropriately. Finally, we explored
                 two methods of choosing the target functions for
                 concurrent parsing: one based on profiled information
                 and the other based on speculative heuristics. We
                 performed experiments on the WebKit browser with the
                 JSC engine for real web apps. The result shows that the
                 proposed concurrent parsing can improve the JavaScript
                 performance during app loading by as much as 64\% and
                 by 39.7\% on average. This improves the whole app
                 loading performance tangibly, by as much as 32.7\% and
                 by 18.2\%, on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Xiong:2016:MAS,
  author =       "Dongliang Xiong and Kai Huang and Xiaowen Jiang and
                 Xiaolang Yan",
  title =        "Memory Access Scheduling Based on Dynamic Multilevel
                 Priority in Shared {DRAM} Systems",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "42:1--42:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007647",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Interapplication interference at shared main memory
                 severely degrades performance and increasing DRAM
                 frequency calls for simple memory schedulers. Previous
                 memory schedulers employ a per-application ranking
                 scheme for high system performance or a per-group
                 ranking scheme for low hardware cost, but few provide a
                 balance. We propose DMPS, a memory scheduler based on
                 dynamic multilevel priority. First, DMPS uses ``memory
                 occupancy'' to measure interference quantitatively.
                 Second, DMPS groups applications, favors
                 latency-sensitive groups, and dynamically prioritizes
                 applications by employing a per-level ranking scheme.
                 The simulation results show that DMPS has 7.2\% better
                 system performance and 22\% better fairness over FRFCFS
                 at low hardware complexity and cost.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{DeSensi:2016:RAP,
  author =       "Daniele {De Sensi} and Massimo Torquati and Marco
                 Danelutto",
  title =        "A Reconfiguration Algorithm for Power-Aware Parallel
                 Applications",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "43:1--43:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3004054",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In current computing systems, many applications
                 require guarantees on their maximum power consumption
                 to not exceed the available power budget. On the other
                 hand, for some applications, it could be possible to
                 decrease their performance, yet maintain an acceptable
                 level, in order to reduce their power consumption. To
                 provide such guarantees, a possible solution consists
                 in changing the number of cores assigned to the
                 application, their clock frequency, and the placement
                 of application threads over the cores. However, power
                 consumption and performance have different trends
                 depending on the application considered and on its
                 input. Finding a configuration of resources satisfying
                 user requirements is, in the general case, a
                 challenging task. In this article, we propose Nornir,
                 an algorithm to automatically derive, without relying
                 on historical data about previous executions,
                 performance and power consumption models of an
                 application in different configurations. By using these
                 models, we are able to select a close-to-optimal
                 configuration for the given user requirement, either
                 performance or power consumption. The configuration of
                 the application will be changed on-the-fly throughout
                 the execution to adapt to workload fluctuations,
                 external interferences, and/or application's phase
                 changes. We validate the algorithm by simulating it
                 over the applications of the Parsec benchmark suite.
                 Then, we implement our algorithm and we analyse its
                 accuracy and overhead over some of these applications
                 on a real execution environment. Eventually, we compare
                 the quality of our proposal with that of the optimal
                 algorithm and of some state-of-the-art solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jantz:2016:IIP,
  author =       "Michael R. Jantz and Forrest J. Robinson and Prasad A.
                 Kulkarni",
  title =        "Impact of Intrinsic Profiling Limitations on
                 Effectiveness of Adaptive Optimizations",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "44:1--44:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3008661",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many performance optimizations rely on or are enhanced
                 by runtime profile information. However, both offline
                 and online profiling techniques suffer from intrinsic
                 and practical limitations that affect the quality of
                 delivered profile data. The quality of profile data is
                 its ability to accurately predict (relevant aspects of)
                 future program behavior. While these limitations are
                 known, their impact on the effectiveness of
                 profile-guided optimizations, compared to the ideal
                 performance, is not as well understood. We define ideal
                 performance for adaptive optimizations as that achieved
                 with a precise profile of future program behavior. In
                 this work, we study and quantify the performance impact
                 of fundamental profiling limitations by comparing the
                 effectiveness of typical adaptive optimizations when
                 using the best profiles generated by offline and online
                 schemes against a baseline where the adaptive
                 optimization is given access to profile information
                 about the future execution of the program. We model and
                 compare the behavior of three adaptive JVM
                 optimizations---heap memory management using object usage
                 profiles, code cache management using method usage
                 profiles, and selective just-in-time compilation using
                 method hotness profiles---for the Java DaCapo benchmarks.
                 Our results provide insight into the advantages and
                 drawbacks of current profiling strategies and shed
                 light on directions for future profiling research.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Damschen:2016:EWP,
  author =       "Marvin Damschen and Lars Bauer and J{\"o}rg Henkel",
  title =        "Extending the {WCET} Problem to Optimize for
                 Runtime-Reconfigurable Processors",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "45:1--45:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3014059",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The correctness of a real-time system does not depend
                 on the correctness of its calculations alone but also
                 on the non-functional requirement of adhering to
                 deadlines. Guaranteeing these deadlines by static
                 timing analysis, however, is practically infeasible for
                 current microarchitectures with out-of-order scheduling
                 pipelines, several hardware threads, and multiple
                 (shared) cache layers. Novel timing-analyzable features
                 are required to sustain the strongly increasing demand
                 for processing power in real-time systems. Recent
                 advances in timing analysis have shown that
                 runtime-reconfigurable instruction set processors are
                 one way to escape the scarcity of analyzable processing
                 power while preserving the flexibility of the system.
                 When moving calculations from software to hardware by
                 means of reconfigurable custom instructions
                 (CIs)---additional to a considerable speedup---the
                 overestimation of a task's worst-case execution time
                 (WCET) can be reduced. CIs typically implement
                 functionality that corresponds to several hundred
                 instructions on the central processing unit (CPU)
                 pipeline. While analyzing instructions for worst-case
                 latency may introduce pessimism, the latency of
                 CIs---executed on the reconfigurable fabric---is precisely
                 known. In this work, we introduce the problem of
                 selecting reconfigurable CIs to optimize the WCET of an
                 application. We model this problem as an extension to
                 state-of-the-art integer linear programming (ILP)-based
                 program path analysis. This way, we enable optimization
                 based on accurate WCET estimates with integration of
                 information about global program flow, for example,
                 infeasible paths. We present an optimal solution with
                 effective techniques to prune the search space and a
                 greedy heuristic that performs a maximum number of
                 steps linear in the number of partitions of
                 reconfigurable area available. Finally, we show the
                 effectiveness of optimizing the WCET on a
                 reconfigurable processor by evaluating a complex
                 multimedia application with multiple reconfigurable CIs
                 for several hardware parameters.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2016:MAP,
  author =       "Zheng Li and Fang Wang and Dan Feng and Yu Hua and
                 Jingning Liu and Wei Tong",
  title =        "{MaxPB}: Accelerating {PCM} Write by Maximizing the
                 Power Budget Utilization",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "46:1--46:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012007",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Phase Change Memory (PCM) is one of the promising
                 memory technologies but suffers from some critical
                 problems such as poor write performance and high write
                 energy consumption. Due to the high write energy
                 consumption and limited power supply, the size of
                 concurrent bit-write is restricted inside one PCM chip.
                 Typically, the size of concurrent bit-write is much
                 less than the cache line size and it is normal that
                 many serially executed write units are consumed to
                 write down the data block to PCM when using it as the
                 main memory. Existing state-of-the-art PCM write
                 schemes, such as FNW (Flip-N-Write) and
                 two-stage-write, address the problem of poor
                 performance by improving the write parallelism under
                 the power constraints. The parallelism is obtained via
                 reducing the data amount and leveraging power as well
                 as time asymmetries, respectively. However, due to the
                 extremely pessimistic assumptions of current
                 utilization (FNW) and optimistic assumptions of
                 asymmetries (two-stage-write), these schemes fail to
                 maximize the power supply utilization and hence improve
                 the write parallelism. In this article, we propose a
                 novel PCM write scheme, called MaxPB (Maximize the
                 Power Budget utilization) to maximize the power budget
                 utilization with minimum changes about the circuits
                 design. MaxPB is a ``think before acting'' method. The
                 main idea of MaxPB is to monitor the actual power needs
                 of all data units first and then effectively package
                 them into the least number of write units under the
                 power constraints. Experimental results show the
                 efficiency and performance improvements on MaxPB. For
                 example, four-core PARSEC and SPEC experimental results
                 show that MaxPB gets 32.0\% and 20.3\% more read
                 latency reduction, 26.5\% and 16.1\% more write latency
                 reduction, 24.3\% and 15.6\% more running time
                 decrease, 1.32$ \times $ and 0.92$ \times $ more
                 speedup, as well as 30.6\% and 18.4\% more energy
                 consumption reduction on average compared with the
                 state-of-the-art FNW and two-stage-write write schemes,
                 respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Muralidharan:2016:DTN,
  author =       "Saurav Muralidharan and Michael Garland and Albert
                 Sidelnik and Mary Hall",
  title =        "Designing a Tunable Nested Data-Parallel Programming
                 System",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "47:1--47:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012011",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes Surge, a nested data-parallel
                 programming system designed to simplify the porting and
                 tuning of parallel applications to multiple target
                 architectures. Surge decouples high-level specification
                 of computations, expressed using a C++ programming
                 interface, from low-level implementation details using
                 two first-class constructs: schedules and policies.
                 Schedules describe the valid ways in which
                 data-parallel operators may be implemented, while
                 policies encapsulate a set of parameters that govern
                 platform-specific code generation. These two mechanisms
                 are used to implement a code generation system that
                 analyzes computations and automatically generates a
                 search space of valid platform-specific
                 implementations. An input and architecture-adaptive
                 autotuning system then explores this search space to
                 find optimized implementations. We express in Surge
                 five real-world benchmarks from domains such as machine
                 learning and sparse linear algebra and from the
                 high-level specifications, Surge automatically
                 generates CPU and GPU implementations that perform on
                 par with or better than manually optimized versions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Akturk:2016:ABN,
  author =       "Ismail Akturk and Riad Akram and Mohammad Majharul
                 Islam and Abdullah Muzahid and Ulya R. Karpuzcu",
  title =        "Accuracy Bugs: a New Class of Concurrency Bugs to
                 Exploit Algorithmic Noise Tolerance",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "48:1--48:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3017991",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Parallel programming introduces notoriously difficult
                 bugs, usually referred to as concurrency bugs. This
                 article investigates the potential for deviating from
                 the conventional wisdom of writing concurrency
                 bug-free, parallel programs. It explores the benefit of
                 accepting buggy but approximately correct parallel
                 programs by leveraging the inherent tolerance of
                 emerging parallel applications to inaccuracy in
                 computations. Under algorithmic noise tolerance, a new
                 class of concurrency bugs, accuracy bugs, degrade the
                 accuracy of computation (often at acceptable levels)
                 rather than causing catastrophic termination. This
                 study demonstrates how embracing accuracy bugs affects
                 the application output quality and performance and
                 analyzes the impact on execution semantics.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tomusk:2016:SHC,
  author =       "Erik Tomusk and Christophe Dubach and Michael
                 O'Boyle",
  title =        "Selecting Heterogeneous Cores for Diversity",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "49:1--49:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3014165",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Mobile devices with heterogeneous processors are
                 becoming mainstream. With a heterogeneous processor,
                 the runtime scheduler can pick the best CPU core for a
                 given task based on program characteristics,
                 performance requirements, and power limitations. For a
                 heterogeneous processor to be effective, it must
                 contain a diverse set of cores to match a range of
                 runtime requirements and program behaviors. Selecting a
                 diverse set of cores is, however, a non-trivial
                 problem. Power and performance are dependent on both
                 program features and the microarchitectural features of
                 cores, and a selection of cores must satisfy the
                 competing demands of different types of programs. We
                 present a method of core selection that chooses cores
                 at a range of power-performance points. Our algorithm
                 is based on the observation that it is not necessary
                 for a core to consistently have high performance or low
                 power; one type of core can fulfill different roles for
                 different types of programs. Given a power budget,
                 cores selected with our method provide an average
                 speedup of 6\% on EEMBC mobile benchmarks and a 24\%
                 speedup on SPEC 2006 integer benchmarks over the
                 state-of-the-art core selection method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Michaud:2016:SMF,
  author =       "Pierre Michaud",
  title =        "Some Mathematical Facts About Optimal Cache
                 Replacement",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "50:1--50:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3017992",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article exposes and proves some mathematical
                 facts about optimal cache replacement that were
                 previously unknown or not proved rigorously. An
                 explicit formula is obtained, giving OPT hits and
                 misses as a function of past references. Several
                 mathematical facts are derived from this formula,
                 including a proof that OPT miss curves are always
                 convex, and a new algorithm called OPT tokens, for
                 reasoning about optimal replacement.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bao:2016:SDF,
  author =       "Wenlei Bao and Changwan Hong and Sudheer Chunduri and
                 Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and
                 Fabrice Rastello and P. Sadayappan",
  title =        "Static and Dynamic Frequency Scaling on Multicore
                 {CPUs}",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3011017",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic Voltage and Frequency Scaling (DVFS) typically
                 adapts CPU power consumption by modifying a processor's
                 operating frequency (and the associated voltage).
                 Typical DVFS approaches include using default
                 strategies such as running at the lowest or the highest
                 frequency or reacting to the CPU's runtime load to
                 reduce or increase frequency based on the CPU usage. In
                 this article, we argue that a compile-time approach to
                 CPU frequency selection is achievable for affine
                 program regions and can significantly outperform
                 runtime-based approaches. We first propose a
                 lightweight runtime approach that can exploit the
                 properties of the power profile specific to a
                 processor, outperforming classical Linux governors such
                 as powersave or on-demand for computational kernels. We
                 then demonstrate that, for affine kernels in the
                 application, a purely compile-time approach to CPU
                 frequency and core count selection is achievable,
                 providing significant additional benefits over the
                 runtime approach. Our framework relies on a one-time
                 profiling of the target CPU, along with a compile-time
                 categorization of loop-based code segments in the
                 application. These are combined to determine at
                 compile-time the frequency and the number of cores to
                 use to execute each affine region to optimize energy or
                 energy-delay product. Extensive evaluation on 60
                 benchmarks and 5 multi-core CPUs show that our approach
                 systematically outperforms the powersave Linux governor
                 while also improving overall performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Vale, Silva, Dias, and Lourenco: TACO volume 13, number 4 (December
%%% 2016), article 52.  {Pot} is braced in the title so the system name
%%% keeps its capital under styles that downcase titles.  The closing
%%% page number is not recorded yet (``??'' placeholder in pages), and
%%% CODEN is likewise a ``????'' placeholder.
@Article{Vale:2016:PDT,
  author =       "Tiago M. Vale and Jo{\~a}o A. Silva and Ricardo J.
                 Dias and Jo{\~a}o M. Louren{\c{c}}o",
  title =        "{Pot}: Deterministic Transactional Execution",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3017993",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents Pot, a system that leverages the
                 concept of preordered transactions to achieve
                 deterministic multithreaded execution of programs that
                 use Transactional Memory. Preordered transactions
                 eliminate the root cause of nondeterminism in
                 transactional execution: they provide the illusion of
                 executing in a deterministic serial order, unlike
                 traditional transactions that appear to execute in a
                 nondeterministic order that can change from execution
                 to execution. Pot uses a new concurrency control
                 protocol that exploits the serialization order to
                 distinguish between fast and speculative transaction
                 execution modes in order to mitigate the overhead of
                 imposing a deterministic order. We build two Pot
                 prototypes: one using STM and another using
                 off-the-shelf HTM. To the best of our knowledge, Pot
                 enables deterministic execution of programs using
                 off-the-shelf HTM for the first time. An experimental
                 evaluation shows that Pot achieves deterministic
                 execution of TM programs with low overhead, sometimes
                 even outperforming nondeterministic executions, and
                 clearly outperforming the state of the art.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Lu and Yao: TACO volume 13, number 4 (December 2016), article 53.
%%% {CMPs} is braced in the title to protect the acronym from style
%%% downcasing.  journal and acknowledgement are bare macro names
%%% (j-TACO, ack-nhfb), presumably defined by string definitions
%%% elsewhere in this file.  The final page is a ``??'' placeholder.
@Article{Lu:2016:AFB,
  author =       "Zhonghai Lu and Yuan Yao",
  title =        "Aggregate Flow-Based Performance Fairness in {CMPs}",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3014429",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In CMPs, multiple co-executing applications create
                 mutual interference when sharing the underlying
                 network-on-chip architecture. Such interference causes
                 different performance slowdowns to different
                 applications. To mitigate the unfairness problem, we
                 treat traffic initiated from the same thread as an
                 aggregate flow such that causal request/reply packet
                 sequences can be allocated to resources consistently
                 and fairly according to online profiled traffic
                 injection rates. Our solution comprises three coherent
                 mechanisms from rate profiling, rate inheritance, and
                 rate-proportional channel scheduling to facilitate and
                 realize unbiased workload-adaptive resource allocation.
                 Full-system evaluations in GEM5 demonstrate that,
                 compared to classic packet-centric and latest
                 application-prioritization approaches, our approach
                 significantly improves weighted speed-up for all
                 multi-application mixtures and achieves nearly ideal
                 performance fairness.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Demir and Hardavellas (the ProLaser paper): TACO volume 13, number 4
%%% (December 2016), article 54.  The abstract is stored as TeX source:
%%% percent signs are escaped as \% and the multiplication sign is
%%% written in inline math.  The final page is a ``??'' placeholder.
@Article{Demir:2016:EPP,
  author =       "Yigit Demir and Nikos Hardavellas",
  title =        "Energy-Proportional Photonic Interconnects",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3018110",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Photonic interconnects have emerged as the prime
                 candidate technology for efficient networks on chip at
                 future process nodes. However, the high optical loss of
                 many nanophotonic components coupled with the low
                 efficiency of current laser sources results in
                 exceedingly high total power requirements for the
                 laser. As optical interconnects stay on even during
                 periods of system inactivity, most of this power is
                 wasted, which has prompted research on laser gating.
                 Unfortunately, prior work has been complicated by the
                 long laser turn-on delays and has failed to deliver the
                 full savings. In this article, we propose ProLaser, a
                 laser control mechanism that monitors the requests sent
                 on the interconnect, the cache, and the coherence
                 directory to detect highly correlated events and turn
                 on proactively the lasers of a photonic interconnect.
                 While ProLaser requires fast lasers with a turn-on
                 delay of a few nanoseconds, a technology that is still
                 experimental, several types of such lasers that are
                 suitable for power gating have already been
                 manufactured over the last decade. Overall, ProLaser
                 saves 42\% to 85\% of the laser power, outperforms the
                 current state of the art by 2$ \times $ on average, and
                 closely tracks (within 2\%--6\%) a perfect prediction
                 scheme with full knowledge of future interconnect
                 requests. Moreover, the power savings of ProLaser allow
                 the cores to exploit a higher-power budget and run
                 faster, achieving speedups of 1.5 to 1.7$ \times $
                 (1.6$ \times $ on average).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Kurt, Krishnamoorthy, Agrawal, and Ren: TACO volume 13, number 4
%%% (December 2016), article 55.  ajournal holds the abbreviated journal
%%% name and fjournal the full one; journal itself is the j-TACO macro.
%%% The final page is a ``??'' placeholder.
@Article{Kurt:2016:UAS,
  author =       "Mehmet Can Kurt and Sriram Krishnamoorthy and Gagan
                 Agrawal and Bin Ren",
  title =        "User-Assisted Store Recycling for Dynamic Task Graph
                 Schedulers",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "55:1--55:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3018111",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The emergence of the multi-core era has led to
                 increased interest in designing effective yet practical
                 parallel programming models. Models based on task
                 graphs that operate on single-assignment data are
                 attractive in several ways. Notably, they can support
                 dynamic applications and precisely represent the
                 available concurrency. However, for efficient
                 execution, they also require nuanced algorithms for
                 scheduling and memory management. In this article, we
                 consider memory-efficient dynamic scheduling of task
                 graphs. Specifically, we present a novel approach for
                 dynamically recycling the memory locations assigned to
                 data items as they are produced by tasks. We develop
                 algorithms to identify memory-efficient store recycling
                 functions by systematically evaluating the validity of
                 a set of user-provided or automatically generated
                 alternatives. Because recycling functions can be input
                 data-dependent, we have also developed support for
                 continued correct execution of a task graph in the
                 presence of a potentially incorrect store recycling
                 function. Experimental evaluation demonstrates that
                 this approach to automatic store recycling incurs
                 little to no overheads, achieves memory usage
                 comparable to the best manually derived solutions,
                 often produces recycling functions valid across problem
                 sizes and input parameters, and efficiently recovers
                 from an incorrect choice of store recycling
                 functions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Haj-Yihia, Yasin, Ben Asher, and Mendelson: TACO volume 13, number 4
%%% (December 2016), article 56.  The two-word surname ``Ben Asher'' is
%%% braced so that BibTeX keeps both words as the last name instead of
%%% reading ``Ben'' as a given name.  {Skylake} is braced in the title
%%% to protect its capital.  The final page is a ``??'' placeholder.
@Article{Haj-Yihia:2016:FGP,
  author =       "Jawad Haj-Yihia and Ahmad Yasin and Yosi {Ben Asher}
                 and Avi Mendelson",
  title =        "Fine-Grain Power Breakdown of Modern Out-of-Order
                 Cores and Its Implications on {Skylake}-Based Systems",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "56:1--56:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3018112",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A detailed analysis of power consumption at low system
                 levels becomes important as a means for reducing the
                 overall power consumption of a system and its thermal
                 hot spots. This work presents a new power estimation
                 method that allows understanding the power breakdown of
                 an application when running on modern processor
                 architecture such as the newly released Intel Skylake
                 processor. This work also provides a detailed power and
                 performance characterization report for the SPEC
                 CPU2006 benchmarks, analysis of the data using
                 side-by-side power and performance breakdowns, as well
                 as few interesting case studies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Scolari, Bartolini, and Santambrogio: TACO volume 13, number 4
%%% (December 2016), article 57.  bibsource lists hash.bib as well as
%%% taco.bib, i.e. the record is cross-listed in that bibliography.
%%% The final page is a ``??'' placeholder.
@Article{Scolari:2016:SCP,
  author =       "Alberto Scolari and Davide Basilio Bartolini and Marco
                 Domenico Santambrogio",
  title =        "A Software Cache Partitioning System for Hash-Based
                 Caches",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "57:1--57:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3018113",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Contention on the shared Last-Level Cache (LLC) can
                 have a fundamental negative impact on the performance
                 of applications executed on modern multicores. An
                 interesting software approach to address LLC contention
                 issues is based on page coloring, which is a software
                 technique that attempts to achieve performance
                 isolation by partitioning a shared cache through
                 careful memory management. The key assumption of
                 traditional page coloring is that the cache is
                 physically addressed. However, recent multicore
                 architectures (e.g., Intel Sandy Bridge and later)
                 switched from a physical addressing scheme to a more
                 complex scheme that involves a hash function.
                 Traditional page coloring is ineffective on these
                 recent architectures. In this article, we extend page
                 coloring to work on these recent architectures by
                 proposing a mechanism able to handle their hash-based
                 LLC addressing scheme. Just as for traditional page
                 coloring, the goal of this new mechanism is to deliver
                 performance isolation by avoiding contention on the
                 LLC, thus enabling predictable performance. We
                 implement this mechanism in the Linux kernel, and
                 evaluate it using several benchmarks from the SPEC
                 CPU2006 and PARSEC 3.0 suites. Our results show that
                 our solution is able to deliver performance isolation
                 to concurrently running applications by enforcing
                 partitioning of a Sandy Bridge LLC, which traditional
                 page coloring techniques are not able to handle.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Mukhanov et al. (the ALEA paper): TACO volume 14, number 1 (April
%%% 2017), article 1.  The surname ``de Supinski'' is written with its
%%% lowercase particle and kept inside one brace group, so BibTeX treats
%%% the two words as a single last name.  {ALEA} is braced in the title
%%% to protect the acronym.  The final page is a ``??'' placeholder.
@Article{Mukhanov:2017:AFG,
  author =       "Lev Mukhanov and Pavlos Petoumenos and Zheng Wang and
                 Nikos Parasyris and Dimitrios S. Nikolopoulos and
                 Bronis R. {de Supinski} and Hugh Leather",
  title =        "{ALEA}: a Fine-Grained Energy Profiling Tool",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3050436",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Energy efficiency is becoming increasingly important,
                 yet few developers understand how source code changes
                 affect the energy and power consumption of their
                 programs. To enable them to achieve energy savings, we
                 must associate energy consumption with software
                 structures, especially at the fine-grained level of
                 functions and loops. Most research in the field relies
                 on direct power/energy measurements taken from on-board
                 sensors or performance counters. However, this coarse
                 granularity does not directly provide the needed
                 fine-grained measurements. This article presents ALEA,
                 a novel fine-grained energy profiling tool based on
                 probabilistic analysis for fine-grained energy
                 accounting. ALEA overcomes the limitations of
                 coarse-grained power-sensing instruments to associate
                 energy information effectively with source code at a
                 fine-grained level. We demonstrate and validate that
                 ALEA can perform accurate energy profiling at various
                 granularity levels on two different architectures:
                 Intel Sandy Bridge and ARM big.LITTLE. ALEA achieves a
                 worst-case error of only 2\% for coarse-grained code
                 structures and 6\% for fine-grained ones, with less
                 than 1\% runtime overhead. Our use cases demonstrate
                 that ALEA supports energy optimizations, with energy
                 savings of up to 2.87 times for a latency-critical
                 option pricing workload under a given power budget.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Pathania, Venkataramani, Shafique, Mitra, and Henkel: TACO volume
%%% 14, number 1 (April 2017), article 2.  bibsource also lists
%%% multithreading.bib, i.e. the record is cross-listed there.  The
%%% umlaut in the last author's given name uses the braced {\"o} form.
%%% The final page is a ``??'' placeholder.
@Article{Pathania:2017:DTM,
  author =       "Anuj Pathania and Vanchinathan Venkataramani and
                 Muhammad Shafique and Tulika Mitra and J{\"o}rg
                 Henkel",
  title =        "Defragmentation of Tasks in Many-Core Architecture",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3050437",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many-cores can execute multiple multithreaded tasks in
                 parallel. A task performs most efficiently when it is
                 executed over a spatially connected and compact subset
                 of cores so that performance loss due to communication
                 overhead imposed by the task's threads spread across
                 the allocated cores is minimal. Over a span of time,
                 unallocated cores can get scattered all over the
                 many-core, creating fragments in the task mapping.
                 These fragments can prevent efficient contiguous
                 mapping of incoming new tasks leading to loss of
                 performance. This problem can be alleviated by using a
                 task defragmenter, which consolidates smaller fragments
                 into larger fragments wherein the incoming tasks can be
                 efficiently executed. Optimal defragmentation of a
                 many-core is an NP-hard problem in the general case.
                 Therefore, we simplify the original problem to a
                 problem that can be solved optimally in polynomial
                 time. In this work, we introduce a concept of
                 exponentially separable mapping (ESM), which defines a
                 set of task mapping constraints on a many-core. We
                 prove that an ESM enforcing many-core can be
                 defragmented optimally in polynomial time.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Zivanovic et al.: TACO volume 14, number 1 (April 2017), article 3.
%%% The surname ``McKee'' carries an interior capital K.  {HPC} is
%%% braced in the title to protect the acronym from style downcasing.
%%% The final page is a ``??'' placeholder.
@Article{Zivanovic:2017:MMH,
  author =       "Darko Zivanovic and Milan Pavlovic and Milan Radulovic
                 and Hyunsung Shin and Jongpil Son and Sally A. McKee
                 and Paul M. Carpenter and Petar Radojkovi{\'c} and
                 Eduard Ayguad{\'e}",
  title =        "Main Memory in {HPC}: Do We Need More or Could We Live
                 with Less?",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3023362",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "An important aspect of High-Performance Computing
                 (HPC) system design is the choice of main memory
                 capacity. This choice becomes increasingly important
                 now that 3D-stacked memories are entering the market.
                 Compared with conventional Dual In-line Memory Modules
                 (DIMMs), 3D memory chiplets provide better performance
                 and energy efficiency but lower memory capacities.
                 Therefore, the adoption of 3D-stacked memories in the
                 HPC domain depends on whether we can find use cases
                 that require much less memory than is available now.
                 This study analyzes the memory capacity requirements of
                 important HPC benchmarks and applications. We find that
                 the High-Performance Conjugate Gradients (HPCG)
                 benchmark could be an important success story for
                 3D-stacked memories in HPC, but High-Performance
                 Linpack (HPL) is likely to be constrained by 3D memory
                 capacity. The study also emphasizes that the analysis
                 of memory footprints of production HPC applications is
                 complex and that it requires an understanding of
                 application scalability and target category, i.e.,
                 whether the users target capability or capacity
                 computing. The results show that most of the HPC
                 applications under study have per-core memory
                 footprints in the range of hundreds of megabytes, but
                 we also detect applications and use cases that require
                 gigabytes per core. Overall, the study identifies the
                 HPC applications and use cases with memory footprints
                 that could be provided by 3D-stacked memory chiplets,
                 making a first step toward adoption of this novel
                 technology in the HPC domain.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Zheng, Wu, and Yang: TACO volume 14, number 1 (April 2017), article
%%% 4.  {WCET} and {I} are braced in the title so they stay uppercase
%%% under styles that downcase titles.  The bracketed years in the
%%% abstract (``Ding et al. [2012]'', ``Puaut [2006]'') are citations
%%% carried over in the abstract text; this record holds no keys for
%%% them.  The final page is a ``??'' placeholder.
@Article{Zheng:2017:WAD,
  author =       "Wenguang Zheng and Hui Wu and Qing Yang",
  title =        "{WCET}-Aware Dynamic {I}-Cache Locking for a Single
                 Task",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046683",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Caches are widely used in embedded systems to bridge
                 the increasing speed gap between processors and
                 off-chip memory. However, caches make it significantly
                 harder to compute the worst-case execution time (WCET)
                 of a task. To alleviate this problem, cache locking has
                 been proposed. We investigate the WCET-aware I-cache
                 locking problem and propose a novel dynamic I-cache
                 locking heuristic approach for reducing the WCET of a
                 task. For a nonnested loop, our approach aims at
                 selecting a minimum set of memory blocks of the loop as
                 locked cache contents by using the min-cut algorithm.
                 For a loop nest, our approach not only aims at
                 selecting a minimum set of memory blocks of the loop
                 nest as locked cache contents but also finds a good
                 loading point for each selected memory block. We
                 propose two algorithms for finding a good loading point
                 for each selected memory block, a polynomial-time
                 heuristic algorithm and an integer linear programming
                 (ILP)-based algorithm, further reducing the WCET of
                 each loop nest. We have implemented our approach and
                 compared it to two state-of-the-art I-cache locking
                 approaches by using a set of benchmarks from the MRTC
                 benchmark suite. The experimental results show that the
                 polynomial-time heuristic algorithm for finding a good
                 loading point for each selected memory block performs
                 almost equally as well as the ILP-based algorithm.
                 Compared to the partial locking approach proposed in
                 Ding et al. [2012], our approach using the heuristic
                 algorithm achieves the average improvements of 33\%,
                 15\%, 9\%, 3\%, 8\%, and 11\% for the 256B, 512B, 1KB,
                 4KB, 8KB, and 16KB caches, respectively. Compared to
                 the dynamic locking approach proposed in Puaut [2006],
                 it achieves the average improvements of 9\%, 19\%,
                 18\%, 5\%, 11\%, and 16\% for the 256B, 512B, 1KB, 4KB,
                 8KB, and 16KB caches, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Yang, Kim, and Moon: TACO volume 14, number 1 (April 2017), article
%%% 5.  {Java} and {VM} are braced in the title to protect their
%%% capitals.  bibsource also lists java2010.bib and
%%% virtual-machines.bib, i.e. the record is cross-listed there.  The
%%% final page is a ``??'' placeholder.
@Article{Yang:2017:EJV,
  author =       "Byung-Sun Yang and Jae-Yun Kim and Soo-Mook Moon",
  title =        "Exceptionization: a {Java} {VM} Optimization for
                 Non-{Java} Languages",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046681",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Java virtual machine (JVM) has recently evolved into a
                 general-purpose language runtime environment to execute
                 popular programming languages such as JavaScript, Ruby,
                 Python, and Scala. These languages have complex
                 non-Java features, including dynamic typing and
                 first-class function, so additional language runtimes
                 (engines) are provided on top of the JVM to support
                 them with bytecode extensions. Although there are
                 high-performance JVMs with powerful just-in-time (JIT)
                 compilers, running these languages efficiently on the
                 JVM is still a challenge. This article introduces a
                 simple and novel technique for the JVM JIT compiler
                 called exceptionization to improve the performance of
                 JVM-based language runtimes. We observed that the JVM
                 executing some non-Java languages encounters at least 2
                 times more branch bytecodes than Java, most of which
                 are highly biased to take only one target.
                 Exceptionization treats such a highly biased branch as
                 some implicit exception-throwing instruction. This
                 allows the JVM JIT compiler to prune the infrequent
                 target of the branch from the frequent control flow,
                 thus compiling the frequent control flow more
                 aggressively with better optimization. If a pruned path
                 were taken, then it would run like a Java exception
                 handler, that is, a catch block. We also devised
                 de-exceptionization, a mechanism to cope with the case
                 when a pruned path is executed more often than
                 expected. Since exceptionization is a generic JVM
                 optimization, independent of any specific language
                 runtime, it would be generally applicable to other
                 language runtimes on the JVM. Our experimental result
                 shows that exceptionization accelerates the performance
                 of several non-Java languages. For example,
                 JavaScript-on-JVM runs faster by as much as 60\% and by
                 6\% on average, when experimented with the Octane
                 benchmark suite on Oracle's latest Nashorn JavaScript
                 engine and HotSpot 1.9 JVM. Furthermore, the
                 performance of Ruby-on-JVM shows an improvement by as
                 much as 60\% and by 6\% on average, while Python-on-JVM
                 improves by as much as 6\% and by 2\% on average. We
                 found that exceptionization is more effective to apply
                 to the branch bytecode of the language runtime itself
                 than the bytecode corresponding to the application code
                 or the bytecode of the Java class libraries. This
                 implies that the performance benefit of
                 exceptionization comes from better JIT compilation of
                 the language runtime of non-Java languages.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Sen and Wood: TACO volume 14, number 1 (April 2017), article 6.
%%% {Pareto} is braced in the title so the proper noun keeps its capital
%%% under styles that downcase titles.  The abstract uses TeX-style
%%% quotation marks around ``super-proportional''.  The final page is a
%%% ``??'' placeholder.
@Article{Sen:2017:PGE,
  author =       "Rathijit Sen and David A. Wood",
  title =        "{Pareto} Governors for Energy-Optimal Computing",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046682",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The original definition of energy-proportional
                 computing does not characterize the energy efficiency
                 of recent reconfigurable computers, resulting in
                 nonintuitive ``super-proportional'' behavior. This
                 article introduces a new definition of ideal
                 energy-proportional computing, new metrics to quantify
                 computational energy waste, and new SLA-aware OS
                 governors that seek Pareto optimality to achieve
                 power-efficient performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Chaudhuri:2017:MSC,
  author = {Mainak Chaudhuri and Mukesh Agrawal and Jayesh Gaur
    and Sreenivas Subramoney},
  title = {Micro-Sector Cache: Improving Space Utilization in
    Sectored {DRAM} Caches},
  journal = j-TACO,
  volume = {14},
  number = {1},
  pages = {7:1--7:??},
  month = apr,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3046680},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:58 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Recent research proposals on DRAM caches with
    conventional allocation units (64 or 128 bytes) as well
    as large allocation units (512 bytes to 4KB) have
    explored ways to minimize the space/latency impact of
    the tag store and maximize the effective utilization of
    the bandwidth. In this article, we study sectored DRAM
    caches that exercise large allocation units called
    sectors, invest reasonably small storage to maintain
    tag/state, enable space- and bandwidth-efficient
    tag/state caching due to low tag working set size and
    large data coverage per tag element, and minimize main
    memory bandwidth wastage by fetching only the useful
    portions of an allocated sector. However, the sectored
    caches suffer from poor space utilization, since a
    large sector is always allocated even if the sector
    utilization is low. The recently proposed Unison cache
    addresses only a special case of this problem by not
    allocating the sectors that have only one active block.
    We propose Micro-sector cache, a locality-aware
    sectored DRAM cache architecture that features a
    flexible mechanism to allocate cache blocks within a
    sector and a locality-aware sector replacement
    algorithm. Simulation studies on a set of 30 16-way
    multi-programmed workloads show that our proposal, when
    incorporated in an optimized Unison cache baseline,
    improves performance (weighted speedup) by 8\%, 14\%,
    and 16\% on average, respectively, for 1KB, 2KB, and
    4KB sectors at 128MB capacity. These performance
    improvements result from significantly better cache
    space utilization, leading to 18\%, 21\%, and 22\%
    average reduction in DRAM cache read misses,
    respectively, for 1KB, 2KB, and 4KB sectors at 128MB
    capacity. We evaluate our proposal for DRAM cache
    capacities ranging from 128MB to 1GB.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {7},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Georgiou:2017:ETD,
  author = {Kyriakos Georgiou and Steve Kerrison and Zbigniew
    Chamski and Kerstin Eder},
  title = {Energy Transparency for Deeply Embedded Programs},
  journal = j-TACO,
  volume = {14},
  number = {1},
  pages = {8:1--8:??},
  month = apr,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3046679},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:58 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Energy transparency is a concept that makes a
    program's energy consumption visible, from hardware up
    to software, through the different system layers. Such
    transparency can enable energy optimizations at each
    layer and between layers, as well as help both
    programmers and operating systems make energy-aware
    decisions. In this article, we focus on deeply embedded
    devices, typically used for Internet of Things (IoT)
    applications, and demonstrate how to enable energy
    transparency through existing static resource analysis
    (SRA) techniques and a new target-agnostic profiling
    technique, without hardware energy measurements. Our
    novel mapping technique enables software energy
    consumption estimations at a higher level than the
    Instruction Set Architecture (ISA), namely the LLVM
    intermediate representation (IR) level, and therefore
    introduces energy transparency directly to the LLVM
    optimizer. We apply our energy estimation techniques to
    a comprehensive set of benchmarks, including single-
    and multithreaded embedded programs from two commonly
    used concurrency patterns: task farms and pipelines.
    Using SRA, our LLVM IR results demonstrate a high
    accuracy with a deviation in the range of 1\% from the
    ISA SRA. Our profiling technique captures the actual
    energy consumption at the LLVM IR level with an average
    error of 3\%.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {8},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Li:2017:LLO,
  author = {Pengcheng Li and Xiaoyu Hu and Dong Chen and Jacob
    Brock and Hao Luo and Eddy Z. Zhang and Chen Ding},
  title = {{LD}: Low-Overhead {GPU} Race Detection Without Access
    Monitoring},
  journal = j-TACO,
  volume = {14},
  number = {1},
  pages = {9:1--9:??},
  month = apr,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3046678},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:58 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Data race detection has become an important problem in
    GPU programming. Previous designs of CPU race-checking
    tools are mainly task parallel and incur high overhead
    on GPUs due to access instrumentation, especially when
    monitoring many thousands of threads routinely used by
    GPU programs. This article presents a novel
    data-parallel solution designed and optimized for the
    GPU architecture. It includes compiler support and a
    set of runtime techniques. It uses value-based
    checking, which detects the races reported in previous
    work, finds new races, and supports race-free
    deterministic GPU execution. More important, race
    checking is massively data parallel and does not
    introduce divergent branching or atomic
    synchronization. Its slowdown is less than $ 5 \times $
    for over half of the tests and $ 10 \times $ on
    average, which is orders of magnitude more efficient
    than the cuda-memcheck tool by Nvidia and the methods
    that use fine-grained access instrumentation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {9},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Palangappa:2017:CCE,
  author =       "Poovaiah M. Palangappa and Kartik Mohanram",
  title =        "{CompEx++}: Compression-Expansion Coding for Energy,
                 Latency, and Lifetime Improvements in {MLC\slash TLC
                 NVMs}",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3050440",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multilevel/triple-level cell nonvolatile memories
                 (MLC/TLC NVMs) such as phase-change memory (PCM) and
                 resistive RAM (RRAM) are the subject of active research
                 and development as replacement candidates for DRAM,
                 which is limited by its high refresh power and poor
                 scaling potential. In addition to the benefits of
                 nonvolatility (low refresh power) and improved
                 scalability, MLC/TLC NVMs offer high data density and
                 memory capacity over DRAM. However, the viability of
                 MLC/TLC NVMs is limited primarily due to the high
                 programming energy and latency as well as the low
                 endurance of NVM cells; these are primarily attributed
                 to the iterative program-and-verify procedure necessary
                 for programming the NVM cells. This article proposes
                 compression-expansion (CompEx) coding, a low overhead
                 scheme that synergistically integrates pattern-based
                 compression with expansion coding to realize
                 simultaneous energy, latency, and lifetime improvements
                 in MLC/TLC NVMs. CompEx coding is agnostic to the
                 choice of compression technique; in this work, we
                 evaluate CompEx coding using both frequent pattern
                 compression (FPC) and base-delta-immediate $ (B \Delta
                 I) $ compression. CompEx coding integrates FPC/$ B
                 \Delta I $ with $ (k, m)_q $ ``expansion'' coding;
                 expansion codes are a class of $q$-ary linear block
                 codes that encode data using only the low energy states
                 of a $q$-ary NVM cell. CompEx coding simultaneously
                 reduces energy and latency and improves lifetime for
                 negligible-to-no memory overhead and negligible logic
                 overhead ($ \approx $ 10k gates, which is $ < 0.1 \% $ per
                 NVM module). Furthermore, we also propose CompEx++
                 coding, which extends CompEx coding by leveraging the
                 variable compressibility of pattern-based compression
                 techniques. CompEx++ coding integrates custom expansion
                 codes to each of the compression patterns to exploit
                 maximum energy/latency benefits of CompEx coding. Our
                 full-system simulations using TLC RRAM show that
                 CompEx/CompEx++ coding reduces total memory energy by
                 57\%/61\% and write latency by 23.5\%/26\%; these
                 improvements translate to a 5.7\%/10.6\% improvement in
                 IPC, a 11.8\%/19.9\% improvement in main memory
                 bandwidth, and $ 1.8 \times $ improvement in lifetime
                 over classical binary coding using data-comparison
                 write. CompEx/CompEx++ coding thus addresses the
                 programming energy/latency and lifetime challenges of
                 MLC/TLC NVMs that pose a serious technological
                 roadblock to their adoption in high-performance
                 computing systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Lee:2017:DBT,
  author = {Dongwoo Lee and Sangheon Lee and Soojung Ryu and
    Kiyoung Choi},
  title = {Dirty-Block Tracking in a Direct-Mapped {DRAM} Cache
    with Self-Balancing Dispatch},
  journal = j-TACO,
  volume = {14},
  number = {2},
  pages = {11:1--11:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3068460},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:59 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Recently, processors have begun integrating 3D stacked
    DRAMs with the cores on the same package, and there
    have been several approaches to effectively utilizing
    the on-package DRAMs as caches. This article presents
    an approach that combines the previous approaches in a
    synergistic way by devising a module called the
    dirty-block tracker to maintain the dirtiness of each
    block in a dirty region. The approach avoids
    unnecessary tag checking for a write operation if the
    corresponding block in the cache is not dirty. Our
    simulation results show that the proposed technique
    achieves a 10.3\% performance improvement on average
    over the state-of-the-art DRAM cache technique.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {11},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Parasyris:2017:SAP,
  author = {Konstantinos Parasyris and Vassilis Vassiliadis and
    Christos D. Antonopoulos and Spyros Lalis and Nikolaos
    Bellas},
  title = {Significance-Aware Program Execution on Unreliable
    Hardware},
  journal = j-TACO,
  volume = {14},
  number = {2},
  pages = {12:1--12:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3058980},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:59 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {This article introduces a significance-centric
    programming model and runtime support that sets the
    supply voltage in a multicore CPU to sub-nominal values
    to reduce the energy footprint and provide mechanisms
    to control output quality. The developers specify the
    significance of application tasks respecting their
    contribution to the output quality and provide check
    and repair functions for handling faults. On a
    multicore system, we evaluate five benchmarks using an
    energy model that quantifies the energy reduction. When
    executing the least-significant tasks unreliably, our
    approach leads to 20\% CPU energy reduction with
    respect to a reliable execution and has minimal quality
    degradation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {12},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Mendonca:2017:DAA,
  author = {Gleison Mendon{\c{c}}a and Breno Guimar{\~a}es and
    P{\'e}ricles Alves and M{\'a}rcio Pereira and Guido
    Ara{\'u}jo and Fernando Magno Quint{\~a}o Pereira},
  title = {{DawnCC}: Automatic Annotation for Data Parallelism
    and Offloading},
  journal = j-TACO,
  volume = {14},
  number = {2},
  pages = {13:1--13:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3084540},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:59 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/pvm.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Directive-based programming models, such as OpenACC
    and OpenMP, allow developers to convert a sequential
    program into a parallel one with minimum human
    intervention. However, inserting pragmas into
    production code is a difficult and error-prone task,
    often requiring familiarity with the target program.
    This difficulty restricts the ability of developers to
    annotate code that they have not written themselves.
    This article provides a suite of compiler-related
    methods to mitigate this problem. Such techniques rely
    on symbolic range analysis, a well-known static
    technique, to achieve two purposes: populate source
    code with data transfer primitives and to disambiguate
    pointers that could hinder automatic parallelization
    due to aliasing. We have materialized our ideas into a
    tool, DawnCC, which can be used stand-alone or through
    an online interface. To demonstrate its effectiveness,
    we show how DawnCC can annotate the programs available
    in PolyBench without any intervention from users. Such
    annotations lead to speedups of over $ 100 \times $ in
    an Nvidia architecture and over $ 50 \times $ in an ARM
    architecture.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {13},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Balasubramonian:2017:CNT,
  author =       "Rajeev Balasubramonian and Andrew B. Kahng and Naveen
                 Muralimanohar and Ali Shafiee and Vaishnav Srinivas",
  title =        "{CACTI 7}: New Tools for Interconnect Exploration in
                 Innovative Off-Chip Memories",
  journal =      j-TACO,
  volume =       "14",
  number =       "2",
  pages =        "14:1--14:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3085572",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:59 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Historically, server designers have opted for simple
                 memory systems by picking one of a few commoditized DDR
                 memory products. We are already witnessing a major
                 upheaval in the off-chip memory hierarchy, with the
                 introduction of many new memory
                 products---buffer-on-board, LRDIMM, HMC, HBM, and NVMs,
                 to name a few. Given the plethora of choices, it is
                 expected that different vendors will adopt different
                 strategies for their high-capacity memory systems,
                 often deviating from DDR standards and/or integrating
                 new functionality within memory systems. These
                 strategies will likely differ in their choice of
                 interconnect and topology, with a significant fraction
                 of memory energy being dissipated in I/O and data
                 movement. To make the case for memory interconnect
                 specialization, this paper makes three contributions.
                 First, we design a tool that carefully models I/O power
                 in the memory system, explores the design space, and
                 gives the user the ability to define new types of
                 memory interconnects/topologies. The tool is validated
                 against SPICE models, and is integrated into version 7
                 of the popular CACTI package. Our analysis with the
                 tool shows that several design parameters have a
                 significant impact on I/O power. We then use the tool
                 to help craft novel specialized memory system channels.
                 We introduce a new relay-on-board chip that partitions
                 a DDR channel into multiple cascaded channels. We show
                 that this simple change to the channel topology can
                 improve performance by 22\% for DDR DRAM and lower cost
                 by up to 65\% for DDR DRAM. This new architecture does
                 not require any changes to DIMMs, and it efficiently
                 supports hybrid DRAM/NVM systems. Finally, as an
                 example of a more disruptive architecture, we design a
                 custom DIMM and parallel bus that moves away from the
                 DDR3/DDR4 standards. To reduce energy and improve
                 performance, the baseline data channel is split into
                 three narrow parallel channels and the on-DIMM
                 interconnects are operated at a lower frequency. In
                 addition, this allows us to design a two-tier error
                 protection strategy that reduces data transfers on the
                 interconnect. This architecture yields a performance
                 improvement of 18\% and a memory power reduction of
                 23\%. The cascaded channel and narrow channel
                 architectures serve as case studies for the new tool
                 and show the potential for benefit from re-organizing
                 basic memory interconnects.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Jatala:2017:SSG,
  author = {Vishwesh Jatala and Jayvant Anantpur and Amey
    Karkare},
  title = {Scratchpad Sharing in {GPUs}},
  journal = j-TACO,
  volume = {14},
  number = {2},
  pages = {15:1--15:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3075619},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:59 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/pvm.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {General-Purpose Graphics Processing Unit (GPGPU)
    applications exploit on-chip scratchpad memory
    available in the Graphics Processing Units (GPUs) to
    improve performance. The amount of thread level
    parallelism (TLP) present in the GPU is limited by the
    number of resident threads, which in turn depends on
    the availability of scratchpad memory in its streaming
    multiprocessor (SM). Since the scratchpad memory is
    allocated at thread block granularity, part of the
    memory may remain unutilized. In this article, we
    propose architectural and compiler optimizations to
    improve the scratchpad memory utilization. Our
    approach, called Scratchpad Sharing, addresses
    scratchpad under-utilization by launching additional
    thread blocks in each SM. These thread blocks use
    unutilized scratchpad memory and also share scratchpad
    memory with other resident blocks. To improve the
    performance of scratchpad sharing, we propose Owner
    Warp First (OWF) scheduling that schedules warps from
    the additional thread blocks effectively. The
    performance of this approach, however, is limited by
    the availability of the part of scratchpad memory that
    is shared among thread blocks. We propose compiler
    optimizations to improve the availability of shared
    scratchpad memory. We describe an allocation scheme
    that helps in allocating scratchpad variables such that
    shared scratchpad is accessed for short duration. We
    introduce a new hardware instruction, relssp, that when
    executed releases the shared scratchpad memory.
    Finally, we describe an analysis for optimal placement
    of relssp instructions, such that shared scratchpad
    memory is released as early as possible, but only after
    its last use, along every execution path. We
    implemented the hardware changes required for
    scratchpad sharing and the relssp instruction using the
    GPGPU-Sim simulator and implemented the compiler
    optimizations in Ocelot framework. We evaluated the
    effectiveness of our approach on 19 kernels from 3
    benchmarks suites: CUDA-SDK, GPGPU-Sim, and Rodinia.
    The kernels that under-utilize scratchpad memory show
    an average improvement of 19\% and maximum improvement
    of 92.17\% in terms of the number of instruction
    executed per cycle when compared to the baseline
    approach, without affecting the performance of the
    kernels that are not limited by scratchpad memory.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {15},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Ham:2017:DDS,
  author = {Tae Jun Ham and Juan L. Arag{\'o}n and Margaret
    Martonosi},
  title = {Decoupling Data Supply from Computation for
    Latency-Tolerant Communication in Heterogeneous
    Architectures},
  journal = j-TACO,
  volume = {14},
  number = {2},
  pages = {16:1--16:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3075620},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:59 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {In today's computers, heterogeneous processing is used
    to meet performance targets at manageable power. In
    adopting increased compute specialization, however, the
    relative amount of time spent on communication
    increases. System and software optimizations for
    communication often come at the costs of increased
    complexity and reduced portability. The Decoupled
    Supply-Compute (DeSC) approach offers a way to attack
    communication latency bottlenecks automatically, while
    maintaining good portability and low complexity. Our
    work expands prior Decoupled Access Execute techniques
    with hardware/software specialization. For a range of
    workloads, DeSC offers roughly 2 $ \times $ speedup,
    and additional specialized compression optimizations
    reduce traffic between decoupled units by 40\%.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {16},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Stanic:2017:IVS,
  author = {Milan Stanic and Oscar Palomar and Timothy Hayes and
    Ivan Ratkovic and Adrian Cristal and Osman Unsal and
    Mateo Valero},
  title = {An Integrated Vector-Scalar Design on an In-Order
    {ARM} Core},
  journal = j-TACO,
  volume = {14},
  number = {2},
  pages = {17:1--17:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3075618},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Jul 24 18:00:59 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {In the low-end mobile processor market, power, energy,
    and area budgets are significantly lower than in the
    server/desktop/laptop/high-end mobile markets. It has
    been shown that vector processors are a highly
    energy-efficient way to increase performance; however,
    adding support for them incurs area and power overheads
    that would not be acceptable for low-end mobile
    processors. In this work, we propose an integrated
    vector-scalar design for the ARM architecture that
    mostly reuses scalar hardware to support the execution
    of vector instructions. The key element of the design
    is our proposed block-based model of execution that
    groups vector computational instructions together to
    execute them in a coordinated manner. We implemented a
    classic vector unit and compare its results against our
    integrated design. Our integrated design improves the
    performance (more than $ 6 \times $) and energy
    consumption (up to $ 5 \times $) of a scalar in-order
    core with negligible area overhead (only 4.7\% when
    using a vector register with 32 elements). In contrast,
    the area overhead of the classic vector unit can be
    significant (around 44\%) if a dedicated vector
    floating-point unit is incorporated. Our block-based
    vector execution outperforms the classic vector unit
    for all kernels with floating-point data and also
    consumes less energy. We also complement the integrated
    design with three energy/performance-efficient
    techniques that further reduce power and increase
    performance. The first proposal covers the design and
    implementation of chaining logic that is optimized to
    work with the cache hierarchy through vector memory
    instructions, the second proposal reduces the number of
    reads/writes from/to the vector register file, and the
    third idea optimizes complex memory access patterns
    with the memory shape instruction and unified indexed
    vector load.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {17},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Endo:2017:IBV,
  author =       "Fernando A. Endo and Arthur Perais and Andr{\'e}
                 Seznec",
  title =        "On the Interactions Between Value Prediction and
                 Compiler Optimizations in the Context of {EOLE}",
  journal =      j-TACO,
  volume =       "14",
  number =       "2",
  pages =        "18:1--18:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3090634",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:59 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Increasing instruction-level parallelism is regaining
                 attractiveness within the microprocessor industry. The
                 \{Early | Out-of-order | Late\} Execution (EOLE)
                 microarchitecture and Differential Value TAgged
                 GEometric (D-VTAGE) value predictor were recently
                 introduced to solve practical issues of Value
                 Prediction (VP). In particular, they remove the most
                 significant difficulties that forbade an effective VP
                 hardware. In this study, we present a detailed
                 evaluation of the potential of VP in the context of
                 EOLE/D-VTAGE and different compiler options. Our study
                 shows that if no single general rule always
                 applies---more optimization might sometimes lead to
                 more performance---unoptimized codes often get a large
                 benefit from the prediction of redundant loads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sridharan:2017:BPP,
  author =       "Aswinkumar Sridharan and Biswabandan Panda and
                 Andr{\'e} Seznec",
  title =        "Band-Pass Prefetching: an Effective Prefetch
                 Management Mechanism Using Prefetch-Fraction Metric in
                 Multi-Core Systems",
  journal =      j-TACO,
  volume =       "14",
  number =       "2",
  pages =        "19:1--19:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3090635",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:59 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In multi-core systems, an application's prefetcher can
                 interfere with the memory requests of other
                 applications using the shared resources, such as last
                 level cache and memory bandwidth. In order to minimize
                 prefetcher-caused interference, prior mechanisms have
                 been proposed to dynamically control prefetcher
                 aggressiveness at runtime. These mechanisms use several
                 parameters to capture prefetch usefulness as well as
                 prefetcher-caused interference, performing aggressive
                 control decisions. However, these mechanisms do not
                 capture the actual interference at the shared resources
                 and most often lead to incorrect aggressiveness control
                 decisions. Therefore, prior works leave scope for
                 performance improvement. Toward this end, we propose a
                 solution to manage prefetching in multicore systems. In
                 particular, we make two fundamental observations:
                 First, a positive correlation exists between the
                 accuracy of a prefetcher and the amount of prefetch
                 requests it generates relative to an application's
                 total (demand and prefetch) requests. Second, a strong
                 positive correlation exists between the ratio of total
                 prefetch to demand requests and the ratio of average
                 last level cache miss service times of demand to
                 prefetch requests. In this article, we propose
                 Band-pass prefetching that builds on those two
                 observations, a simple and low-overhead mechanism to
                 effectively manage prefetchers in multicore systems.
                 Our solution consists of local and global prefetcher
                 aggressiveness control components, which altogether,
                 control the flow of prefetch requests between a range
                 of prefetch to demand requests ratios. From our
                 experiments on 16-core multi-programmed workloads, on
                 systems using stream prefetching, we observe that
                 Band-pass prefetching achieves 12.4\% (geometric-mean)
                 improvement on harmonic speedup over the baseline that
                 implements no prefetching, while aggressive prefetching
                 without prefetcher aggressiveness control and
                 state-of-the-art HPAC, P-FST, and CAFFEINE achieve
                 8.2\%, 8.4\%, 1.4\%, and 9.7\%, respectively. Further
                 evaluation of the proposed Band-pass prefetching
                 mechanism on systems using AMPM prefetcher shows
                 similar performance trends. For a 16-core system,
                 Band-pass prefetching requires only a modest hardware
                 cost of 239 bytes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Goens:2017:SSS,
  author =       {Andr{\'e}s Goens and Sergio Siccha and Jeronimo
                 Castrillon},
  title =        {Symmetry in Software Synthesis},
  journal =      j-TACO,
  year =         {2017},
  month =        jul,
  volume =       {14},
  number =       {2},
  pages =        {20:1--20:??},
  articleno =    {20},
  doi =          {https://doi.org/10.1145/3095747},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {With the surge of multi- and many-core systems, much
                 research has focused on algorithms for mapping and
                 scheduling on these complex platforms. Large classes of
                 these algorithms face scalability problems. This is why
                 diverse methods are commonly used for reducing the
                 search space. While most such approaches leverage the
                 inherent symmetry of architectures and applications,
                 they do it in a problem-specific and intuitive way.
                 However, intuitive approaches become impractical with
                 growing hardware complexity, like Network-on-Chip
                 interconnect or heterogeneous cores. In this article,
                 we present a formal framework that can determine the
                 inherent local and global symmetry of architectures and
                 applications algorithmically and leverage these for
                 problems in software synthesis. Our approach is based
                 on the mathematical theory of groups and a
                 generalization called inverse semigroups. We evaluate
                 our approach in two state-of-the-art mapping
                 frameworks. Even for the platforms with a handful of
                 cores of today and moderate-sized benchmarks, our
                 approach consistently yields reductions of the overall
                 execution time of algorithms. We obtain a speedup of
                 more than $ 10 \times $ for one use-case and saved 10\%
                 of time in another.},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Mon Jul 24 18:00:59 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vocke:2017:EHI,
  author =       "Sander Vocke and Henk Corporaal and Roel Jordans and
                 Rosilde Corvino and Rick Nas",
  title =        "Extending {Halide} to Improve Software Development for
                 Imaging {DSPs}",
  journal =      j-TACO,
  volume =       "14",
  number =       "3",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106343",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Sep 6 17:12:05 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Specialized Digital Signal Processors (DSPs), which
                 can be found in a wide range of modern devices, play an
                 important role in power-efficient, high-performance
                 image processing. Applications including camera sensor
                 post-processing and computer vision benefit from being
                 (partially) mapped onto such DSPs. However, due to
                 their specialized instruction sets and dependence on
                 low-level code optimization, developing applications
                 for DSPs is more time-consuming and error-prone than
                 for general-purpose processors. Halide is a
                 domain-specific language (DSL) that enables low-effort
                 development of portable, high-performance imaging
                 pipelines---a combination of qualities that is hard, if
                 not impossible, to find among DSP programming models.
                 We propose a set of extensions and modifications to
                 Halide to generate code for DSP C compilers, focusing
                 specifically on diverse SIMD target instruction sets
                 and heterogeneous scratchpad memory hierarchies. We
                 implement said techniques for a commercial DSP found in
                 an Intel Image Processing Unit (IPU), demonstrating
                 that this solution can be used to achieve performance
                 within 20\% of highly tuned, manually written C code,
                 while leading to a reduction in code complexity. By
                 comparing performance of Halide algorithms using our
                 solution to results on CPU and GPU targets, we confirm
                 the value of using DSP targets with Halide.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jensen:2017:ILD,
  author =       {Nicklas Bo Jensen and Sven Karlsson},
  title =        {Improving Loop Dependence Analysis},
  journal =      j-TACO,
  year =         {2017},
  month =        sep,
  volume =       {14},
  number =       {3},
  pages =        {22:1--22:??},
  articleno =    {22},
  doi =          {https://doi.org/10.1145/3095754},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Programmers can no longer depend on new processors to
                 have significantly improved single-thread performance.
                 Instead, gains have to come from other sources such as
                 the compiler and its optimization passes. Advanced
                 passes make use of information on the dependencies
                 related to loops. We improve the quality of that
                 information by reusing the information given by the
                 programmer for parallelization. We have implemented a
                 prototype based on GCC into which we also add a new
                 optimization pass. Our approach improves the amount of
                 correctly classified dependencies resulting in 46\%
                 average improvement in single-thread performance for
                 kernel benchmarks compared to GCC 6.1.},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ganser:2017:ISO,
  author =       {Stefan Ganser and Armin Gr{\"o}sslinger and Norbert
                 Siegmund and Sven Apel and Christian Lengauer},
  title =        {Iterative Schedule Optimization for Parallelization in
                 the Polyhedron Model},
  journal =      j-TACO,
  year =         {2017},
  month =        sep,
  volume =       {14},
  number =       {3},
  pages =        {23:1--23:??},
  articleno =    {23},
  doi =          {https://doi.org/10.1145/3109482},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {The polyhedron model is a powerful model to identify
                 and apply systematically loop transformations that
                 improve data locality (e.g., via tiling) and enable
                 parallelization. In the polyhedron model, a loop
                 transformation is, essentially, represented as an
                 affine function. Well-established algorithms for the
                 discovery of promising transformations are based on
                 performance models. These algorithms have the drawback
                 of not being easily adaptable to the characteristics of
                 a specific program or target hardware. An iterative
                 search for promising loop transformations is more
                 easily adaptable and can help to learn better models.
                 We present an iterative optimization method in the
                 polyhedron model that targets tiling and
                 parallelization. The method enables either a sampling
                 of the search space of legal loop transformations at
                 random or a more directed search via a genetic
                 algorithm. For the latter, we propose a set of novel,
                 tailored reproduction operators. We evaluate our
                 approach against existing iterative and model-driven
                 optimization strategies. We compare the convergence
                 rate of our genetic algorithm to that of random
                 exploration. Our approach of iterative optimization
                 outperforms existing optimization techniques in that it
                 finds loop transformations that yield significantly
                 higher performance. If well configured, then random
                 exploration turns out to be very effective and reduces
                 the need for a genetic algorithm.},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wei:2017:HHM,
  author =       {Wei Wei and Dejun Jiang and Jin Xiong and Mingyu
                 Chen},
  title =        {{HAP}: Hybrid-Memory-Aware Partition in Shared
                 Last-Level Cache},
  journal =      j-TACO,
  year =         {2017},
  month =        sep,
  volume =       {14},
  number =       {3},
  pages =        {24:1--24:??},
  articleno =    {24},
  doi =          {https://doi.org/10.1145/3106340},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Data-center servers benefit from large-capacity memory
                 systems to run multiple processes simultaneously.
                 Hybrid DRAM-NVM memory is attractive for increasing
                 memory capacity by exploiting the scalability of
                 Non-Volatile Memory (NVM). However, current LLC
                 policies are unaware of hybrid memory. Cache misses to
                 NVM introduce high cost due to long NVM latency.
                 Moreover, evicting dirty NVM data suffer from long
                 write latency. We propose hybrid memory aware cache
                 partitioning to dynamically adjust cache spaces and
                 give NVM dirty data more chances to reside in LLC.
                 Experimental results show Hybrid-memory-Aware Partition
                 (HAP) improves performance by 46.7\% and reduces energy
                 consumption by 21.9\% on average against LRU
                 management. Moreover, HAP averagely improves
                 performance by 9.3\% and reduces energy consumption by
                 6.4\% against a state-of-the-art cache mechanism.},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Xiong:2017:PPP,
  author =       {Dongliang Xiong and Kai Huang and Xiaowen Jiang and
                 Xiaolang Yan},
  title =        {Providing Predictable Performance via a Slowdown
                 Estimation Model},
  journal =      j-TACO,
  year =         {2017},
  month =        sep,
  volume =       {14},
  number =       {3},
  pages =        {25:1--25:??},
  articleno =    {25},
  doi =          {https://doi.org/10.1145/3124451},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Interapplication interference at shared main memory
                 slows down different applications differently. A few
                 slowdown estimation models have been proposed to
                 provide predictable performance by quantifying memory
                 interference, but they have relatively low accuracy.
                 Thus, we propose a more accurate slowdown estimation
                 model called SEM at main memory. First, SEM unifies the
                 slowdown estimation model by measuring IPC directly.
                 Second, SEM uses the per-bank structure to monitor
                 memory interference and improves estimation accuracy by
                 considering write interference, row-buffer
                 interference, and data bus interference. The evaluation
                 results show that SEM has significantly lower slowdown
                 estimation error (4.06\%) compared to STFM (30.15\%)
                 and MISE (10.1\%).},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Pu:2017:PHS,
  author =       "Jing Pu and Steven Bell and Xuan Yang and Jeff Setter
                 and Stephen Richardson and Jonathan Ragan-Kelley and
                 Mark Horowitz",
  title =        "Programming Heterogeneous Systems from an Image
                 Processing {DSL}",
  journal =      j-TACO,
  volume =       "14",
  number =       "3",
  pages =        "26:1--26:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3107953",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Sep 6 17:12:05 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Specialized image processing accelerators are
                 necessary to deliver the performance and energy
                 efficiency required by important applications in
                 computer vision, computational photography, and
                 augmented reality. But creating, ``programming,'' and
                 integrating this hardware into a hardware/software
                 system is difficult. We address this problem by
                 extending the image processing language Halide so users
                 can specify which portions of their applications should
                 become hardware accelerators, and then we provide a
                 compiler that uses this code to automatically create
                 the accelerator along with the ``glue'' code needed for
                 the user's application to access this hardware.
                 Starting with Halide not only provides a very
                 high-level functional description of the hardware but
                 also allows our compiler to generate a complete
                 software application, which accesses the hardware for
                 acceleration when appropriate. Our system also provides
                 high-level semantics to explore different mappings of
                 applications to a heterogeneous system, including the
                 flexibility of being able to change the throughput rate
                 of the generated hardware. We demonstrate our approach
                 by mapping applications to a commercial Xilinx Zynq
                 system. Using its FPGA with two low-power ARM cores,
                 our design achieves up to $ 6 \times $ higher
                 performance and $ 38 \times $ lower energy compared to
                 the quad-core ARM CPU on an NVIDIA Tegra K1, and $ 3.5
                 \times $ higher performance with $ 12 \times $ lower
                 energy compared to the K1's 192-core GPU.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hroub:2017:EGC,
  author =       "Ayman Hroub and M. E. S. Elrabaa and M. F. Mudawar and
                 A. Khayyat",
  title =        "Efficient Generation of Compact Execution Traces for
                 Multicore Architectural Simulations",
  journal =      j-TACO,
  volume =       "14",
  number =       "3",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106342",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Sep 6 17:12:05 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Requiring no functional simulation, trace-driven
                 simulation has the potential of achieving faster
                 simulation speeds than execution-driven simulation of
                 multicore architectures. An efficient, on-the-fly,
                 high-fidelity trace generation method for multithreaded
                 applications is reported. The generated trace is
                 encoded in an instruction-like binary format that can
                 be directly ``interpreted'' by a timing simulator to
                 simulate a general load/store or x86-like architecture.
                 A complete tool suite that has been developed and used
                 for evaluation of the proposed method showed that it
                 produces smaller traces over existing trace compression
                 methods while retaining good fidelity including all
                 threading- and synchronization-related events.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Weber:2017:MAL,
  author =       {Nicolas Weber and Michael Goesele},
  title =        {{MATOG}: Array Layout Auto-Tuning for {CUDA}},
  journal =      j-TACO,
  year =         {2017},
  month =        sep,
  volume =       {14},
  number =       {3},
  pages =        {28:1--28:??},
  articleno =    {28},
  doi =          {https://doi.org/10.1145/3106341},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Optimal code performance is (besides correctness and
                 accuracy) the most important objective in compute
                 intensive applications. In many of these applications,
                 Graphic Processing Units (GPUs) are used because of
                 their high amount of compute power. However, caused by
                 their massively parallel architecture, the code has to
                 be specifically adjusted to the underlying hardware to
                 achieve optimal performance and therefore has to be
                 reoptimized for each new generation. In reality, this
                 is usually not the case as productive code is normally
                 at least several years old and nobody has the time to
                 continuously adjust existing code to new hardware. In
                 recent years more and more approaches have emerged that
                 automatically tune the performance of applications
                 toward the underlying hardware. In this article, we
                 present the MATOG auto-tuner and its concepts. It
                 abstracts the array memory access in CUDA applications
                 and automatically optimizes the code according to the
                 used GPUs. MATOG only requires few profiling runs to
                 analyze even complex applications, while achieving
                 significant speedups over non-optimized code,
                 independent of the used GPU generation and without the
                 need to manually tune the code.},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ashouri:2017:MMC,
  author =       {Amir H. Ashouri and Andrea Bignoli and Gianluca
                 Palermo and Cristina Silvano and Sameer Kulkarni and
                 John Cavazos},
  title =        {{MiCOMP}: Mitigating the Compiler Phase-Ordering
                 Problem Using Optimization Sub-Sequences and Machine
                 Learning},
  journal =      j-TACO,
  year =         {2017},
  month =        sep,
  volume =       {14},
  number =       {3},
  pages =        {29:1--29:??},
  articleno =    {29},
  doi =          {https://doi.org/10.1145/3124452},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Recent compilers offer a vast number of multilayered
                 optimizations targeting different code segments of an
                 application. Choosing among these optimizations can
                 significantly impact the performance of the code being
                 optimized. The selection of the right set of compiler
                 optimizations for a particular code segment is a very
                 hard problem, but finding the best ordering of these
                 optimizations adds further complexity. Finding the best
                 ordering represents a long standing problem in
                 compilation research, named the phase-ordering problem.
                 The traditional approach of constructing compiler
                 heuristics to solve this problem simply cannot cope
                 with the enormous complexity of choosing the right
                 ordering of optimizations for every code segment in an
                 application. This article proposes an automatic
                 optimization framework we call MiCOMP, which Mitigates
                 the COMpiler Phase-ordering problem. We perform phase
                 ordering of the optimizations in LLVM's highest
                 optimization level using optimization sub-sequences and
                 machine learning. The idea is to cluster the
                 optimization passes of LLVM's O3 setting into different
                 clusters to predict the speedup of a complete sequence
                 of all the optimization clusters instead of having to
                 deal with the ordering of more than 60 different
                 individual optimizations. The predictive model uses (1)
                 dynamic features, (2) an encoded version of the
                 compiler sequence, and (3) an exploration heuristic to
                 tackle the problem. Experimental results using the LLVM
                 compiler framework and the Cbench suite show the
                 effectiveness of the proposed clustering and encoding
                 techniques to application-based reordering of passes,
                 while using a number of predictive models. We perform
                 statistical analysis on the results and compare against
                 (1) random iterative compilation, (2) standard
                 optimization levels, and (3) two recent prediction
                 approaches. We show that MiCOMP's iterative compilation
                 using its sub-sequences can reach an average
                 performance speedup of 1.31 (up to 1.51). Additionally,
                 we demonstrate that MiCOMP's prediction model
                 outperforms the -O1, -O2, and -O3 optimization levels
                 within using just a few predictions and reduces the
                 prediction error rate down to only 5\%. Overall, it
                 achieves 90\% of the available speedup by exploring
                 less than 0.001\% of the optimization space.},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vermij:2017:AIN,
  author =       "Erik Vermij and Leandro Fiorin and Rik Jongerius and
                 Christoph Hagleitner and Jan {Van Lunteren} and Koen
                 Bertels",
  title =        "An Architecture for Integrated Near-Data Processors",
  journal =      j-TACO,
  volume =       "14",
  number =       "3",
  pages =        "30:1--30:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127069",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Sep 6 17:12:05 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To increase the performance of data-intensive
                 applications, we present an extension to a CPU
                 architecture that enables arbitrary near-data
                 processing capabilities close to the main memory. This
                 is realized by introducing a component attached to the
                 CPU system-bus and a component at the memory side.
                 Together they support hardware-managed coherence and
                 virtual memory support to integrate the near-data
                 processors in a shared-memory environment. We present
                 an implementation of the components, as well as a
                 system-simulator, providing detailed performance
                 estimations. With a variety of synthetic workloads we
                 demonstrate the performance of the memory accesses, the
                 mixed fine- and coarse-grained coherence mechanisms,
                 and the near-data processor communication mechanism.
                 Furthermore, we quantify the inevitable start-up
                 penalty regarding coherence and data writeback, and
                 argue that near-data processing workloads should access
                 data several times to offset this penalty. A case study
                 based on the Graph500 benchmark confirms the small
                 overhead for the proposed coherence mechanisms and
                 shows the ability to outperform a real CPU by a factor
                 of two.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Diavastos:2017:SLR,
  author =       "Andreas Diavastos and Pedro Trancoso",
  title =        "{SWITCHES}: a Lightweight Runtime for Dataflow
                 Execution of Tasks on Many-Cores",
  journal =      j-TACO,
  volume =       "14",
  number =       "3",
  pages =        "31:1--31:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127068",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Sep 6 17:12:05 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "SWITCHES is a task-based dataflow runtime that
                 implements a lightweight distributed triggering system
                 for runtime dependence resolution and uses static
                 scheduling and compile-time assignment policies to
                 reduce runtime overheads. Unlike other systems, the
                 granularity of loop-tasks can be increased to favor
                 data-locality, even when having dependences across
                 different loops. SWITCHES introduces explicit task
                 resource allocation mechanisms for efficient allocation
                 of resources and adopts the latest OpenMP Application
                 Programming Interface (API), as to maintain high levels
                 of programming productivity. It provides a
                 source-to-source tool that automatically produces
                 thread-based code. Performance on an Intel Xeon-Phi
                 shows good scalability and surpasses OpenMP by an
                 average of 32\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jain:2017:CMA,
  author =       "Rahul Jain and Preeti Ranjan Panda and Sreenivas
                 Subramoney",
  title =        "Cooperative Multi-Agent Reinforcement Learning-Based
                 Co-optimization of Cores, Caches, and On-chip Network",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132170",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern multi-core systems provide huge computational
                 capabilities, which can be used to run multiple
                 processes concurrently. To achieve the best possible
                 performance within limited power budgets, the various
                 system resources need to be allocated effectively. Any
                 mismatch between runtime resource requirement and
                 allocation leads to a sub-optimal energy-delay product
                 (EDP). Different optimization techniques exist for
                 addressing the problem of mismatch between the dynamic
                 requirement and runtime allocation of the system
                 resources. Choosing between multiple optimizations at
                 runtime is complex due to the non-additive effects,
                 making the scenario suitable for the application of
                 machine learning techniques. We present a novel method,
                 Machine Learned Machines (MLM), by using online
                 reinforcement learning (RL) to perform dynamic
                 partitioning of the last level cache (LLC), along with
                 dynamic voltage and frequency scaling (DVFS) of the
                 core and uncore (interconnection network and LLC). We
                 have proposed and evaluated three different MLM
                 co-optimization techniques based on independent and
                 cooperative multi-agent learners. We show that the
                 co-optimization results in a much lower system EDP than
                 any of the techniques applied individually. We explore
                 various RL models targeted toward optimization of
                 different system metrics and study their effects on a
                 system EDP, system throughput (STP), and Fairness. The
                 various proposed techniques have been extensively
                 evaluated with a mix of 20 workloads on a 4-core system
                 using Spec2006 benchmarks. We have further evaluated
                 our cooperative MLM techniques on a 16-core system. The
                 results show an average of 20.5\% and 19.1\% system EDP
                 improvement on a 4-core and 16-core system,
                 respectively, with limited degradation of STP and
                 Fairness.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{DeSensi:2017:BPP,
  author =       "Daniele {De Sensi} and Tiziano {De Matteis} and
                 Massimo Torquati and Gabriele Mencagli and Marco
                 Danelutto",
  title =        "Bringing Parallel Patterns Out of the Corner: The
                 {P$^3$ARSEC} Benchmark Suite",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "33:1--33:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132710",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High-level parallel programming is an active research
                 topic aimed at promoting parallel programming
                 methodologies that provide the programmer with
                 high-level abstractions to develop complex parallel
                 software with reduced time to solution. Pattern-based
                 parallel programming is based on a set of composable
                 and customizable parallel patterns used as basic
                 building blocks in parallel applications. In recent
                 years, a considerable effort has been made in
                 empowering this programming model with features able to
                 overcome shortcomings of early approaches concerning
                 flexibility and performance. In this article, we
                 demonstrate that the approach is flexible and efficient
                 enough by applying it on 12 out of 13 PARSEC
                 applications. Our analysis, conducted on three
                 different multicore architectures, demonstrates that
                 pattern-based parallel programming has reached a good
                 level of maturity, providing comparable results in
                 terms of performance with respect to both other
                 parallel programming methodologies based on
                 pragma-based annotations (i.e., OpenMP and OmpSs) and
                 native implementations (i.e., Pthreads). Regarding the
                 programming effort, we also demonstrate a considerable
                 reduction in lines of code and code churn compared to
                 Pthreads and comparable results with respect to other
                 existing implementations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ye:2017:CES,
  author =       "Chencheng Ye and Chen Ding and Hao Luo and Jacob Brock
                 and Dong Chen and Hai Jin",
  title =        "Cache Exclusivity and Sharing: Theory and
                 Optimization",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "34:1--34:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3134437",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A problem on multicore systems is cache sharing, where
                 the cache occupancy of a program depends on the cache
                 usage of peer programs. Exclusive cache hierarchy as
                 used on AMD processors is an effective solution to
                 allow processor cores to have a large private cache
                 while still benefitting from shared cache. The shared
                 cache stores the ``victims'' (i.e., data evicted from
                 private caches). The performance depends on how victims
                 of co-run programs interact in shared cache. This
                 article presents a new metric called the victim
                 footprint (VFP). It is measured once per program in its
                 solo execution and can then be combined to compute the
                 performance of any exclusive cache hierarchy, replacing
                 parallel testing with theoretical analysis. The work
                 evaluates the VFP by using it to analyze cache sharing
                 by parallel mixes of sequential programs, comparing the
                 accuracy of the theory to hardware counter results, and
                 measuring the benefit of exclusivity-aware analysis and
                 optimization.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Shrivastava:2017:EEC,
  author =       "Rahul Shrivastava and V. Krishna Nandivada",
  title =        "Energy-Efficient Compilation of Irregular
                 Task-Parallel Loops",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "35:1--35:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3136063",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Energy-efficient compilation is an important problem
                 for multi-core systems. In this context, irregular
                 programs with task-parallel loops present interesting
                 challenges: the threads with lesser work-loads
                 (non-critical-threads) wait at the join-points for the
                 thread with maximum work-load (critical-thread); this
                 leads to significant energy wastage. This problem
                 becomes more interesting in the context of
                 multi-socket-multi-core (MSMC) systems, where different
                 sockets may run at different frequencies, but all the
                 cores connected to a socket run at a single frequency.
                 In such a configuration, even though the load-imbalance
                 among the cores may be significant, an MSMC-oblivious
                 technique may miss the opportunities to reduce energy
                 consumption, if the load-imbalance across the sockets
                 is minimal. This problem becomes further challenging in
                 the presence of mutual-exclusion, where scaling the
                 frequencies of a socket executing the
                 non-critical-threads can impact the execution time of
                 the critical-threads. In this article, we propose a
                 scheme (X10Ergy) to obtain energy gains with minimal
                 impact on the execution time, for task-parallel
                 languages, such as X10, HJ, and so on. X10Ergy takes as
                 input a loop-chunked program (parallel-loop iterations
                 divided into chunks and each chunk is executed by a
                 unique thread). X10Ergy follows a mixed compile-time +
                 runtime approach that (i) uses static analysis to
                 efficiently compute the work-load of each chunk at
                 runtime, (ii) computes the ``remaining'' work-load of
                 the chunks running on the cores of each socket at
                 regular intervals and tunes the frequency of the
                 sockets accordingly, (iii) groups the threads into
                 different sockets (based on the remaining work-load of
                 their respective chunks), and (iv) in the presence of
                 atomic-blocks, models the effect of frequency-scaling
                 on the critical-thread. We implemented X10Ergy for X10
                 and have obtained encouraging results for the IMSuite
                 kernels.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Proy:2017:CAL,
  author =       "Julien Proy and Karine Heydemann and Alexandre Berzati
                 and Albert Cohen",
  title =        "Compiler-Assisted Loop Hardening Against Fault
                 Attacks",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3141234",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Secure elements widely used in smartphones, digital
                 consumer electronics, and payment systems are subject
                 to fault attacks. To thwart such attacks, software
                 protections are manually inserted requiring experts and
                 time. The explosion of the Internet of Things (IoT) in
                 home, business, and public spaces motivates the
                 hardening of a wider class of applications and the need
                 to offer security solutions to non-experts. This
                 article addresses the automated protection of loops at
                 compilation time, covering the widest range of control-
                 and data-flow patterns, in both shape and complexity.
                 The security property we consider is that a sensitive
                 loop must always perform the expected number of
                 iterations; otherwise, an attack must be reported. We
                 propose a generic compile-time loop hardening scheme
                 based on the duplication of termination conditions and
                 of the computations involved in the evaluation of such
                 conditions. We also investigate how to preserve the
                 security property along the compilation flow while
                 enabling aggressive optimizations. We implemented this
                 algorithm in LLVM 4.0 at the Intermediate
                 Representation (IR) level in the backend. On average,
                 the compiler automatically hardens 95\% of the
                 sensitive loops of typical security benchmarks, and
                 98\% of these loops are shown to be robust to simulated
                 faults. Performance and code size overhead remain quite
                 affordable, at 12.5\% and 14\%, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Peterson:2017:TCT,
  author =       "Christina Peterson and Damian Dechev",
  title =        "A Transactional Correctness Tool for Abstract Data
                 Types",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "37:1--37:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148964",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Transactional memory simplifies multiprocessor
                 programming by providing the guarantee that a
                 sequential block of code in the form of a transaction
                 will exhibit atomicity and isolation. Transactional
                 data structures offer the same guarantee to concurrent
                 data structures by enabling the atomic execution of a
                 composition of operations. The concurrency control of
                 transactional memory systems preserves atomicity and
                 isolation by detecting read/write conflicts among
                 multiple concurrent transactions. State-of-the-art
                 transactional data structures improve on this
                 concurrency control protocol by providing explicit
                 transaction-level synchronization for only
                 non-commutative operations. Since read/write conflicts
                 are handled by thread-level concurrency control, the
                 correctness of transactional data structures cannot be
                 evaluated according to the read/write histories. This
                 presents a challenge for existing correctness
                 verification techniques for transactional memory,
                 because correctness is determined according to the
                 transitions taken by the transactions in the presence
                 of read/write conflicts. In this article, we present
                 Transactional Correctness tool for Abstract Data Types
                 (TxC-ADT), the first tool that can check the
                 correctness of transactional data structures. TxC-ADT
                 elevates the standard definitions of transactional
                 correctness to be in terms of an abstract data type, an
                 essential aspect for checking correctness of
                 transactions that synchronize only for high-level
                 semantic conflicts. To accommodate a diverse assortment
                 of transactional correctness conditions, we present a
                 technique for defining correctness as a happens-before
                 relation. Defining a correctness condition in this
                 manner enables an automated approach in which
                 correctness is evaluated by generating and analyzing a
                 transactional happens-before graph during model
                 checking. A transactional happens-before graph is
                 maintained on a per-thread basis, making our approach
                 applicable to transactional correctness conditions that
                 do not enforce a total order on a transactional
                 execution. We demonstrate the practical applications of
                 TxC-ADT by checking Lock Free Transactional
                 Transformation and Transactional Data Structure
                 Libraries for serializability, strict serializability,
                 opacity, and causal consistency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ferroni:2017:PCM,
  author =       "Matteo Ferroni and Andrea Corna and Andrea Damiani and
                 Rolando Brondolin and Juan A. Colmenares and Steven
                 Hofmeyr and John D. Kubiatowicz and Marco D.
                 Santambrogio",
  title =        "Power Consumption Models for Multi-Tenant Server
                 Infrastructures",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "38:1--38:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148965",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multi-tenant virtualized infrastructures allow cloud
                 providers to minimize costs through workload
                 consolidation. One of the largest costs is power
                 consumption, which is challenging to understand in
                 heterogeneous environments. We propose a power modeling
                 methodology that tackles this complexity using a
                 divide-and-conquer approach. Our results outperform
                 previous research work, achieving a relative error of
                 2\% on average and under 4\% in almost all cases.
                 Models are portable across similar architectures,
                 enabling predictions of power consumption before
                 migrating a tenant to a different hardware platform.
                 Moreover, we show the models allow us to evaluate
                 colocations of tenants to reduce overall consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mohammadi:2017:COE,
  author =       "Milad Mohammadi and Tor M. Aamodt and William J.
                 Dally",
  title =        "{CG-OoO}: Energy-Efficient Coarse-Grain Out-of-Order
                 Execution Near In-Order Energy with Near Out-of-Order
                 Performance",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3151034",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We introduce the Coarse-Grain Out-of-Order (CG-OoO)
                 general-purpose processor designed to achieve close to
                 In-Order (InO) processor energy while maintaining
                 Out-of-Order (OoO) performance. CG-OoO is an
                 energy-performance-proportional architecture.
                 Block-level code processing is at the heart of this
                 architecture; CG-OoO speculates, fetches, schedules,
                 and commits code at block-level granularity. It
                 eliminates unnecessary accesses to energy-consuming
                 tables and turns large tables into smaller, distributed
                 tables that are cheaper to access. CG-OoO leverages
                 compiler-level code optimizations to deliver efficient
                 static code and exploits dynamic block-level and
                 instruction-level parallelism. CG-OoO introduces
                 Skipahead, a complexity effective, limited out-of-order
                 instruction scheduling model. Through the energy
                 efficiency techniques applied to the compiler and
                 processor pipeline stages, CG-OoO closes 62\% of the
                 average energy gap between the InO and OoO baseline
                 processors at the same area and nearly the same
                 performance as the OoO. This makes CG-OoO 1.8$ \times $
                 more efficient than the OoO on the energy-delay product
                 inverse metric. CG-OoO meets the OoO nominal
                 performance while trading off the peak scheduling
                 performance for superior energy efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Swami:2017:EEC,
  author =       "Shivam Swami and Poovaiah M. Palangappa and Kartik
                 Mohanram",
  title =        "{ECS}: Error-Correcting Strings for Lifetime
                 Improvements in Nonvolatile Memories",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3151083",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Emerging nonvolatile memories (NVMs) suffer from low
                 write endurance, resulting in early cell failures (hard
                 errors), which reduce memory lifetime. It was
                 recognized early on that conventional error-correcting
                 codes (ECCs), which are designed for soft errors, are a
                 poor choice for addressing hard errors in NVMs. This
                 led to the evolution of hard error correction schemes
                 like dynamically replicated memory (DRM),
                 error-correcting pointers (ECPs), SAFER, FREE-p, PAYG,
                 and Zombie memory to improve NVM lifetime. Whereas
                 these approaches made significant inroads in addressing
                 hard errors and low memory lifetime in NVMs, overcoming
                 the challenges of underutilization of error-correcting
                 resources and/or implementation overhead (e.g., codec
                 latency, hardware support) remain areas of active
                 research and development. This article proposes
                 error-correcting strings (ECSs) as a high-utilization,
                 low-latency solution for hard error correction in
                 single-/multi-/triple-level cell (SLC/MLC/TLC) NVMs. At
                 its core, ECS adopts a base-offset approach to store
                 pointers to the failed memory cells; in this work, base
                 is the address of the first failed cell in a memory
                 block and offsets are the distances between successive
                 failed cells in that memory block. Unlike ECP, which
                 uses fixed-length pointers, ECS uses variable-length
                 offsets to point to the failed cells, thereby realizing
                 more pointers to tolerate more hard errors per memory
                 block. Further, this article proposes eXtended-ECS
                 (XECS), a page-level error correction architecture,
                 which employs dynamic on-demand ECS allocation and
                 opportunistic pattern-based data compression to improve
                 NVM lifetime by 2$ \times $ over ECP-6 for comparable
                 overhead and negligible impact to system performance.
                 Finally, this article demonstrates that ECS is a
                 drop-in replacement for ECP to extend the lifetime of
                 state-of-the-art ECP-based techniques like PAYG and
                 Zombie memory; ECS is also compatible with MLC/TLC
                 NVMs, where it complements drift-induced soft error
                 reduction techniques like ECC and incomplete data
                 mapping to simultaneously extend NVM lifetime.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Azhar:2017:SQS,
  author =       "M. Waqar Azhar and Per Stenstr{\"o}m and Vassilis
                 Papaefstathiou",
  title =        "{SLOOP}: {QoS}-Supervised Loop Execution to Reduce
                 Energy on Heterogeneous Architectures",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "41:1--41:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148053",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Most systems allocate computational resources to each
                 executing task without any actual knowledge of the
                 application's Quality-of-Service (QoS) requirements.
                 Such best-effort policies lead to overprovisioning of
                 the resources and increase energy loss. This work
                 assumes applications with soft QoS requirements and
                 exploits the inherent timing slack to minimize the
                 allocated computational resources to reduce energy
                 consumption. We propose a lightweight progress-tracking
                 methodology based on the outer loops of application
                 kernels. It builds on online history and uses it to
                 estimate the total execution time. The prediction of
                 the execution time and the QoS requirements are then
                 used to schedule the application on a heterogeneous
                 architecture with big out-of-order cores and small
                 (LITTLE) in-order cores and select the minimum
                 operating frequency, using DVFS, that meets the
                 deadline. Our scheme is effective in exploiting the
                 timing slack of each application. We show that it can
                 reduce the energy consumption by more than 20\% without
                 missing any computational deadlines.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kanakagiri:2017:MMD,
  author =       "Raghavendra Kanakagiri and Biswabandan Panda and Madhu
                 Mutyam",
  title =        "{MBZip}: Multiblock Data Compression",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "42:1--42:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3151033",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compression techniques at the last-level cache and the
                 DRAM play an important role in improving system
                 performance by increasing their effective capacities. A
                 compressed block in DRAM also reduces the transfer time
                 over the memory bus to the caches, reducing the latency
                 of a LLC cache miss. Usually, compression is achieved
                 by exploiting data patterns present within a block. But
                 applications can exhibit data locality that spreads
                 across multiple consecutive data blocks. We observe
                 that there is significant opportunity available for
                 compressing multiple consecutive data blocks into one
                 single block, both at the LLC and DRAM. Our studies
                 using 21 SPEC CPU applications show that, at the LLC,
                 around 25\% (on average) of the cache blocks can be
                 compressed into one single cache block when grouped
                 together in groups of 2 to 8 blocks. In DRAM, more than
                 30\% of the columns residing in a single DRAM page can
                 be compressed into one DRAM column, when grouped
                 together in groups of 2 to 6. Motivated by these
                 observations, we propose a mechanism, namely, MBZip,
                 that compresses multiple data blocks into one single
                 block (called a zipped block), both at the LLC and
                 DRAM. At the cache, MBZip includes a simple tag
                 structure to index into these zipped cache blocks and
                 the indexing does not incur any redirectional delay. At
                 the DRAM, MBZip does not need any changes to the
                 address computation logic and works seamlessly with the
                 conventional/existing logic. MBZip is a synergistic
                 mechanism that coordinates these zipped blocks at the
                 LLC and DRAM. Further, we also explore silent writes at
                 the DRAM and show that certain writes need not access
                 the memory when blocks are zipped. MBZip improves the
                 system performance by 21.9\%, with a maximum of 90.3\%
                 on a 4-core system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Neill:2017:FAM,
  author =       "Richard Neill and Andi Drebes and Antoniu Pop",
  title =        "Fuse: Accurate Multiplexing of Hardware Performance
                 Counters Across Executions",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "43:1--43:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148054",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Collecting hardware event counts is essential to
                 understanding program execution behavior. Contemporary
                 systems offer few Performance Monitoring Counters
                 (PMCs), thus only a small fraction of hardware events
                 can be monitored simultaneously. We present new
                 techniques to acquire counts for all available hardware
                 events with high accuracy by multiplexing PMCs across
                 multiple executions of the same program, then carefully
                 reconciling and merging the multiple profiles into a
                 single, coherent profile. We present a new metric for
                 assessing the similarity of statistical distributions
                 of event counts and show that our execution profiling
                 approach performs significantly better than Hardware
                 Event Multiplexing.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sardashti:2017:CCG,
  author =       "Somayeh Sardashti and David A. Wood",
  title =        "Could Compression Be of General Use? {Evaluating}
                 Memory Compression across Domains",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "44:1--44:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3138805",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent proposals present compression as a
                 cost-effective technique to increase cache and memory
                 capacity and bandwidth. While these proposals show
                 potentials of compression, there are several open
                 questions to adopt these proposals in real systems
                 including the following: (1) Do these techniques work
                 for real-world workloads running for a long time? (2)
                 Which application domains would potentially benefit the
                 most from compression? (3) At which level of memory
                 hierarchy should we apply compression: caches, main
                 memory, or both? In this article, our goal is to shed
                 light on some main questions on applicability of
                 compression. We evaluate compression in the memory
                 hierarchy for selected examples from different
                 application classes. We analyze real applications with
                 real data and complete runs of several benchmarks.
                 While simulators provide a pretty accurate framework to
                 study potential performance/energy impacts of ideas,
                 they mostly limit us to a small range of workloads with
                 short runtimes. To enable studying real workloads, we
                 introduce a fast and simple methodology to get samples
                 of memory and cache contents of a real machine (a
                 desktop or a server). Compared to a cycle-accurate
                 simulator, our methodology allows us to study real
                 workloads as well as benchmarks. Our toolset is not a
                 replacement for simulators but mostly complements them.
                 While we can use a simulator to measure
                 performance/energy impact of a particular compression
                 proposal, here with our methodology we can study the
                 potentials with long running workloads in early stages
                 of the design. Using our toolset, we evaluate a
                 collection of workloads from different domains, such as
                 a web server of CS department of UW-Madison for 24h,
                 Google Chrome (watching a 1h-long movie on YouTube),
                 and Linux games (playing for about an hour). We also
                 use several benchmarks from different domains,
                 including SPEC, mobile, and big data. We run these
                 benchmarks to completion. Using these workloads and our
                 toolset, we analyze different compression properties
                 for both real applications and benchmarks. We focus on
                 eight main hypotheses on compression, derived from
                 previous work on compression. These properties (Table
                 2) act as foundation of several proposals on
                 compression, so performance of those proposals depends
                 very much on these basic properties. Overall, our
                 results suggest that compression could be of general
                 use both in main memory and caches. On average, the
                 compression ratio is $ \geq 2 $ for 64\% and 54\% of
                 workloads, respectively, for memory and cache data. Our
                 evaluation indicates significant potential for both
                 cache and memory compression, with higher
                 compressibility in memory due to abundance of zero
                 blocks. Among application domains we studied, servers
                 show on average the highest compressibility, while our
                 mobile benchmarks show the lowest compressibility. For
                 comparing benchmarks with real workloads, we show that
                 (1) it is critical to run benchmarks to completion or
                 considerably long runtimes to avoid biased conclusions,
                 and (2) SPEC benchmarks are good representative of real
                 Desktop applications in terms of compressibility of
                 their datasets. However, this does not hold for all
                 compression properties. For example, SPEC benchmarks
                 have much better compression locality (i.e.,
                 neighboring blocks have similar compressibility) than
                 real workloads. Thus, it is critical for designers to
                 consider wider range of workloads, including real
                 applications, to evaluate their compression
                 techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Huang:2017:IEG,
  author =       "Libo Huang and Yashuai L{\"u} and Li Shen and Zhiying
                 Wang",
  title =        "Improving the Efficiency of {GPGPU} Work-Queue Through
                 Data Awareness",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "45:1--45:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3151035",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The architecture and programming model of current
                 GPGPUs are best suited for applications that are
                 dominated by structured control and data flows across
                 large regular datasets. Parallel workloads with
                 irregular control and data structures cannot easily
                 harness the processing power of the GPGPU. One approach
                 for mapping these irregular-parallel workloads to
                 GPGPUs is using work-queues. The work-queue approach
                 improves the utilization of SIMD units by only
                 processing useful works that are dynamically generated
                 during execution. As current GPGPUs lack necessary
                 supports for work-queues, a software-based work-queue
                 implementation often suffers from memory contention and
                 load balancing issues. In this article, we present a
                 novel hardware work-queue design named DaQueue, which
                 incorporates three data-aware features to improve the
                 efficiency of work-queues on GPGPUs. We evaluate our
                 proposal on the irregular-parallel workloads and carry
                 out a case study on a path tracing pipeline with a
                 cycle-level simulator. Experimental results show that
                 for the tested workloads, DaQueue improves performance
                 by 1.53$ \times $ on average and up to 1.91$ \times $.
                 Compared to a hardware worklist approach that is the
                 state-of-the-art prior work, DaQueue can achieve an
                 average of 33.92\% extra speedup with less hardware
                 area cost.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Angerd:2017:FAC,
  author =       "Alexandra Angerd and Erik Sintorn and Per
                 Stenstr{\"o}m",
  title =        "A Framework for Automated and Controlled
                 Floating-Point Accuracy Reduction in Graphics
                 Applications on {GPUs}",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "46:1--46:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3151032",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Reducing the precision of floating-point values can
                 improve performance and/or reduce energy expenditure in
                 computer graphics, among other, applications. However,
                 reducing the precision level of floating-point values
                 in a controlled fashion needs support both at the
                 compiler and at the microarchitecture level. At the
                 compiler level, a method is needed to automate the
                 reduction of precision of each floating-point value. At
                 the microarchitecture level, a lower precision of each
                 floating-point register can allow more floating-point
                 values to be packed into a register file. This,
                 however, calls for new register file organizations.
                 This article proposes an automated precision-selection
                 method and a novel GPU register file organization that
                 can store floating-point register values at arbitrary
                 precisions densely. The automated precision-selection
                 method uses a data-driven approach for setting the
                 precision level of floating-point values, given a
                 quality threshold and a representative set of input
                 data. By allowing a small, but acceptable, degradation
                 in output quality, our method can remove a significant
                 amount of the bits needed to represent floating-point
                 values in the investigated kernels (between 28\% and
                 60\%). Our proposed register file organization exploits
                 these lower-precision floating-point values by packing
                 several of them into the same physical register. This
                 reduces the register pressure per thread by up to 48\%,
                 and by 27\% on average, for a negligible output-quality
                 degradation. This can enable GPUs to keep up to twice
                 as many threads in flight simultaneously.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Arteaga:2017:GFG,
  author =       "Jaime Arteaga and St{\'e}phane Zuckerman and Guang R.
                 Gao",
  title =        "Generating Fine-Grain Multithreaded Applications Using
                 a Multigrain Approach",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "47:1--47:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155288",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The recent evolution in hardware landscape, aimed at
                 producing high-performance computing systems capable of
                 reaching extreme-scale performance, has reignited the
                 interest in fine-grain multithreading, particularly at
                 the intranode level. Indeed, popular parallel
                 programming environments, such as OpenMP, which
                 features a simple interface for the parallelization of
                 programs, are now incorporating fine-grain constructs.
                 However, since coarse-grain directives are still
                 heavily used, the OpenMP runtime is forced to support
                 both coarse- and fine-grain models of execution,
                 potentially reducing the advantages obtained when
                 executing an application in a fully fine-grain
                 environment. To evaluate the type of applications that
                 benefit from executing in a unified fine-grain program
                 execution model, this article presents a multigrain
                 parallel programming environment for the generation of
                 fine-grain multithreaded applications from programs
                 featuring OpenMP's API, allowing OpenMP programs to be
                 run on top of a fine-grain event-driven program
                 execution model. Experimental results with five
                 scientific benchmarks show that fine-grain
                 applications, generated by and run on our environment
                 with two runtimes implementing a fine-grain
                 event-driven program execution model, are competitive
                 and can outperform their OpenMP counterparts,
                 especially for data-intensive workloads with irregular
                 and dynamic parallelism, reaching speedups as high as
                 2.6$ \times $ for Graph500 and 51$ \times $ for NAS
                 Data Cube.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hadidi:2017:CCA,
  author =       "Ramyad Hadidi and Lifeng Nai and Hyojong Kim and
                 Hyesoon Kim",
  title =        "{CAIRO}: a Compiler-Assisted Technique for Enabling
                 Instruction-Level Offloading of Processing-In-Memory",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "48:1--48:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155287",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Three-dimensional (3D)-stacking technology and the
                 memory-wall problem have popularized
                 processing-in-memory (PIM) concepts again, which offers
                 the benefits of bandwidth and energy savings by
                 offloading computations to functional units inside the
                 memory. Several memory vendors have also started to
                 integrate computation logics into the memory, such as
                 Hybrid Memory Cube (HMC), the latest version of which
                 supports up to 18 in-memory atomic instructions.
                 Although industry prototypes have motivated studies for
                 investigating efficient methods and architectures for
                 PIM, researchers have not proposed a systematic way for
                 identifying the benefits of instruction-level PIM
                 offloading. As a result, compiler support for
                 recognizing offloading candidates and utilizing
                 instruction-level PIM offloading is unavailable. In
                 this article, we analyze the advantages of
                 instruction-level PIM offloading in the context of
                 HMC-atomic instructions for graph-computing
                 applications and propose CAIRO, a compiler-assisted
                 technique and decision model for enabling
                 instruction-level offloading of PIM without any burden
                 on programmers. To develop CAIRO, we analyzed how
                 instruction offloading enables performance gain in both
                 CPU and GPU workloads. Our studies show that
                 performance gain from bandwidth savings, the ratio of
                 number of cache misses to total cache accesses, and the
                 overhead of host atomic instructions are the key
                 factors in selecting an offloading candidate. Based on
                 our analytical models, we characterize the properties
                 of beneficial and nonbeneficial candidates for
                 offloading. We evaluate CAIRO with 27 multithreaded CPU
                 and 36 GPU benchmarks. In our evaluation, CAIRO not
                 only doubles the speedup for a set of PIM-beneficial
                 workloads by exploiting HMC-atomic instructions but
                 also prevents slowdown caused by incorrect offloading
                 decisions for other workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lim:2017:TEP,
  author =       "Hongyeol Lim and Giho Park",
  title =        "{Triple Engine Processor (TEP)}: a Heterogeneous
                 Near-Memory Processor for Diverse Kernel Operations",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "49:1--49:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155920",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The advent of 3D memory stacking technology, which
                 integrates a logic layer and stacked memories, is
                 expected to be one of the most promising memory
                 technologies to mitigate the memory wall problem by
                 leveraging the concept of near-memory processing (NMP).
                 With the ability to process data locally within the
                 logic layer of stacked memory, a variety of emerging
                 big data applications can achieve significant
                 performance and energy-efficiency benefits. Various
                 approaches to the NMP logic layer architecture have
                 been studied to utilize the advantage of stacked
                 memory. While significant acceleration of specific
                 kernel operations has been derived from previous NMP
                 studies, an NMP-based system using an NMP logic
                 architecture capable of handling some specific kernel
                 operations can suffer from performance and energy
                 efficiency degradation caused by a significant
                 communication overhead between the host processor and
                 NMP stack. In this article, we first analyze the kernel
                 operations that can greatly improve the performance of
                 NMP-based systems in diverse emerging applications, and
                 then we analyze the architecture to efficiently process
                 the extracted kernel operations. This analysis confirms
                 that three categories of processing engines for NMP
                 logic are required for efficient processing of a
                 variety of emerging applications, and thus we propose a
                 Triple Engine Processor (TEP), a heterogeneous
                 near-memory processor with three types of computing
                 engines. These three types of engines are an in-order
                 core, a coarse-grain reconfigurable processor (CGRA),
                 and dedicated hardware. The proposed TEP provides about
                 3.4 times higher performance and 33\% greater energy
                 savings than the baseline 3D memory system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Patsilaras:2017:RRD,
  author =       "George Patsilaras and James Tuck",
  title =        "{ReDirect}: Reconfigurable Directories for Multicore
                 Architectures",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "50:1--50:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162015",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As we enter the dark silicon era, architects should
                 not envision designs in which every transistor remains
                 turned on permanently but rather ones in which portions
                 of the chip are judiciously turned on/off depending on
                 the characteristics of a workload. At the same time,
                 due to the increasing cost per transistor, architects
                 should also consider new ways to re-purpose transistors
                 to increase their architectural value. In this work, we
                 consider the design of directory-based cache coherence
                 in light of the dark silicon era and the need to
                 re-purpose transistors. We point out that directories
                 are not needed all of the time, and we argue that
                 directories (and coherence) should be off unless it is
                 actually needed for correctness. In our design,
                 directories will be disabled and powered off for
                 workloads with no sharing. Then only when parallel
                 workloads need cache coherence will directories be
                 enabled in proportion to the sharing that is present.
                 At the same time, we exploit the structural
                 similarities of directories and cache. If a directory
                 is idle, then we reconfigure it to be used as extra
                 capacity in the last-level cache. Since our novel
                 approach can keep most directories off, we are free to
                 select sparse overprovisioned directory designs that
                 are reconfigurable to large amounts of cache that can
                 significantly boost performance when the directory is
                 idle. We call these combined features Reconfigurable Dark
                 Directories, since directories are usually dark (off)
                 and can be reconfigured. Our results for Reconfigurable
                 Dark Directories running SPEC 2006 applications show a
                 performance benefit, on average, of 17\% for an 8$
                 \times $ overprovisioned fully mapped directory on a
                 64-tile system under low system concurrency (10\% under
                 heavy concurrency), or a 29\% average speedup for a 2$
                 \times $ overprovisioned directory on 256-tile system
                 (10\% under heavy concurrency) to systems with a
                 conventional sparse directory design using the same
                 overprovisioning factor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Patil:2017:HHA,
  author =       "Adarsh Patil and Ramaswamy Govindarajan",
  title =        "{HAShCache}: Heterogeneity-Aware Shared {DRAMCache}
                 for Integrated Heterogeneous Systems",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158641",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Integrated Heterogeneous System (IHS) processors pack
                 throughput-oriented General-Purpose Graphics Processing
                 Units (GPGPUs) alongside latency-oriented Central
                 Processing Units (CPUs) on the same die sharing certain
                 resources, e.g., shared last-level cache,
                 Network-on-Chip (NoC), and the main memory. The demands
                 for memory accesses and other shared resources from GPU
                 cores can exceed that of CPU cores by two to three
                 orders of magnitude. This disparity poses significant
                 problems in exploiting the full potential of these
                 architectures. In this article, we propose adding a
                 large-capacity stacked DRAM, used as a shared
                 last-level cache, for the IHS processors. However,
                 adding the DRAMCache naively leaves significant
                 performance on the table due to the disparate demands
                 from CPU and GPU cores for DRAMCache and memory
                 accesses. In particular, the imbalance can
                 significantly reduce the performance benefits that the
                 CPU cores would have otherwise enjoyed with the
                 introduction of the DRAMCache, necessitating a
                 heterogeneity-aware management of this shared resource
                 for improved performance. In this article, we propose
                 three simple techniques to enhance the performance of
                 CPU application while ensuring very little to no
                 performance impact to the GPU. Specifically, we propose
                 (i) PrIS, a prioritization scheme for scheduling CPU
                 requests at the DRAMCache controller; (ii) ByE, a
                 selective and temporal bypassing scheme for CPU
                 requests at the DRAMCache; and (iii) Chaining, an
                 occupancy controlling mechanism for GPU lines in the
                 DRAMCache through pseudo-associativity. The resulting
                 cache, Heterogeneity-Aware Shared DRAMCache
                 (HAShCache), is heterogeneity-aware and can adapt
                 dynamically to address the inherent disparity of
                 demands in an IHS architecture. Experimental evaluation
                 of the proposed HAShCache results in an average system
                 performance improvement of 41\% over a naive DRAMCache
                 and over 200\% improvement over a baseline system with
                 no stacked DRAMCache.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Alias:2017:OAC,
  author =       "Christophe Alias and Alexandru Plesco",
  title =        "Optimizing Affine Control With Semantic
                 Factorizations",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162017",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Hardware accelerators generated by polyhedral
                 synthesis techniques make extensive use of affine
                 expressions (affine functions and convex polyhedra) in
                 control and steering logic. Since the control is
                 pipelined, these affine objects must be evaluated at
                 the same time for different values, which forbids
                 aggressive reuse of operators. In this article, we
                 propose a method to factorize a collection of affine
                 expressions without preventing pipelining. Our key
                 contributions are (i) to use semantic factorizations
                 exploiting arithmetic properties of addition and
                 multiplication and (ii) to rely on a cost function
                 whose minimization ensures correct usage of FPGA
                 resources. Our algorithm is totally parameterized by
                 the cost function, which can be customized to fit a
                 target FPGA. Experimental results on a large pool of
                 linear algebra kernels show a significant improvement
                 compared to traditional low-level RTL optimizations. In
                 particular, we show how our method reduces resource
                 consumption by revealing hidden strength reductions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Matheou:2017:DDC,
  author =       "George Matheou and Paraskevas Evripidou",
  title =        "Data-Driven Concurrency for High Performance
                 Computing",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162014",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this work, we utilize dynamic dataflow/data-driven
                 techniques to improve the performance of high
                 performance computing (HPC) systems. The proposed
                 techniques are implemented and evaluated through an
                 efficient, portable, and robust programming framework
                 that enables data-driven concurrency on HPC systems.
                 The proposed framework is based on data-driven
                 multithreading (DDM), a hybrid control-flow/dataflow
                 model that schedules threads based on data availability
                 on sequential processors. The proposed framework was
                 evaluated using several benchmarks, with different
                 characteristics, on two different systems: a 4-node AMD
                 system with a total of 128 cores and a 64-node Intel
                 HPC system with a total of 768 cores. The performance
                 evaluation shows that the proposed framework scales
                 well and tolerates scheduling overheads and memory
                 latencies effectively. We also compare our framework to
                 MPI, DDM-VM, and OmpSs@Cluster. The comparison results
                 show that the proposed framework obtains comparable or
                 better performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Georgakoudis:2017:SSA,
  author =       "Giorgis Georgakoudis and Hans Vandierendonck and Peter
                 Thoman and Bronis R. {De Supinski} and Thomas Fahringer
                 and Dimitrios S. Nikolopoulos",
  title =        "{SCALO}: Scalability-Aware Parallelism Orchestration
                 for Multi-Threaded Workloads",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158643",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Shared memory machines continue to increase in scale
                 by adding more parallelism through additional cores and
                 complex memory hierarchies. Often, executing multiple
                 applications concurrently, dividing among them hardware
                 threads, provides greater efficiency rather than
                 executing a single application with large thread
                 counts. However, contention for shared resources can
                 limit the improvement of concurrent application
                 execution: orchestrating the number of threads used by
                 each application is essential. In this article, we
                 contribute SCALO, a solution to orchestrate concurrent
                 application execution to increase throughput. SCALO
                 monitors co-executing applications at runtime to
                 evaluate their scalability. Its optimizing thread
                 allocator analyzes these scalability estimates to adapt
                 the parallelism of each program. Unlike previous
                 approaches, SCALO differs by including dynamic
                 contention effects on scalability and by controlling
                 the parallelism during the execution of parallel
                 regions. Thus, it improves throughput when other
                 state-of-the-art approaches fail and outperforms them
                 by up to 40\% when they succeed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Baroudi:2017:OTB,
  author =       "Toufik Baroudi and Rachid Seghir and Vincent
                 Loechner",
  title =        "Optimization of Triangular and Banded Matrix
                 Operations Using 2d-Packed Layouts",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "55:1--55:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162016",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Over the past few years, multicore systems have become
                 increasingly powerful and thereby very useful in
                 high-performance computing. However, many applications,
                 such as some linear algebra algorithms, still cannot
                 take full advantage of these systems. This is mainly
                 due to the shortage of optimization techniques dealing
                 with irregular control structures. In particular, the
                 well-known polyhedral model fails to optimize loop
                 nests whose bounds and/or array references are not
                 affine functions. This is more likely to occur when
                 handling sparse matrices in their packed formats. In
                 this article, we propose using 2d-packed layouts and
                 simple affine transformations to enable optimization of
                 triangular and banded matrix operations. The benefit of
                 our proposal is shown through an experimental study
                 over a set of linear algebra benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2018:IEE,
  author =       "Hochan Lee and Mansureh S. Moghaddam and Dongkwan Suh
                 and Bernhard Egger",
  title =        "Improving Energy Efficiency of Coarse-Grain
                 Reconfigurable Arrays Through Modulo Schedule
                 Compression\slash Decompression",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162018",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modulo-scheduled coarse-grain reconfigurable array
                 (CGRA) processors excel at exploiting loop-level
                 parallelism at a high performance per watt ratio. The
                 frequent reconfiguration of the array, however, causes
                 between 25\% and 45\% of the consumed chip energy to be
                 spent on the instruction memory and fetches therefrom.
                 This article presents a hardware/software codesign
                 methodology for such architectures that is able to
                 reduce both the size required to store the
                 modulo-scheduled loops and the energy consumed by the
                 instruction decode logic. The hardware modifications
                 improve the spatial organization of a CGRA's execution
                 plan by reorganizing the configuration memory into
                 separate partitions based on a statistical analysis of
                 code. A compiler technique optimizes the generated code
                 in the temporal dimension by minimizing the number of
                 signal changes. The optimizations achieve, on average,
                 a reduction in code size of more than 63\% and in
                 energy consumed by the instruction decode logic by 70\%
                 for a wide variety of application domains.
                 Decompression of the compressed loops can be performed
                 in hardware with no additional latency, rendering the
                 presented method ideal for low-power CGRAs running at
                 high frequencies. The presented technique is orthogonal
                 to dictionary-based compression schemes and can be
                 combined to achieve a further reduction in code size.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sangaiah:2018:SSA,
  author =       "Karthik Sangaiah and Michael Lui and Radhika Jagtap
                 and Stephan Diestelhorst and Siddharth Nilakantan and
                 Ankit More and Baris Taskin and Mark Hempstead",
  title =        "{SynchroTrace}: Synchronization-Aware
                 Architecture-Agnostic Traces for Lightweight Multicore
                 Simulation of {CMP} and {HPC} Workloads",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158642",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Trace-driven simulation of chip multiprocessor (CMP)
                 systems offers many advantages over execution-driven
                 simulation, such as reducing simulation time and
                 complexity, allowing portability, and scalability.
                 However, trace-based simulation approaches have
                 difficulty capturing and accurately replaying
                 multithreaded traces due to the inherent nondeterminism
                 in the execution of multithreaded programs. In this
                 work, we present SynchroTrace, a scalable, flexible,
                 and accurate trace-based multithreaded simulation
                 methodology. By recording synchronization events
                 relevant to modern threading libraries (e.g., Pthreads
                 and OpenMP) and dependencies in the traces, independent
                 of the host architecture, the methodology is able to
                 accurately model the nondeterminism of multithreaded
                 programs for different hardware platforms and threading
                 paradigms. Through capturing high-level instruction
                 categories, the SynchroTrace average CPI trace Replay
                 timing model offers fast and accurate simulation of
                 many-core in-order CMPs. We perform two case studies to
                 validate the SynchroTrace simulation flow against the
                 gem5 full-system simulator: (1) a constraint-based
                 design space exploration with traditional CMP
                 benchmarks and (2) a thread-scalability study with
                 HPC-representative applications. The results from these
                 case studies show that (1) our trace-based approach
                 with trace filtering has a peak speedup of up to 18.7$
                 \times $ over simulation in gem5 full-system with an
                 average of 9.6$ \times $ speedup, (2) SynchroTrace
                 maintains the thread-scaling accuracy of gem5 and can
                 efficiently scale up to 64 threads, and (3)
                 SynchroTrace can trace in one platform and model any
                 platform in early stages of design.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zheng:2018:ESG,
  author =       "Long Zheng and Xiaofei Liao and Hai Jin",
  title =        "Efficient and Scalable Graph Parallel Processing With
                 Symbolic Execution",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3170434",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Existing graph processing essentially relies on the
                 underlying iterative execution with synchronous (Sync)
                 and/or asynchronous (Async) engine. Nevertheless, they
                 both suffer from a wide class of inherent serialization
                 arising from data interdependencies within a graph. In
                 this article, we present SymGraph, a judicious graph
                 engine with symbolic iteration that enables the
                 parallelism of dependent computation on vertices.
                 SymGraph allows using abstract symbolic value (instead
                 of the concrete value) for the computation if the
                 desired data is unavailable. To maximize the potential
                 of symbolic iteration, we propose a chain of tailored
                 sophisticated techniques, enabling SymGraph to scale
                 out with a new milestone of efficiency for large-scale
                 graph processing. We evaluate SymGraph in comparison to
                 Sync, Async, and a hybrid of Sync and Async engines.
                 Our results on 12 nodes show that SymGraph outperforms
                 all three graph engines by 1.93x (vs. Sync), 1.98x (vs.
                 Async), and 1.57x (vs. Hybrid) on average. In
                 particular, the performance for PageRank on 32 nodes
                 can be dramatically improved by 16.5x (vs. Sync), 23.3x
                 (vs. Async), and 12.1x (vs. Hybrid), respectively. The
                 efficiency of SymGraph is also validated with at least
                 one order of magnitude improvement in contrast to three
                 specialized graph systems (Naiad, GraphX, and PGX.D).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jo:2018:DSD,
  author =       "Jae-Eon Jo and Gyu-Hyeon Lee and Hanhwi Jang and
                 Jaewon Lee and Mohammadamin Ajdari and Jangwoo Kim",
  title =        "{DiagSim}: Systematically Diagnosing Simulators for
                 Healthy Simulations",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177959",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Simulators are the most popular and useful tool to
                 study computer architecture and examine new ideas.
                 However, modern simulators have become prohibitively
                 complex (e.g., 200K+ lines of code) to fully understand
                 and utilize. Users therefore end up analyzing and
                 modifying only the modules of interest (e.g., branch
                 predictor, register file) when performing simulations.
                 Unfortunately, hidden details and inter-module
                 interactions of simulators create discrepancies between
                 the expected and actual module behaviors. Consequently,
                 the effect of modifying the target module may be
                 amplified or masked and the users get inaccurate
                 insights from expensive simulations. In this article,
                 we propose DiagSim, an efficient and systematic method
                 to diagnose simulators. It ensures the target modules
                 behave as expected to perform simulation in a healthy
                 (i.e., accurate and correct) way. DiagSim is efficient
                 in that it quickly pinpoints the modules showing
                 discrepancies and guides the users to inspect the
                 behavior without investigating the whole simulator.
                 DiagSim is systematic in that it hierarchically tests
                 the modules to guarantee the integrity of individual
                 diagnosis and always provide reliable results. We
                 construct DiagSim based on generic category-based
                 diagnosis ideas to encourage easy expansion of the
                 diagnosis. We diagnose three popular open source
                 simulators and discover hidden details including
                 implicitly reserved resources, un-documented latency
                 factors, and hard-coded module parameter values. We
                 observe that these factors have large performance
                 impacts (up to 156\%) and illustrate that our diagnosis
                 can correctly detect and eliminate them.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kondguli:2018:CME,
  author =       "Sushant Kondguli and Michael Huang",
  title =        "A Case for a More Effective, Power-Efficient Turbo
                 Boosting",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3170433",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Single-thread performance and throughput often pose
                 different design constraints and require compromises.
                 Mainstream CPUs today incorporate a non-trivial number
                 of cores, even for mobile devices. For power and
                 thermal considerations, by default, a single core does
                 not operate at the maximum performance level. When
                 operating conditions allow, however, commercial
                 products often rely on turbo boosting, which
                 temporarily increases the clock frequency to increase
                 single-thread performance. However, increasing clock
                 speed may result in a poor performance return for
                 invested energy. In this article, we make a case for a
                 more effective boosting strategy, which invests energy
                 in activities with the best estimated return. In
                 addition to running faster clocks, we can also use a
                 look-ahead thread to overlap the penalties of cache
                 misses and branch mispredicts. Overall, for similar
                 power consumptions, the proposed adaptive turbo
                 boosting strategy can achieve about twice the
                 performance benefits while halving the energy
                 overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2018:ESE,
  author =       "Kuan-Chung Chen and Chung-Ho Chen",
  title =        "Enabling {SIMT} Execution Model on Homogeneous
                 Multi-Core System",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177960",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Single-instruction multiple-thread (SIMT) machine
                 emerges as a primary computing device in
                 high-performance computing, since the SIMT execution
                 paradigm can exploit data-level parallelism
                 effectively. This article explores the SIMT execution
                 potential on homogeneous multi-core processors, which
                 generally run in multiple-instruction multiple-data
                 (MIMD) mode when utilizing the multi-core resources. We
                 address three architecture issues in enabling SIMT
                 execution model on multi-core processor, including
                 multithreading execution model, kernel thread context
                 placement, and thread divergence. For the SIMT
                 execution model, we propose a fine-grained
                 multithreading mechanism on an ARM-based multi-core
                 system. Each of the processor cores stores the kernel
                 thread contexts in its L1 data cache for per-cycle
                 thread-switching requirement. For divergence-intensive
                 kernels, an Inner Conditional Statement First
                 (ICS-First) mechanism helps early re-convergence to
                 occur and significantly improves the performance. The
                 experiment results show that effectiveness in
                 data-parallel processing reduces on average 36\%
                 dynamic instructions, and boosts the SIMT executions to
                 achieve on average 1.52$ \times $ and up to 5$ \times $
                 speedups over the MIMD counterpart for OpenCL
                 benchmarks for single issue in-order processor cores.
                 By using the explicit vectorization optimization on the
                 kernels, the SIMT model gains further benefits from the
                 SIMD extension and achieves 1.71$ \times $ speedup over
                 the MIMD approach. The SIMT model using in-order
                 superscalar processor cores outperforms the MIMD model
                 that uses superscalar out-of-order processor cores by
                 40\%. The results show that, to exploit data-level
                 parallelism, enabling the SIMT model on homogeneous
                 multi-core processors is important.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 15(1), article 7 (April 2018).
@article{Zhang:2018:SSM,
  author          = {Mingzhe Zhang and King Tin Lam and Xin Yao and Cho-Li
    Wang},
  title           = {{SIMPO}: a Scalable In-Memory Persistent Object
    Framework Using {NVRAM} for Reliable Big Data
    Computing},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {7:1--7:??},
  articleno       = {7},
  doi             = {https://doi.org/10.1145/3167972},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {While CPU architectures are incorporating many more
    cores to meet ever-bigger workloads, advance in
    fault-tolerance support is indispensable for sustaining
    system performance under reliability constraints.
    Emerging non-volatile memory technologies are yielding
    fast, dense, and energy-efficient NVRAM that can
    dethrone SSD drives for persisting data. Research on
    using NVRAM to enable fast in-memory data persistence
    is ongoing. In this work, we design and implement a
    persistent object framework, dubbed scalable in-memory
    persistent object (SIMPO), which exploits NVRAM,
    alongside DRAM, to support efficient object persistence
    in highly threaded big data applications. Based on
    operation logging, we propose a new programming model
    that classifies functions into instant and deferrable
    groups. SIMPO features a streamlined execution model,
    which allows lazy evaluation of deferrable functions
    and is well suited to big data computing workloads that
    would see improved data locality and concurrency. Our
    log recording and checkpointing scheme is effectively
    optimized towards NVRAM, mitigating its long write
    latency through write-combining and consolidated
    flushing techniques. Efficient persistent object
    management with features including safe references and
    memory leak prevention is also implemented and tailored
    to NVRAM. We evaluate a wide range of SIMPO-enabled
    applications with machine learning, high-performance
    computing, and database workloads on an emulated hybrid
    memory architecture and a real hybrid memory machine
    with NVDIMM. Compared with native applications without
    persistence, experimental results show that SIMPO
    incurs less than 5\% runtime overhead on both platforms
    and even gains up to 2.5$ \times $ speedup and 84\%
    increase in throughput in highly threaded situations on
    the two platforms, respectively, thanks to the
    streamlined execution model.},
}

%%% TACO 15(1), article 8 (April 2018).
%%% NOTE(review): fifth author's surname is given here in the camel-case
%%% form `DeBenedictis'; confirm against the article's title page.
@Article{Deng:2018:EML,
  author =       "Bobin Deng and Sriseshan Srikanth and Eric R. Hein and
                 Thomas M. Conte and Erik DeBenedictis and Jeanine Cook
                 and Michael P. Frank",
  title =        "Extending {Moore's Law} via Computationally
                 Error-Tolerant Computing",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177837",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dennard scaling has ended. Lowering the voltage supply
                 (V$_{dd}$) to sub-volt levels causes intermittent
                 losses in signal integrity, rendering further scaling
                 (down) no longer acceptable as a means to lower the
                 power required by a processor core. However, it is
                 possible to correct the occasional errors caused due to
                 lower V$_{dd}$ in an efficient manner and effectively
                 lower power. By deploying the right amount and kind of
                 redundancy, we can strike a balance between overhead
                 incurred in achieving reliability and energy savings
                 realized by permitting lower V$_{dd}$. One promising
                 approach is the Redundant Residue Number System (RRNS)
                 representation. Unlike other error correcting codes,
                 RRNS has the important property of being closed under
                 addition, subtraction and multiplication, thus enabling
                 computational error correction at a fraction of an
                 overhead compared to conventional approaches. We use
                 the RRNS scheme to design a Computationally-Redundant,
                 Energy-Efficient core, including the microarchitecture,
                 Instruction Set Architecture (ISA) and RRNS centered
                 algorithms. From the simulation results, this RRNS
                 system can reduce the energy-delay-product by about 3$
                 \times $ for multiplication intensive workloads and by
                 about 2$ \times $ in general, when compared to a
                 non-error-correcting binary core.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 15(1), article 9 (April 2018).
@article{Dice:2018:IPH,
  author          = {Dave Dice and Maurice Herlihy and Alex Kogan},
  title           = {Improving Parallelism in Hardware Transactional
    Memory},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {9:1--9:??},
  articleno       = {9},
  doi             = {https://doi.org/10.1145/3177962},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Today's hardware transactional memory (HTM) systems
    rely on existing coherence protocols, which implement a
    requester-wins strategy. This, in turn, leads to poor
    performance when transactions frequently conflict,
    causing them to resort to a non-speculative fallback
    path. Often, such a path severely limits parallelism.
    In this article, we propose very simple architectural
    changes to the existing requester-wins HTM
    implementations that enhance conflict resolution
    between hardware transactions and thus improve their
    parallelism. Our idea is compatible with existing HTM
    systems, requires no changes to target applications
    that employ traditional lock synchronization, and is
    shown to provide robust performance benefits.},
}

%%% TACO 15(1), article 10 (April 2018).
@article{Kim:2018:BEE,
  author          = {Namhyung Kim and Junwhan Ahn and Kiyoung Choi and
    Daniel Sanchez and Donghoon Yoo and Soojung Ryu},
  title           = {{Benzene}: an Energy-Efficient Distributed Hybrid
    Cache Architecture for Manycore Systems},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {10:1--10:??},
  articleno       = {10},
  doi             = {https://doi.org/10.1145/3177963},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {This article proposes Benzene, an energy-efficient
    distributed SRAM/STT-RAM hybrid cache for manycore
    systems running multiple applications. It is based on
    the observation that a na{\"\i}ve application of hybrid
    cache techniques to distributed caches in a manycore
    architecture suffers from limited energy reduction due
    to uneven utilization of scarce SRAM. We propose
    two-level optimization techniques: intra-bank and
    inter-bank. Intra-bank optimization leverages highly
    associative cache design, achieving more uniform
    distribution of writes within a bank. Inter-bank
    optimization evenly balances the amount of
    write-intensive data across the banks. Our evaluation
    results show that Benzene significantly reduces energy
    consumption of distributed hybrid caches.},
}

%%% TACO 15(1), article 11 (April 2018).
@article{Ao:2018:POH,
  author          = {Yulong Ao and Chao Yang and Fangfang Liu and Wanwang
    Yin and Lijuan Jiang and Qiao Sun},
  title           = {Performance Optimization of the {HPCG} Benchmark on
    the {Sunway TaihuLight Supercomputer}},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {11:1--11:??},
  articleno       = {11},
  doi             = {https://doi.org/10.1145/3182177},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/super.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {In this article, we present some key techniques for
    optimizing HPCG on Sunway TaihuLight and demonstrate
    how to achieve high performance in memory-bound
    applications by exploiting specific characteristics of
    the hardware architecture. In particular, we utilize a
    block multicoloring approach for parallelization and
    propose methods such as requirement-based data mapping
    and customized gather collective to enhance the
    effective memory bandwidth. Experiments indicate that
    the optimized HPCG code can sustain 77\% of the
    theoretical memory bandwidth and scale to the full
    system of more than 10 million cores, with an
    aggregated performance of 480.8 Tflop/s and a weak
    scaling efficiency of 87.3\%.},
}

%%% TACO 15(1), article 12 (April 2018).
%%% NOTE(review): the abstract reads `increase its read latency'; presumably
%%% `decrease' was intended -- verify against the publisher's text before
%%% changing it.
@article{Rashidi:2018:IMP,
  author          = {Saeed Rashidi and Majid Jalili and Hamid
    Sarbazi-Azad},
  title           = {Improving {MLC PCM} Performance through Relaxed Write
    and Read for Intermediate Resistance Levels},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {12:1--12:??},
  articleno       = {12},
  doi             = {https://doi.org/10.1145/3177965},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Phase Change Memory (PCM) is one of the most promising
    candidates to be used at the main memory level of the
    memory hierarchy due to poor scalability, considerable
    leakage power, and high cost/bit of DRAM. PCM is a new
    resistive memory that is capable of storing data based
    on resistance values. The wide resistance range of PCM
    allows for storing multiple bits per cell (MLC) rather
    than a single bit per cell (SLC). Unfortunately, higher
    density of MLC PCM comes at the expense of longer
    read/write latency, higher soft error rate, higher
    energy consumption, and earlier wearout compared to the
    SLC PCM. Some studies suggest removing the most
    error-prone level to mitigate soft error and write
    latency of MLC PCM, hence introducing a less dense
    memory called Tri-Level memory. Another scheme, called
    M-Metric, proposes a new read metric to address the
    soft error problem in MLC PCM. In order to deal with
    the limited lifetime of PCM, some extra storage per
    memory line is required to correct permanent hard
    errors (stuck-at faults). Since the extra storage is
    used only when permanent faults occur, it has a low
    utilization for a long time before hard errors start to
    occur. In this article, we utilize the extra storage to
    improve the read/write latency in a 2-bit MLC PCM using
    a relaxation scheme for reading and writing the cells
    for intermediate resistance levels. More specifically,
    we combine the most time-consuming levels (intermediate
    resistance levels) to reduce the number of resistance
    levels (making a Tri-Level PCM) and therefore improve
    write latency. We then store some error correction
    metadata in the extra storage section to successfully
    retrieve the exact data values in the read operation.
    We also modify the Tri-Level PCM cell to increase its
    read latency when the M-Metric scheme is used.
    Evaluation results show that the proposed scheme
    improves read latency by 57.2\%, write latency by
    56.1\%, and overall system performance (IPC) by 26.9\%
    over the baseline. It is noteworthy that combining the
    proposed scheme and FPC compression method improves
    read latency by 75.2\%, write latency by 67\%, and
    overall system performance (IPC) by 37.4\%.},
}

%%% TACO 15(1), article 13 (April 2018).
@article{Zhao:2018:OCN,
  author          = {Wenlai Zhao and Haohuan Fu and Jiarui Fang and Weijie
    Zheng and Lin Gan and Guangwen Yang},
  title           = {Optimizing Convolutional Neural Networks on the
    {Sunway TaihuLight Supercomputer}},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {13:1--13:??},
  articleno       = {13},
  doi             = {https://doi.org/10.1145/3177885},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/super.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The Sunway TaihuLight supercomputer is powered by
    SW26010, a new 260-core processor designed with on-chip
    fusion of heterogeneous cores. In this article, we
    present our work on optimizing the training process of
    convolutional neural networks (CNNs) on the Sunway
    TaihuLight supercomputer. Specifically, a highly
    efficient library (swDNN) and a customized Caffe
    framework (swCaffe) are proposed. Architecture-oriented
    optimization methods targeting the many-core
    architecture of SW26010 are introduced and are able to
    achieve 48$ \times $ speedup for the convolution
    routine in swDNN and 4$ \times $ speedup for the
    complete training process of the VGG-16 network using
    swCaffe, compared to the unoptimized algorithm and
    framework. Compared to the cuDNN library and the Caffe
    framework based on the NVIDIA K40m GPU, the proposed
    swDNN library and swCaffe framework on SW26010 have
    nearly half the performance of K40m in single-precision
    and have 3.6$ \times $ and 1.8$ \times $ speedup over
    K40m in double precision, respectively.},
}

%%% TACO 15(1), article 14 (April 2018).
@article{Mbakoyiannis:2018:EPC,
  author          = {Dimitrios Mbakoyiannis and Othon Tomoutzoglou and
    George Kornaros},
  title           = {Energy-Performance Considerations for Data Offloading
    to {FPGA}-Based Accelerators Over {PCIe}},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {14:1--14:??},
  articleno       = {14},
  doi             = {https://doi.org/10.1145/3180263},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Modern data centers increasingly employ FPGA-based
    heterogeneous acceleration platforms as a result of
    their great potential for continued performance and
    energy efficiency. Today, FPGAs provide more hardware
    parallelism than is possible with GPUs or CPUs, whereas
    C-like programming environments facilitate shorter
    development time, even close to software cycles. In
    this work, we address limitations and overheads in
    access and transfer of data to accelerators over common
    CPU-accelerator interconnects such as PCIe. We present
    three different FPGA accelerator dispatching methods
    for streaming applications (e.g., multimedia, vision
    computing). The first uses zero-copy data transfers and
    on-chip scratchpad memory (SPM) for energy efficiency,
    and the second uses also zero-copy but shared copy
    engines among different accelerator instances and local
    external memory. The third uses the processor's memory
    management unit to acquire the physical address of user
    pages and uses scatter-gather data transfers with SPM.
    Even though all techniques exhibit advantages in terms
    of scalability and relieve the processor from control
    overheads through using integrated schedulers, the
    first method presents the best energy-efficient
    acceleration in streaming applications.},
}

%%% TACO 15(1), article 15 (April 2018).
@article{Lin:2018:GPV,
  author          = {Zhen Lin and Michael Mantor and Huiyang Zhou},
  title           = {{GPU} Performance vs. Thread-Level Parallelism:
    Scalability Analysis and a Novel Way to Improve {TLP}},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {15:1--15:??},
  articleno       = {15},
  doi             = {https://doi.org/10.1145/3177964},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Graphics Processing Units (GPUs) leverage massive
    thread-level parallelism (TLP) to achieve high
    computation throughput and hide long memory latency.
    However, recent studies have shown that the GPU
    performance does not scale with the GPU occupancy or
    the degrees of TLP that a GPU supports, especially for
    memory-intensive workloads. The current understanding
    points to L1 D-cache contention or off-chip memory
    bandwidth. In this article, we perform a novel
    scalability analysis from the perspective of throughput
    utilization of various GPU components, including
    off-chip DRAM, multiple levels of caches, and the
    interconnect between L1 D-caches and L2 partitions. We
    show that the interconnect bandwidth is a critical
    bound for GPU performance scalability. For the
    applications that do not have saturated throughput
    utilization on a particular resource, their performance
    scales well with increased TLP. To improve TLP for such
    applications efficiently, we propose a fast context
    switching approach. When a warp/thread block (TB) is
    stalled by a long latency operation, the context of the
    warp/TB is spilled to spare on-chip resource so that a
    new warp/TB can be launched. The switched-out warp/TB
    is switched back when another warp/TB is completed or
    switched out. With this fine-grain fast context
    switching, higher TLP can be supported without
    increasing the sizes of critical resources like the
    register file. Our experiment shows that the
    performance can be improved by up to 47\% and a
    geometric mean of 22\% for a set of applications with
    unsaturated throughput utilization. Compared to the
    state-of-the-art TLP improvement scheme, our proposed
    scheme achieves 12\% higher performance on average and
    16\% for unsaturated benchmarks.},
}

%%% TACO 15(1), article 16 (April 2018).
@article{Zinenko:2018:VPM,
  author          = {Oleksandr Zinenko and St{\'e}phane Huot and C{\'e}dric
    Bastoul},
  title           = {Visual Program Manipulation in the Polyhedral Model},
  journal         = j-TACO,
  year            = {2018},
  month           = apr,
  volume          = {15},
  number          = {1},
  pages           = {16:1--16:??},
  articleno       = {16},
  doi             = {https://doi.org/10.1145/3177961},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Parallelism is one of the key performance sources in
    modern computer systems. When heuristics-based
    automatic parallelization fails to improve performance,
    a cumbersome and error-prone manual transformation is
    often required. As a solution, we propose an
    interactive visual approach building on the polyhedral
    model that visualizes exact dependencies and
    parallelism; decomposes and replays a complex
    automatically computed transformation step by step; and
    allows for directly manipulating the visual
    representation as a means of transforming the program
    with immediate feedback. User studies suggest that our
    visualization is understood by experts and nonexperts
    alike, and that it may favor an exploratory approach.},
}

%%% TACO 15(2), article 17 (June 2018).
%%% The abstract's `ReveNAND---our' carries an em-dash: the dash separates
%%% the design's name from the appositive phrase describing it.
@Article{Shihab:2018:RFD,
  author =       "Mustafa M. Shihab and Jie Zhang and Myoungsoo Jung and
                 Mahmut Kandemir",
  title =        "{ReveNAND}: a Fast-Drift-Aware Resilient {$3$D} {NAND}
                 Flash Design",
  journal =      j-TACO,
  volume =       "15",
  number =       "2",
  pages =        "17:1--17:??",
  month =        jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3184744",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The paradigm shift from planar (two dimensional (2D))
                 to vertical (three-dimensional (3D)) models has placed
                 the NAND flash technology on the verge of a design
                 evolution that can handle the demands of
                 next-generation storage applications. However, it also
                 introduces challenges that may obstruct the realization
                 of such 3D NAND flash. Specifically, we observed that
                 the fast threshold drift (fast-drift) in a charge-trap
                 flash-based 3D NAND cell can make it lose a critical
                 fraction of the stored charge relatively soon after
                 programming and generate errors. In this work, we first
                 present an elastic read reference (V$_{Ref}$) scheme
                 (ERR) for reducing such errors in ReveNAND---our
                 fast-drift aware 3D NAND design. To address the
                 inherent limitation of the adaptive V$_{Ref}$, we
                 introduce a new intra-block page organization
                 (hitch-hike) that can enable stronger error correction
                 for the error-prone pages. In addition, we propose a
                 novel reinforcement-learning-based smart data refill
                 scheme (iRefill) to counter the impact of fast-drift
                 with minimum performance and hardware overhead.
                 Finally, we present the first analytic model to
                 characterize fast-drift and evaluate its system-level
                 impact. Our results show that, compared to conventional
                 3D NAND design, our ReveNAND can reduce fast-drift
                 errors by 87\%, on average, and can lower the ECC
                 latency and energy overheads by 13$ \times $ and 10$
                 \times $, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 15(2), article 18 (June 2018).
@article{Zahedi:2018:MHD,
  author          = {Seyed Majid Zahedi and Songchun Fan and Benjamin C.
    Lee},
  title           = {Managing Heterogeneous Datacenters with Tokens},
  journal         = j-TACO,
  year            = {2018},
  month           = jun,
  volume          = {15},
  number          = {2},
  pages           = {18:1--18:??},
  articleno       = {18},
  doi             = {https://doi.org/10.1145/3191821},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Ensuring fairness in a system with scarce, preferred
    resources requires time sharing. We consider a
    heterogeneous system with a few ``big'' and many
    ``small'' processors. We allocate heterogeneous
    processors using a novel token mechanism, which frames
    the allocation problem as a repeated game. At each
    round, users request big processors and spend a token
    if their request is granted. We analyze the game and
    optimize users' strategies to produce an equilibrium.
    In equilibrium, allocations balance performance and
    fairness. Our mechanism outperforms classical, fair
    mechanisms by 1.7$ \times $, on average, in performance
    gains, and is competitive with a performance maximizing
    mechanism.},
}

@article{Pericas:2018:EPA,
  author = {Miquel Peric{\`a}s},
  title = {{Elastic Places}: an Adaptive Resource Manager for
    Scalable and Portable Performance},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {19:1--19:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3185458},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {The diversity and complexity of modern computing
    platforms makes the development of high-performance
    software challenging. Designing scalable software
    requires tuning for a large set of resources, including
    cores (parallelism), memory bandwidths, and various
    levels of private and shared caches, as well as
    developing strategies for optimizing locality. But
    highly optimized implementations are often inefficient
    when executed on a different platform. This is the
    performance portability problem. One approach to
    scalability and portability is to tune the amount of
    work per task based on runtime overheads and
    concurrency. This results in a better balance between
    parallelism and scheduling overheads, but it can
    neither tune data reuse nor avoid inter-task
    interference. We propose a complementary approach that
    consists in tuning the amount of resources allocated to
    tasks and combine it with software-defined task
    topologies to provide portable locality. These ideas
    are combined into a low-overhead resource management
    scheme called Elastic Places. Elastic Places is
    implemented in the XiTAO software framework but the
    core ideas are equally applicable to other languages
    and runtimes. Experimental results on an AMD-based NUMA
    machine and an Intel Knights Landing system show that
    elastic places provides both high scalability and
    performance portability, with speed-ups of up to 2.3$
    \times $ on both platforms compared to state-of-the-art
    runtimes.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {19},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Olson:2018:CLM,
  author =       "Matthew Benjamin Olson and Joseph T. Teague and
                 Divyani Rao and Michael R. Jantz and Kshitij A. Doshi
                 and Prasad A. Kulkarni",
  title =        "Cross-Layer Memory Management to Improve {DRAM} Energy
                 Efficiency",
  journal =      j-TACO,
  volume =       "15",
  number =       "2",
  pages =        "20:1--20:??",
  month =        jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3196886",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Controlling the distribution and usage of memory power
                 is often difficult, because these effects typically
                 depend on activity across multiple layers of the
                 vertical execution stack. To address this challenge, we
                 construct a novel and collaborative framework that
                 employs object placement, cross-layer communication,
                 and page-level management to effectively distribute
                 application objects in the DRAM hardware to achieve
                 desired power/performance goals. This work describes
                 the design and implementation of our framework, which
                 is the first to integrate automatic object profiling
                 and analysis at the application layer with fine-grained
                 management of memory hardware resources in the
                 operating system. We demonstrate the utility of this
                 framework by employing it to control memory power
                 consumption more effectively. First, we design a custom
                 memory-intensive workload to show the potential of this
                 approach to reduce DRAM energy consumption. Next, we
                 develop sampling and profiling-based analyses and
                 modify the code generator in the HotSpot VM to
                 understand object usage patterns and automatically
                 control the placement of hot and cold objects in a
                 partitioned VM heap. This information is communicated
                 to the operating system, which uses it to map the
                 logical application pages to the appropriate DRAM
                 modules according to user-defined provisioning goals.
                 The evaluation shows that our Java VM-based framework
                 achieves our goal of significant DRAM energy savings
                 across a variety of workloads, without any source code
                 modifications or recompilations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Zoni:2018:DEP,
  author = {Davide Zoni and Luca Colombo and William Fornaciari},
  title = {{DarkCache}: Energy-Performance Optimization of Tiled
    Multi-Cores by Adaptively Power-Gating {LLC} Banks},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {21:1--21:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3186895},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {The Last Level Cache (LLC) is a key element to improve
    application performance in multi-cores. To handle the
    worst case, the main design trend employs tiled
    architectures with a large LLC organized in banks,
    which goes underutilized in several realistic
    scenarios. Our proposal, named DarkCache, aims at
    properly powering off such unused banks to optimize the
    Energy-Delay Product (EDP) through an adaptive cache
    reconfiguration, thus aggressively reducing the leakage
    energy. The implemented solution is general and it can
    recognize and skip the activation of the DarkCache
    policy for the few strong memory intensive applications
    that actually require the use of the entire LLC. The
    validation has been carried out on 16- and 64-core
    architectures also accounting for two state-of-the-art
    methodologies. Compared to the baseline solution,
    DarkCache exhibits a performance overhead within 2\%
    and an average EDP improvement of 32.58\% and 36.41\%
    considering 16 and 64 cores, respectively. Moreover,
    DarkCache shows an average EDP gain between 16.15\% (16
    cores) and 21.05\% (64 cores) compared to the best
    state-of-the-art we evaluated, and it confirms a good
    scalability since the gain improves with the size of
    the architecture.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {21},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Zhang:2018:CNC,
  author = {Yang Zhang and Dan Feng and Wei Tong and Yu Hua and
    Jingning Liu and Zhipeng Tan and Chengning Wang and
    Bing Wu and Zheng Li and Gaoxiang Xu},
  title = {{CACF}: a Novel Circuit Architecture Co-optimization
    Framework for Improving Performance, Reliability and
    Energy of {ReRAM}-based Main Memory System},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {22:1--22:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3195799},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Emerging Resistive Random Access Memory (ReRAM) is a
    promising candidate as the replacement for DRAM due to
    its low standby power, high density, high scalability,
    and nonvolatility. By employing the unique crossbar
    structure, ReRAM can be constructed with extremely high
    density. However, the crossbar ReRAM faces some serious
    challenges in terms of performance, reliability, and
    energy consumption. First, ReRAM's crossbar structure
    causes an IR drop problem due to wire resistance and
    sneak currents, which results in nonuniform access
    latency in ReRAM banks and reduces its reliability.
    Second, without access transistors in the crossbar
    structure, write disturbance results in serious data
    reliability problem. Third, the access latency,
    reliability, and energy use of ReRAM arrays are
    significantly influenced by the data patterns involved
    in a write operation. To overcome the challenges of the
    crossbar ReRAM, we propose a novel circuit architecture
    co-optimization framework for improving the
    performance, reliability, and energy use of ReRAM-based
    main memory system, called CACF. The proposed CACF
    consists of three levels, including the circuit level,
    circuit architecture level, and architecture level. At
    the circuit level, to reduce the IR drops along
    bitlines, we propose a double-sided write driver design
    by applying write drivers along both sides of bitlines
    and selectively activating the write drivers. At the
    circuit architecture level, to address the write
    disturbance with low overheads, we propose a RESET
    disturbance detection scheme by adding disturbance
    reference cells and conditionally performing refresh
    operations. At the architecture level, a region
    partition with address remapping method is proposed to
    leverage the nonuniform access latency in ReRAM banks,
    and two flip schemes are proposed in different regions
    to optimize the data patterns involved in a write
    operation. The experimental results show that CACF
    improves system performance by 26.1\%, decreases memory
    access latency by 22.4\%, shortens running time by
    20.1\%, and reduces energy consumption by 21.6\% on
    average over an aggressive baseline. Meanwhile, CACF
    significantly improves the reliability of ReRAM-based
    memory systems.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {22},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Stawinoga:2018:PTC,
  author = {Nicolai Stawinoga and Tony Field},
  title = {Predictable Thread Coarsening},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {23:1--23:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3194242},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Thread coarsening on GPUs combines the work of several
    threads into one. We show how thread coarsening can be
    implemented as a fully automated compile-time
    optimisation that estimates the optimal coarsening
    factor based on a low-cost, approximate static analysis
    of cache line re-use and an occupancy prediction model.
    We evaluate two coarsening strategies on three
    different NVidia GPU architectures. For NVidia
    reduction kernels we achieve a maximum speedup of
    5.08x, and for the Rodinia benchmarks we achieve a mean
    speedup of 1.30x over 8 of 19 kernels that were
    determined safe to coarsen.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {23},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Roy:2018:NCN,
  author = {Probir Roy and Shuaiwen Leon Song and Sriram
    Krishnamoorthy and Abhinav Vishnu and Dipanjan Sengupta
    and Xu Liu},
  title = {{NUMA-Caffe}: {NUMA}-Aware Deep Learning Neural
    Networks},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {24:1--24:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3199605},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Convolution Neural Networks (CNNs), a special
    subcategory of Deep Learning Neural Networks (DNNs),
    have become increasingly popular in industry and
    academia for their powerful capability in pattern
    classification, image processing, and speech
    recognition. Recently, they have been widely adopted in
    High Performance Computing (HPC) environments for
    solving complex problems related to modeling, runtime
    prediction, and big data analysis. Current
    state-of-the-art designs for DNNs on modern multi- and
    many-core CPU architectures, such as variants of Caffe,
    have reported promising performance in speedup and
    scalability, comparable with the GPU implementations.
    However, modern CPU architectures employ Non-Uniform
    Memory Access (NUMA) technique to integrate multiple
    sockets, which incurs unique challenges for designing
    highly efficient CNN frameworks. Without a careful
    design, DNN frameworks can easily suffer from long
    memory latency due to a large number of memory accesses
    to remote NUMA domains, resulting in poor scalability.
    To address this challenge, we propose NUMA-aware
    multi-solver-based CNN design, named NUMA-Caffe, for
    accelerating deep learning neural networks on multi-
    and many-core CPU architectures. NUMA-Caffe is
    independent of DNN topology, does not impact network
    convergence rates, and provides superior scalability to
    the existing Caffe variants. Through a thorough
    empirical study on four contemporary NUMA-based multi-
    and many-core architectures, our experimental results
    demonstrate that NUMA-Caffe significantly outperforms
    the state-of-the-art Caffe designs in terms of both
    throughput and scalability.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {24},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Ejaz:2018:DDD,
  author = {Ahsen Ejaz and Vassilios Papaefstathiou and Ioannis
    Sourdis},
  title = {{DDRNoC}: Dual Data-Rate Network-on-Chip},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {25:1--25:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3200201},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {This article introduces DDRNoC, an on-chip
    interconnection network capable of routing packets at
    Dual Data Rate. The cycle time of current 2D-mesh
    Network-on-Chip routers is limited by their control as
    opposed to the datapath (switch and link traversal),
    which exhibits significant slack. DDRNoC capitalizes on
    this observation, allowing two flits per cycle to share
    the same datapath. Thereby, DDRNoC achieves higher
    throughput than a Single Data Rate (SDR) network.
    Alternatively, using lower voltage circuits, the above
    slack can be exploited to reduce power consumption
    while matching the SDR network throughput. In addition,
    DDRNoC exhibits reduced clock distribution power,
    improving energy efficiency, as it needs a slower clock
    than a SDR network that routes packets at the same
    rate. Post place and route results in 28nm technology
    show that, compared to an iso-voltage (1.1V) SDR
    network, DDRNoC improves throughput proportionally to
    the SDR datapath slack. Moreover, a low-voltage (0.95V)
    DDRNoC implementation converts that slack to power
    reduction offering the 1.1V SDR throughput at a
    substantially lower energy cost.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {25},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Cai:2018:ESH,
  author = {Ying Cai and Yulong Ao and Chao Yang and Wenjing Ma
    and Haitao Zhao},
  title = {Extreme-Scale High-Order {WENO} Simulations of {$3$-D}
    Detonation Wave with 10 Million Cores},
  journal = j-TACO,
  volume = {15},
  number = {2},
  pages = {26:1--26:??},
  month = jun,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3209208},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {High-order stencil computations, frequently found in
    many applications, pose severe challenges to emerging
    many-core platforms due to the complexities of hardware
    architectures as well as the sophisticated computing
    and data movement patterns. In this article, we tackle
    the challenges of high-order WENO computations in
    extreme-scale simulations of 3D gaseous waves on Sunway
    TaihuLight. We design efficient parallelization
    algorithms and present effective optimization
    techniques to fully exploit various parallelisms with
    reduced memory footprints, enhanced data reuse, and
    balanced computation load. Test results show the
    optimized code can scale to 9.98 million cores, solving
    12.74 trillion unknowns with 23.12 Pflops
    double-precision performance.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {26},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sfakianakis:2018:QPB,
  author =       "Yannis Sfakianakis and Christos Kozanitis and Christos
                 Kozyrakis and Angelos Bilas",
  title =        "{QuMan}: Profile-based Improvement of Cluster
                 Utilization",
  journal =      j-TACO,
  volume =       "15",
  number =       "3",
  pages =        "27:1--27:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3210560",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern data centers consolidate workloads to increase
                 server utilization and reduce total cost of ownership,
                 and cope with scaling limitations. However, server
                 resource sharing introduces performance interference
                 across applications and, consequently, increases
                 performance volatility, which negatively affects user
                 experience. Thus, a challenging problem is to increase
                 server utilization while maintaining application QoS.
                 In this article, we present QuMan, a server resource
                 manager that uses application isolation and profiling
                 to increase server utilization while controlling
                 degradation of application QoS. Previous solutions,
                 either estimate interference across applications and
                 then restrict colocation to ``compatible''
                 applications, or assume that application requirements
                 are known. Instead, QuMan estimates the required
                 resources of applications. It uses an isolation
                 mechanism to create properly-sized resource slices for
                 applications, and arbitrarily colocates applications.
                 QuMan's mechanisms can be used with a variety of
                 admission control policies, and we explore the
                 potential of two such policies: (1) A policy that
                 allows users to specify a minimum performance threshold
                 and (2) an automated policy, which operates without
                 user input and is based on a new combined
                 QoS-utilization metric. We implement QuMan on top of
                 Linux servers, and we evaluate its effectiveness using
                 containers and real applications. Our single-node
                 results show that QuMan balances highly effectively the
                 tradeoff between server utilization and application
                 performance, as it achieves 80\% server utilization
                 while the performance of each application does not drop
                 below 80\% the respective standalone performance. We
                 also deploy QuMan on a cluster of 100 AWS instances
                 that are managed by a modified version of the Sparrow
                 scheduler [37] and, we observe a 48\% increase in
                 application performance on a highly utilized cluster,
                 compared to the performance of the same cluster under
                 the same load when it is managed by native Sparrow or
                 Apache Mesos.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Kayraklioglu:2018:LLA,
  author = {Engin Kayraklioglu and Michael P. Ferguson and Tarek
    El-Ghazawi},
  title = {{LAPPS}: Locality-Aware Productive Prefetching Support
    for {PGAS}},
  journal = j-TACO,
  volume = {15},
  number = {3},
  pages = {28:1--28:??},
  month = oct,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3233299},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Prefetching is a well-known technique to mitigate
    scalability challenges in the Partitioned Global
    Address Space (PGAS) model. It has been studied as
    either an automated compiler optimization or a manual
    programmer optimization. Using the PGAS locality
    awareness, we define a hybrid tradeoff. Specifically,
    we introduce locality-aware productive prefetching
    support for PGAS. Our novel, user-driven approach
    strikes a balance between the ease-of-use of
    compiler-based automated prefetching and the high
    performance of the laborious manual prefetching. Our
    prototype implementation in Chapel shows that
    significant scalability and performance improvements
    can be achieved with minimal effort in common
    applications.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {28},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Benatia:2018:BSM,
  author = {Akrem Benatia and Weixing Ji and Yizhuo Wang and Feng
    Shi},
  title = {{BestSF}: a Sparse Meta-Format for Optimizing {SpMV}
    on {GPU}},
  journal = j-TACO,
  volume = {15},
  number = {3},
  pages = {29:1--29:??},
  month = oct,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3226228},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {The Sparse Matrix-Vector Multiplication (SpMV) kernel
    dominates the computing cost in numerous scientific
    applications. Many implementations based on different
    sparse formats were proposed to improve this kernel on
    the recent GPU architectures. However, it has been
    widely observed that there is no ``best-for-all''
    sparse format for the SpMV kernel on GPU. Indeed,
    serious performance degradation of an order of
    magnitude can be observed without a careful selection
    of the sparse format to use. To address this problem,
    we propose in this article BestSF (Best Sparse Format),
    a new learning-based sparse meta-format that
    automatically selects the most appropriate sparse
    format for a given input matrix. To do so, BestSF
    relies on a cost-sensitive classification system
    trained using Weighted Support Vector Machines (WSVMs)
    to predict the best sparse format for each input sparse
    matrix. Our experimental results on two different
    NVIDIA GPU architectures using a large number of
    real-world sparse matrices show that BestSF achieved a
    noticeable overall performance improvement over using a
    single sparse format. While BestSF is trained to select
    the best sparse format in terms of performance
    (GFLOPS), our further experimental investigations
    revealed that using BestSF also led, in most of the
    test cases, to the best energy efficiency (MFLOPS/W).
    To prove its practical effectiveness, we also evaluate
    the performance and energy efficiency improvement
    achieved when using BestSF as a building block in a
    GPU-based Preconditioned Conjugate Gradient (PCG)
    iterative solver.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {29},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Michaud:2018:ATL,
  author = {Pierre Michaud},
  title = {An Alternative {TAGE}-like Conditional Branch
    Predictor},
  journal = j-TACO,
  volume = {15},
  number = {3},
  pages = {30:1--30:??},
  month = oct,
  year = {2018},
  coden = {????},
  doi = {https://doi.org/10.1145/3226098},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Tue Jan 8 17:19:59 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {TAGE is one of the most accurate conditional branch
    predictors known today. However, TAGE does not exploit
    its input information perfectly, as it is possible to
    obtain significant prediction accuracy improvements by
    complementing TAGE with a statistical corrector using
    the same input information. This article proposes an
    alternative TAGE-like predictor making statistical
    correction practically superfluous.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {30},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Garland:2018:LCM,
  author          = {James Garland and David Gregg},
  title           = {Low Complexity Multiply-Accumulate Units for
                     Convolutional Neural Networks with Weight-Sharing},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {31:1--31:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3233300},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Convolutional neural networks (CNNs) are one of the
                     most successful machine-learning techniques for image,
                     voice, and video processing. CNNs require large amounts
                     of processing capacity and memory bandwidth. Hardware
                     accelerators have been proposed for CNNs that typically
                     contain large numbers of multiply-accumulate (MAC)
                     units, the multipliers of which are large in integrated
                     circuit (IC) gate count and power consumption.
                     ``Weight-sharing'' accelerators have been proposed
                     where the full range of weight values in a trained CNN
                     are compressed and put into bins, and the bin index is
                     used to access the weight-shared value. We reduce power
                     and area of the CNN by implementing parallel accumulate
                     shared MAC (PASM) in a weight-shared CNN. PASM
                     re-architects the MAC to instead count the frequency of
                     each weight and place it in a bin. The accumulated
                     value is computed in a subsequent multiply phase,
                     significantly reducing gate count and power consumption
                     of the CNN. In this article, we implement PASM in a
                     weight-shared CNN convolution hardware accelerator and
                     analyze its effectiveness. Experiments show that for a
                     clock speed 1GHz implemented on a 45nm ASIC process our
                     approach results in fewer gates, smaller logic, and
                     reduced power with only a slight increase in latency.
                     We also show that the same weight-shared-with-PASM CNN
                     accelerator can be implemented in resource-constrained
                     FPGAs, where the FPGA has limited numbers of digital
                     signal processor (DSP) units to accelerate the MAC
                     operations.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {31},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Kim:2018:CEC,
  author          = {Hyojong Kim and Ramyad Hadidi and Lifeng Nai and
                     Hyesoon Kim and Nuwan Jayasena and Yasuko Eckert and
                     Onur Kayiran and Gabriel Loh},
  title           = {{CODA}: Enabling Co-location of Computation and Data
                     for Multiple {GPU} Systems},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {32:1--32:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3232521},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {To exploit parallelism and scalability of multiple
                     GPUs in a system, it is critical to place compute and
                     data together. However, two key techniques that have
                     been used to hide memory latency and improve
                     thread-level parallelism (TLP), memory interleaving,
                     and thread block scheduling, in traditional GPU systems
                     are at odds with efficient use of multiple GPUs.
                     Distributing data across multiple GPUs to improve
                     overall memory bandwidth utilization incurs high remote
                     traffic when the data and compute are misaligned.
                     Nondeterministic thread block scheduling to improve
                     compute resource utilization impedes co-placement of
                     compute and data. Our goal in this work is to enable
                     co-placement of compute and data in the presence of
                     fine-grained interleaved memory with a low-cost
                     approach. To this end, we propose a mechanism that
                     identifies exclusively accessed data and place the data
                     along with the thread block that accesses it in the
                     same GPU. The key ideas are (1) the amount of data
                     exclusively used by a thread block can be estimated,
                     and that exclusive data (of any size) can be localized
                     to one GPU with coarse-grained interleaved pages; (2)
                     using the affinity-based thread block scheduling
                     policy, we can co-place compute and data together; and
                     (3) by using dual address mode with lightweight changes
                     to virtual to physical page mappings, we can
                     selectively choose different interleaved memory pages
                     for each data structure. Our evaluations across a wide
                     range of workloads show that the proposed mechanism
                     improves performance by 31\% and reduces 38\% remote
                     traffic over a baseline system.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {32},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Manivannan:2018:GDB,
  author =       "Madhavan Manivannan and Miquel Peric{\`a}s and
                 Vassilis Papaefstathiou and Per Stenstr{\"o}m",
  title =        "Global Dead-Block Management for Task-Parallel
                 Programs",
  journal =      j-TACO,
  volume =       "15",
  number =       "3",
  pages =        "33:1--33:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3234337",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Task-parallel programs inefficiently utilize the cache
                 hierarchy due to the presence of dead blocks in caches.
                 Dead blocks may occupy cache space in multiple cache
                 levels for a long time without providing any utility
                 until they are finally evicted. Existing dead-block
                 prediction schemes take decisions locally for each
                 cache level and do not efficiently manage the entire
                 cache hierarchy. This article introduces
                 runtime-orchestrated global dead-block management, in
                 which static and dynamic information about tasks
                 available to the runtime system is used to effectively
                 detect and manage dead blocks across the cache
                 hierarchy. In the proposed global management schemes,
                 static information (e.g., when tasks start/finish, and
                 what data regions tasks produce/consume) is combined
                 with dynamic information to detect when/where blocks
                 become dead. When memory regions are deemed dead at
                 some cache level(s), all the associated cache blocks
                 are evicted from the corresponding level(s). We extend
                 the cache controllers at both private and shared cache
                 levels to use the aforementioned information to evict
                 dead blocks. The article does an extensive evaluation
                 of both inclusive and non-inclusive cache hierarchies
                 and shows that the proposed global schemes outperform
                 existing local dead-block management schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Gareev:2018:HPG,
  author          = {Roman Gareev and Tobias Grosser and Michael Kruse},
  title           = {High-Performance Generalized Tensor Operations: a
                     Compiler-Oriented Approach},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {34:1--34:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3235029},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {The efficiency of tensor contraction is of great
                     importance. Compilers cannot optimize it well enough to
                     come close to the performance of expert-tuned
                     implementations. All existing approaches that provide
                     competitive performance require optimized external
                     code. We introduce a compiler optimization that reaches
                     the performance of optimized BLAS libraries without the
                     need for an external implementation or automatic
                     tuning. Our approach provides competitive performance
                     across hardware architectures and can be generalized to
                     deliver the same benefits for algebraic path problems.
                     By making fast linear algebra kernels available to
                     everyone, we expect productivity increases when
                     optimized libraries are not available.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {34},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Yviquel:2018:CPU,
  author          = {Herv{\'e} Yviquel and Lauro Cruz and Guido Araujo},
  title           = {Cluster Programming using the {OpenMP} Accelerator
                     Model},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {35:1--35:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3226112},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Computation offloading is a programming model in which
                     program fragments (e.g., hot loops) are annotated so
                     that their execution is performed in dedicated hardware
                     or accelerator devices. Although offloading has been
                     extensively used to move computation to GPUs, through
                     directive-based annotation standards like OpenMP,
                     offloading computation to very large computer clusters
                     can become a complex and cumbersome task. It typically
                     requires mixing programming models (e.g., OpenMP and
                     MPI) and languages (e.g., C/C++ and Scala), dealing
                     with various access control mechanisms from different
                     cloud providers (e.g., AWS and Azure), and integrating
                     all this into a single application. This article
                     introduces computer cluster nodes as simple OpenMP
                     offloading devices that can be used either from a local
                     computer or from the cluster head-node. It proposes a
                     methodology that transforms OpenMP directives to Spark
                     runtime calls with fully integrated communication
                     management, in a way that a cluster appears to the
                     programmer as yet another accelerator device.
                     Experiments using LLVM 3.8, OpenMP 4.5 on well known
                     cloud infrastructures (Microsoft Azure and Amazon EC2)
                     show the viability of the proposed approach, enable a
                     thorough analysis of its performance, and make a
                     comparison with an MPI implementation. The results show
                     that although data transfers can impose overheads,
                     cloud offloading from a local machine can still achieve
                     promising speedups for larger granularity: up to 115$
                     \times $ in 256 cores for the 2MM benchmark using 1GB
                     sparse matrices. In addition, the parallel
                     implementation of a complex and relevant scientific
                     application reveals a 80$ \times $ speedup on a 320
                     core machine when executed directly from the headnode
                     of the cluster.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {35},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Tavana:2018:BCA,
  author          = {Mohammad Khavari Tavana and Amir Kavyan Ziabari and
                     David Kaeli},
  title           = {Block Cooperation: Advancing Lifetime of Resistive
                     Memories by Increasing Utilization of Error Correcting
                     Codes},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {36:1--36:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3243906},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Block-level cooperation is an endurance management
                     technique that operates on top of error correction
                     mechanisms to extend memory lifetimes. Once an error
                     recovery scheme fails to recover from faults in a data
                     block, the entire physical page associated with that
                     block is disabled and becomes unavailable to the
                     physical address space. To reduce the page waste caused
                     by early block failures, other blocks can be used to
                     support the failed block, working cooperatively to keep
                     it alive and extend the faulty page's lifetime. We
                     combine the proposed technique with existing error
                     recovery schemes, such as Error Correction Pointers
                     (ECP) and Aegis, to increase memory lifetimes. Block
                     cooperation is realized through metadata sharing in
                     ECP, where one data block shares its unused metadata
                     with another data block. When combined with Aegis,
                     block cooperation is realized through reorganizing data
                     layout, where blocks possessing few faults come to the
                     aid of failed blocks, bringing them back from the dead.
                     Our evaluation using Monte Carlo simulation shows that
                     block cooperation at a single level (or multiple
                     levels) on top of ECP and Aegis, boosts memory
                     lifetimes by 28\% (37\%) and 8\% (14\%) on average,
                     respectively. Furthermore, using trace-driven benchmark
                     evaluation shows that lifetime boost can reach to 68\%
                     (30\%) exploiting metadata sharing (or data layout
                     reorganization).},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Jin:2018:LCM,
  author          = {Hai Jin and Bo Liu and Wenbin Jiang and Yang Ma and
                     Xuanhua Shi and Bingsheng He and Shaofeng Zhao},
  title           = {Layer-Centric Memory Reuse and Data Migration for
                     Extreme-Scale Deep Learning on Many-Core
                     Architectures},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {37:1--37:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3243904},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Due to the popularity of Deep Neural Network (DNN)
                     models, we have witnessed extreme-scale DNN models with
                     the continued increase of the scale in terms of depth
                     and width. However, the extremely high memory
                     requirements for them make it difficult to run the
                     training processes on single many-core architectures
                     such as a Graphic Processing Unit (GPU), which compels
                     researchers to use model parallelism over multiple GPUs
                     to make it work. However, model parallelism always
                     brings very heavy additional overhead. Therefore,
                     running an extreme-scale model in a single GPU is
                     urgently required. There still exist several challenges
                     to reduce the memory footprint for extreme-scale deep
                     learning. To address this tough problem, we first
                     identify the memory usage characteristics for deep and
                     wide convolutional networks, and demonstrate the
                     opportunities for memory reuse at both the intra-layer
                     and inter-layer levels. We then present Layrub, a
                     runtime data placement strategy that orchestrates the
                     execution of the training process. It achieves
                     layer-centric reuse to reduce memory consumption for
                     extreme-scale deep learning that could not previously
                     be run on a single GPU. Experiments show that, compared
                     to the original Caffe, Layrub can cut down the memory
                     usage rate by an average of 58.2\% and by up to 98.9\%,
                     at the moderate cost of 24.1\% higher training
                     execution time on average. Results also show that
                     Layrub outperforms some popular deep learning systems
                     such as GeePS, vDNN, MXNet, and Tensorflow. More
                     importantly, Layrub can tackle extreme-scale deep
                     learning tasks. For example, it makes an extra-deep
                     ResNet with 1,517 layers that can be trained
                     successfully in one GPU with 12GB memory, while other
                     existing deep learning systems cannot.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Voitsechov:2018:SDT,
  author          = {Dani Voitsechov and Arslan Zulfiqar and Mark
                     Stephenson and Mark Gebhart and Stephen W. Keckler},
  title           = {Software-Directed Techniques for Improved {GPU}
                     Register File Utilization},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {38:1--38:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3243905},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Throughput architectures such as GPUs require
                     substantial hardware resources to hold the state of a
                     massive number of simultaneously executing threads.
                     While GPU register files are already enormous, reaching
                     capacities of 256KB per streaming multiprocessor (SM),
                     we find that nearly half of real-world applications we
                     examined are register-bound and would benefit from a
                     larger register file to enable more concurrent threads.
                     This article seeks to increase the thread occupancy and
                     improve performance of these register-bound
                     applications by making more efficient use of the
                     existing register file capacity. Our first technique
                     eagerly deallocates register resources during
                     execution. We show that releasing register resources
                     based on value liveness as proposed in prior states of
                     the art leads to unreliable performance and undue
                     design complexity. To address these deficiencies, our
                     article presents a novel compiler-driven approach that
                     identifies and exploits last use of a register name
                     (instead of the value contained within) to eagerly
                     release register resources. Furthermore, while previous
                     works have leveraged ``scalar'' and ``narrow'' operand
                     properties of a program for various optimizations,
                     their impact on thread occupancy has been relatively
                     unexplored. Our article evaluates the effectiveness of
                     these techniques in improving thread occupancy and
                     demonstrates that while any one approach may fail to
                     free very many registers, together they synergistically
                     free enough registers to launch additional parallel
                     work. An in-depth evaluation on a large suite of
                     applications shows that just our early register
                     technique outperforms previous work on dynamic register
                     allocation, and together these approaches, on average,
                     provide 12\% performance speedup (23\% higher thread
                     occupancy) on register bound applications not already
                     saturating other GPU resources.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Lin:2018:GTD,
  author          = {Huanxin Lin and Cho-Li Wang and Hongyuan Liu},
  title           = {{On-GPU} Thread-Data Remapping for Branch Divergence
                     Reduction},
  journal         = j-TACO,
  volume          = {15},
  number          = {3},
  pages           = {39:1--39:??},
  month           = oct,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3242089},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:19:59 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {General Purpose GPU computing (GPGPU) plays an
                     increasingly vital role in high performance computing
                     and other areas like deep learning. However, arising
                     from the SIMD execution model, the branch divergence
                     issue lowers efficiency of conditional branching on
                     GPUs, and hinders the development of GPGPU. To achieve
                     runtime on-the-spot branch divergence reduction, we
                     propose the first on-GPU thread-data remapping scheme.
                     Before kernel launching, our solution inserts codes
                     into GPU kernels immediately before each target branch
                     so as to acquire actual runtime divergence information.
                     GPU software threads can be remapped to datasets
                     multiple times during single kernel execution. We
                     propose two thread-data remapping algorithms that are
                     tailored to the GPU architecture. Effective on two
                     generations of GPUs from both NVIDIA and AMD, our
                     solution achieves speedups up to 2.718 with third-party
                     benchmarks. We also implement three GPGPU frontier
                     benchmarks from areas including computer vision,
                     algorithmic trading and data analytics. They are
                     hindered by more complex divergence coupled with
                     different memory access patterns, and our solution
                     works better than the traditional thread-data remapping
                     scheme in all cases. As a compiler-assisted runtime
                     solution, it can better reduce divergence for divergent
                     applications that gain little acceleration on GPUs for
                     the time being.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {39},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Kronawitter:2019:PSS,
  author          = {Stefan Kronawitter and Christian Lengauer},
  title           = {Polyhedral Search Space Exploration in the
                     {ExaStencils} Code Generator},
  journal         = j-TACO,
  volume          = {15},
  number          = {4},
  pages           = {40:1--40:??},
  month           = jan,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3274653},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:20:00 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Performance optimization of stencil codes requires
                     data locality improvements. The polyhedron model for
                     loop transformation is well suited for such
                     optimizations with established techniques, such as the
                     PLuTo algorithm and diamond tiling. However, in the
                     domain of our project ExaStencils, stencil codes, it
                     fails to yield optimal results. As an alternative, we
                     propose a new, optimized, multi-dimensional polyhedral
                     search space exploration and demonstrate its
                     effectiveness: we obtain better results than existing
                     approaches in several cases. We also propose how to
                     specialize the search for the domain of stencil codes,
                     which dramatically reduces the exploration effort
                     without significantly impairing performance.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Xu:2019:PTA,
  author          = {Jingheng Xu and Haohuan Fu and Wen Shi and Lin Gan and
                     Yuxuan Li and Wayne Luk and Guangwen Yang},
  title           = {Performance Tuning and Analysis for Stencil-Based
                     Applications on {POWER8} Processor},
  journal         = j-TACO,
  volume          = {15},
  number          = {4},
  pages           = {41:1--41:??},
  month           = jan,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3264422},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:20:00 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {This article demonstrates an approach for combining
                     general tuning techniques with the POWER8 hardware
                     architecture through optimizing three representative
                     stencil benchmarks. Two typical real-world
                     applications, with kernels similar to those of the
                     winning programs of the Gordon Bell Prize 2016 and
                     2017, are employed to illustrate algorithm
                     modifications and a combination of hardware-oriented
                     tuning strategies with the application algorithms. This
                     work fills the gap between hardware capability and
                     software performance of the POWER8 processor, and
                     provides useful guidance for optimizing stencil-based
                     scientific applications on POWER systems.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {41},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wang:2019:SSS,
  author          = {Jiajun Wang and Reena Panda and Lizy K. John},
  title           = {{SelSMaP}: a Selective Stride Masking Prefetching
                     Scheme},
  journal         = j-TACO,
  volume          = {15},
  number          = {4},
  pages           = {42:1--42:??},
  month           = jan,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3274650},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jan 8 17:20:00 MST 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Data prefetching, which intelligently loads data
                     closer to the processor before demands, is a popular
                     cache performance optimization technique to address the
                     increasing processor-memory performance gap. Although
                     prefetching concepts have been proposed for decades,
                     sophisticated system architecture and emerging
                     applications introduce new challenges. Large
                     instruction windows coupled with out-of-order execution
                     makes the program data access sequence distorted from a
                     cache perspective. Furthermore, big data applications
                     stress memory subsystems heavily with their large
                     working set sizes and complex data access patterns. To
                     address such challenges, this work proposes a
                     high-performance hardware prefetching scheme, SelSMaP.
                     SelSMaP is able to detect both regular and nonuniform
                     stride patterns by taking the minimum observed address
                     offset (called a reference stride) as a heuristic. A
                     stride masking is generated according to the reference
                     stride and is to filter out history accesses whose
                     pattern can be rephrased as uniform stride accesses.
                     Prefetching decision and prefetch degree are determined
                     based on the masking outcome. As SelSMaP prediction
                     logic does not rely on the chronological order of data
                     accesses or program counter information, it is able to
                     unveil the effect of out-of-order execution and
                     compiler optimization. We evaluated SelSMaP with
                     CloudSuite workloads and SPEC CPU2006 benchmarks.
                     SelSMaP achieves an average CloudSuite performance
                     improvement of 30\% over nonprefetching systems. With
                     one to two orders of magnitude less storage and much
                     less functional logic, SelSMaP outperforms the
                     highest-performing prefetcher by 8.6\% in CloudSuite
                     workloads.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {42},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Su:2019:SSC,
  author =       "Xing Su and Xiangke Liao and Hao Jiang and Canqun Yang
                 and Jingling Xue",
  title =        "{SCP}: Shared Cache Partitioning for High-Performance
                 {GEMM}",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3274654",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GEneral Matrix Multiply (GEMM) is the most fundamental
                 computational kernel routine in the BLAS library. To
                 achieve high performance, in-memory data must be
                 prefetched into fast on-chip caches before they are
                 used. Two techniques, software prefetching and data
                 packing, have been used to effectively exploit the
                 capability of on-chip least recently used (LRU) caches,
                 which are popular in traditional high-performance
                 processors used in high-end servers and supercomputers.
                 However, the market has recently witnessed a new
                 diversity in processor design, resulting in
                 high-performance processors equipped with shared caches
                 with non-LRU replacement policies. This poses a
                 challenge to the development of high-performance GEMM
                 in a multithreaded context. As several threads try to
                 load data into a shared cache simultaneously,
                 interthread cache conflicts will increase
                 significantly. We present a Shared Cache Partitioning
                 (SCP) method to eliminate interthread cache conflicts
                 in the GEMM routines, by partitioning a shared cache
                 into physically disjoint sets and assigning different
                 sets to different threads. We have implemented SCP in
                 the OpenBLAS library and evaluated it on Phytium 2000+,
                 a 64-core AArch64 processor with private LRU L1 caches
                 and shared pseudo-random L2 caches (per four-core
                 cluster). Our evaluation shows that SCP has effectively
                 reduced the conflict misses in both L1 and L2 caches in
                 a highly optimized GEMM implementation, resulting in an
                 improvement of its performance by 2.75\% to 6.91\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Pereira:2019:SPS,
  author =       "Fernando Magno Quint{\~a}o Pereira and Guilherme
                 Vieira Leobas and Abdoulaye Gamati{\'e}",
  title =        "Static Prediction of Silent Stores",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "44:1--44:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3280848",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A store operation is called ``silent'' if it writes in
                 memory a value that is already there. The ability to
                 detect silent stores is important, because they might
                 indicate performance bugs, might enable code
                 optimizations, and might reveal opportunities of
                 automatic parallelization, for instance. Silent stores
                 are traditionally detected via profiling tools. In this
                 article, we depart from this methodology and instead
                 explore the following question: is it possible to
                 predict silentness by analyzing the syntax of programs?
                 The process of building an answer to this question is
                 interesting in itself, given the stochastic nature of
                 silent stores, which depend on data and coding style.
                 To build such an answer, we have developed a
                 methodology to classify store operations in terms of
                 syntactic features of programs. Based on such features,
                 we develop different kinds of predictors, some of which
                 go much beyond what any trivial approach could achieve.
                 To illustrate how static prediction can be employed in
                 practice, we use it to optimize programs running on
                 nonvolatile memory systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Crago:2019:EMA,
  author =       "Neal C. Crago and Mark Stephenson and Stephen W.
                 Keckler",
  title =        "Exposing Memory Access Patterns to Improve Instruction
                 and Memory Efficiency in {GPUs}",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "45:1--45:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3280851",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern computing workloads often have high memory
                 intensity, requiring high bandwidth access to memory.
                 The memory request patterns of these workloads vary and
                 include regular strided accesses and indirect
                 (pointer-based) accesses. Such applications require a
                 large number of address generation instructions and a
                 high degree of memory-level parallelism. This article
                 proposes new memory instructions that exploit strided
                 and indirect memory request patterns and improve
                 efficiency in GPU architectures. The new instructions
                 reduce address calculation instructions by offloading
                 addressing to dedicated hardware, and reduce
                 destructive memory request interference by grouping
                 related requests together. Our results show that we can
                 eliminate 33\% of dynamic instructions across 16 GPU
                 benchmarks. These improvements result in an overall
                 runtime improvement of 26\%, an energy reduction of
                 18\%, and a reduction in energy-delay product of
                 32\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhang:2019:PPB,
  author =       "Feng Zhang and Jingling Xue",
  title =        "{Poker}: Permutation-Based {SIMD} Execution of
                 Intensive Tree Search by Path Encoding",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3280850",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We introduce Poker, a permutation-based approach for
                 vectorizing multiple queries over B$^+$-trees. Our key
                 insight is to combine vector loads and
                 path-encoding-based permutations to alleviate memory
                 latency while keeping the number of key comparisons
                 needed for a query to a minimum. Implemented as a C++
                 template library, Poker represents a general-purpose
                 solution for vectorizing the queries over indexing
                 trees on multi-core processors equipped with SIMD
                 units. For a set of five representative benchmarks
                 evaluated with 24 configurations each, Poker
                 outperforms the state of the art by 2.11x with one
                 single thread and 2.28x with eight threads on an Intel
                 Broadwell processor that supports 256-bit AVX2, on
                 average. In addition, strip-mining queries will further
                 improve Poker's performance by 1.21x (with one single
                 thread) and 1.31x (with eight threads), on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Belleville:2019:ASP,
  author =       "Nicolas Belleville and Damien Courouss{\'e} and Karine
                 Heydemann and Henri-Pierre Charles",
  title =        "Automated Software Protection for the Masses Against
                 Side-Channel Attacks",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "47:1--47:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3281662",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present an approach and a tool to answer the need
                 for effective, generic, and easily applicable
                 protections against side-channel attacks. The
                 protection mechanism is based on code polymorphism, so
                 that the observable behaviour of the protected
                 component is variable and unpredictable to the
                 attacker. Our approach combines lightweight specialized
                 runtime code generation with the optimization
                 capabilities of static compilation. It is extensively
                 configurable. Experimental results show that programs
                 secured by our approach present strong security levels
                 and meet the performance requirements of constrained
                 systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yu:2019:ITL,
  author =       "Chao Yu and Yuebin Bai and Qingxiao Sun and Hailong
                 Yang",
  title =        "Improving Thread-level Parallelism in {GPUs} Through
                 Expanding Register File to Scratchpad Memory",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "48:1--48:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3280849",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern Graphic Processing Units (GPUs) have become
                 pervasive computing devices in datacenters due to their
                 high performance with massive thread level parallelism
                 (TLP). GPUs are equipped with large register files (RF)
                 to support fast context switch between massive threads
                 and scratchpad memory (SPM) to support inter-thread
                 communication within the cooperative thread array
                 (CTA). However, the TLP of GPUs is usually limited by
                 the inefficient resource management of register file
                 and scratchpad memory. This inefficiency also leads to
                 register file and scratchpad memory underutilization.
                 To overcome the above inefficiency, we propose a new
                 resource management approach EXPARS for GPUs. EXPARS
                 provides a larger register file logically by expanding
                 the register file to scratchpad memory. When the
                 available register file becomes limited, our approach
                 leverages the underutilized scratchpad memory to
                 support additional register allocation. Therefore, more
                 CTAs can be dispatched to SMs, which improves the GPU
                 utilization. Our experiments on representative
                 benchmark suites show that the number of CTAs
                 dispatched to each SM increases by 1.28$ \times $ on
                 average. In addition, our approach improves the GPU
                 resource utilization significantly, with the register
                 file utilization improved by 11.64\% and the scratchpad
                 memory utilization improved by 48.20\% on average. With
                 better TLP, our approach achieves 20.01\% performance
                 improvement on average with negligible energy
                 overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Orosa:2019:AAF,
  author =       "Lois Orosa and Rodolfo Azevedo and Onur Mutlu",
  title =        "{AVPP}: Address-first Value-next Predictor with Value
                 Prefetching for Improving the Efficiency of Load Value
                 Prediction",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "49:1--49:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239567",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Value prediction improves instruction level
                 parallelism in superscalar processors by breaking true
                 data dependencies. Although this technique can
                 significantly improve overall performance, most of the
                 state-of-the-art value prediction approaches require
                 high hardware cost, which is the main obstacle for its
                 wide adoption in current processors. To tackle this
                 issue, we revisit load value prediction as an efficient
                 alternative to the classical approaches that predict
                 all instructions. By speculating only on loads, the
                 pressure over shared resources (e.g., the Physical
                 Register File) and the predictor size can be
                 substantially reduced (e.g., more than 90\% reduction
                 compared to recent works). We observe that existing
                 value predictors cannot achieve very high performance
                 when speculating only on load instructions. To solve
                 this problem, we propose a new, accurate and low-cost
                 mechanism for predicting the values of load
                 instructions: the Address-first Value-next Predictor
                 with Value Prefetching (AVPP). The key idea of our
                 predictor is to predict the load address first (which,
                 we find, is much more predictable than the value) and
                 to use a small non-speculative Value Table (VT)---indexed
                 by the predicted address---to predict the value next. To
                 increase the coverage of AVPP, we aim to increase the
                 hit rate of the VT by predicting also the load address
                 of a future instance of the same load instruction and
                 prefetching its value in the VT. We show that AVPP is
                 relatively easy to implement, requiring only 2.5\% of
                 the area of a 32KB L1 data cache. We compare our
                 mechanism with five state-of-the-art value prediction
                 techniques, evaluated within the context of load value
                 prediction, in a relatively narrow out-of-order
                 processor. On average, our AVPP predictor achieves
                 11.2\% speedup and 3.7\% of energy savings over the
                 baseline processor, outperforming all the
                 state-of-the-art predictors in 16 of the 23 benchmarks
                 we evaluate. We evaluate AVPP implemented together with
                 different prefetching techniques, showing additive
                 performance gains (20\% average speedup). In addition,
                 we propose a new taxonomy to classify different value
                 predictor policies regarding predictor update,
                 predictor availability, and in-flight pending updates.
                 We evaluate these policies in detail.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhang:2019:REU,
  author =       "Jun Zhang and Rui Hou and Wei Song and Sally A. McKee
                 and Zhen Jia and Chen Zheng and Mingyu Chen and Lixin
                 Zhang and Dan Meng",
  title =        "{RAGuard}: an Efficient and User-Transparent Hardware
                 Mechanism against {ROP} Attacks",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "50:1--50:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3280852",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Control-flow integrity (CFI) is a general method for
                 preventing code-reuse attacks, which utilize benign
                 code sequences to achieve arbitrary code execution. CFI
                 ensures that the execution of a program follows the
                 edges of its predefined static Control-Flow Graph: any
                 deviation that constitutes a CFI violation terminates
                 the application. Despite decades of research effort,
                 there are still several implementation challenges in
                 efficiently protecting the control flow of function
                 returns (Return-Oriented Programming attacks). The set
                 of valid return addresses of frequently called
                 functions can be large and thus an attacker could bend
                 the backward-edge CFI by modifying an indirect branch
                 target to another within the valid return set. This
                 article proposes RAGuard, an efficient and
                 user-transparent hardware-based approach to prevent
                 Return-Oriented Programming attacks. RAGuard binds a
                 message authentication code (MAC) to each return
                 address to protect its integrity. To guarantee the
                 security of the MAC and reduce runtime overhead:
                 RAGuard (1) computes the MAC by encrypting the
                 signature of a return address with AES-128, (2)
                 develops a key management module based on a Physical
                 Unclonable Function (PUF) and a True Random Number
                 Generator (TRNG), and (3) uses a dedicated register to
                 reduce MACs' load and store operations of leaf
                 functions. We have evaluated our mechanism based on the
                 open-source LEON3 processor and the results show that
                 RAGuard incurs acceptable performance overhead and
                 occupies reasonable area.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2019:GGC,
  author =       "Ping Wang and Luke McHale and Paul V. Gratz and Alex
                 Sprintson",
  title =        "{GenMatcher}: a Generic Clustering-Based Arbitrary
                 Matching Framework",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "51:1--51:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3281663",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Packet classification methods rely upon packet
                 content/header matching against rules. Thus, throughput
                 of matching operations is critical in many networking
                 applications. Further, with the advent of Software
                 Defined Networking (SDN), efficient implementation of
                 software approaches to matching are critical for the
                 overall system performance. This article presents
                 GenMatcher, a generic, software-only, arbitrary
                 matching framework for fast, efficient searches. The
                 key idea of our approach is to represent arbitrary
                 rules with efficient prefix-based tries. To support
                 arbitrary wildcards, we rearrange bits within the rules
                 such that wildcards accumulate to one side of the
                 bitstring. Since many non-contiguous wildcards often
                 remain, we use multiple prefix-based tries. The main
                 challenge in this context is to generate efficient trie
                 groupings and expansions to support all arbitrary
                 rules. Finding an optimal mix of grouping and expansion
                 is an NP-complete problem. Our contribution includes a
                 novel, clustering-based grouping algorithm to group
                 rules based upon their bit-level similarities. Our
                 algorithm generates near-optimal trie groupings with
                 low configuration times and provides significantly
                 higher match throughput compared to prior techniques.
                 Experiments with synthetic traffic show that our method
                 can achieve a 58.9X speedup compared to the baseline on
                 a single core processor under a given memory
                 constraint.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hong:2019:PTG,
  author =       "Ding-Yong Hong and Jan-Jan Wu and Yu-Ping Liu and
                 Sheng-Yu Fu and Wei-Chung Hsu",
  title =        "Processor-Tracing Guided Region Formation in Dynamic
                 Binary Translation",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3281664",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Region formation is an important step in dynamic
                 binary translation to select hot code regions for
                 translation and optimization. The quality of the formed
                 regions determines the extent of optimizations and thus
                 determines the final execution performance. Moreover,
                 the overall performance is very sensitive to the
                 formation overhead, because region formation can have a
                 non-trivial cost. For addressing the dual issues of
                 region quality and region formation overhead, this
                 article presents a lightweight region formation method
                 guided by processor tracing, e.g., Intel PT. We
                 leverage the branch history information stored in the
                 processor to reconstruct the program execution profile
                 and effectively form high-quality regions with low
                 cost. Furthermore, we present the designs of
                 lightweight hardware performance monitoring sampling
                 and the branch instruction decode cache to minimize
                 region formation overhead. Using ARM64 to x86-64
                 translations, the experiment results show that our
                 method achieves a performance speedup of up to 1.53$
                 \times $ (1.16$ \times $ on average) for SPEC CPU2006
                 benchmarks with reference inputs, compared to the
                 well-known software-based trace formation method, Next
                 Executing Tail (NET). The performance results of x86-64
                 to ARM64 translations also show a speedup of up to
                 1.25$ \times $ over NET for CINT2006 benchmarks with
                 reference inputs. The comparison with a relaxed NETPlus
                 region formation method further demonstrates that our
                 method achieves the best performance and lowest
                 compilation overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2019:PNW,
  author =       "Yu Wang and Victor Lee and Gu-Yeon Wei and David
                 Brooks",
  title =        "Predicting New Workload or {CPU} Performance by
                 Analyzing Public Datasets",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3284127",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The marketplace for general-purpose microprocessors
                 offers hundreds of functionally similar models,
                 differing by traits like frequency, core count, cache
                 size, memory bandwidth, and power consumption. Their
                 performance depends not only on microarchitecture, but
                 also on the nature of the workloads being executed.
                 Given a set of intended workloads, the consumer needs
                 both performance and price information to make rational
                 buying decisions. Many benchmark suites have been
                 developed to measure processor performance, and their
                 results for large collections of CPUs are often
                 publicly available. However, repositories of benchmark
                 results are not always helpful when consumers need
                 performance data for new processors or new workloads.
                 Moreover, the aggregate scores for benchmark suites
                 designed to cover a broad spectrum of workload types
                 can be misleading. To address these problems, we have
                 developed a deep neural network (DNN) model, and we
                 have used it to learn the relationship between the
                 specifications of Intel CPUs and their performance on
                 the SPEC CPU2006 and Geekbench 3 benchmark suites. We
                 show that we can generate useful predictions for new
                 processors and new workloads. We also cross-predict the
                 two benchmark suites and compare their performance
                 scores. The results quantify the self-similarity of
                 these suites for the first time in the literature. This
                 work should discourage consumers from basing purchasing
                 decisions exclusively on Geekbench 3, and it should
                 encourage academics to evaluate research using more
                 diverse workloads than the SPEC CPU suites alone.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Park:2019:ROC,
  author =       "Hyukwoo Park and Sungkook Kim and Jung-Geun Park and
                 Soo-Mook Moon",
  title =        "Reusing the Optimized Code for {JavaScript}
                 Ahead-of-Time Compilation",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291056",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As web pages and web apps increasingly include heavy
                 JavaScript code, JavaScript performance has been a
                 critical issue. Modern JavaScript engines achieve a
                 remarkable performance by employing tiered-execution
                 architecture based on interpreter, baseline
                 just-in-time compiler (JITC), and optimizing JITC.
                 Unfortunately, they suffer from a substantial
                 compilation overhead, which can take more than 50\% of
                 the whole running time. A simple idea to reduce the
                 compilation overhead is ahead-of-time compilation
                 (AOTC), which reuses the code generated in the previous
                 run. In fact, existing studies that reuse the bytecode
                 generated by the interpreter or the machine code
                 generated by the baseline JITC have shown tangible
                 performance benefits [12, 31, 41]. However, there has
                 been no study to reuse the machine code generated by
                 the optimizing JITC, which heavily uses profile-based
                 optimizations, thus not easily reusable. We propose a
                 novel AOTC that can reuse the optimized machine code
                 for high-performance JavaScript engines. Unlike
                 previous AOTCs, we need to resolve a few challenging
                 issues related to reusing profile-based optimized code
                 and relocating dynamic addresses. Our AOTC improves the
                 performance of a commercial JavaScript engine by 6.36
                 times (max) and 1.99 times (average) for Octane
                 benchmarks, by reducing the compilation overhead and by
                 running the optimized code from the first invocation of
                 functions. It also improves the loading time of six web
                 apps by 1.28 times, on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhao:2019:BLA,
  author =       "Han Zhao and Quan Chen and Yuxian Qiu and Ming Wu and
                 Yao Shen and Jingwen Leng and Chao Li and Minyi Guo",
  title =        "Bandwidth and Locality Aware Task-stealing for
                 Manycore Architectures with Bandwidth-Asymmetric
                 Memory",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291058",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Parallel computers now start to adopt
                 Bandwidth-Asymmetric Memory architecture that consists
                 of traditional DRAM memory and new High Bandwidth
                 Memory (HBM) for high memory bandwidth. However,
                 existing task schedulers suffer from low bandwidth
                 usage and poor data locality problems in
                 bandwidth-asymmetric memory architectures. To solve the
                 two problems, we propose a Bandwidth and Locality Aware
                 Task-stealing (BATS) system, which consists of an
                 HBM-aware data allocator, a bandwidth-aware traffic
                 balancer, and a hierarchical task-stealing scheduler.
                 Leveraging compile-time code transformation and
                 run-time data distribution, the data allocator enables
                 HBM usage automatically without user interference.
                 According to data access hotness, the traffic balancer
                 migrates data to balance memory traffic across memory
                 nodes proportional to their bandwidth. The hierarchical
                 scheduler improves data locality at runtime without a
                 priori program knowledge. Experiments on an Intel
                 Knights Landing server that adopts bandwidth-asymmetric
                 memory show that BATS reduces the execution time of
                 memory-bound programs up to 83.5\% compared with
                 traditional task-stealing schedulers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ganser:2019:SIP,
  author =       "Stefan Ganser and Armin Gr{\"o}{\ss}linger and Norbert
                 Siegmund and Sven Apel and Christian Lengauer",
  title =        "Speeding up Iterative Polyhedral Schedule Optimization
                 with Surrogate Performance Models",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "56:1--56:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291773",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iterative program optimization is known to be able to
                 adapt more easily to particular programs and target
                 hardware than model-based approaches. An approach is to
                 generate random program transformations and evaluate
                 their profitability by applying them and benchmarking
                 the transformed program on the target hardware. This
                 procedure's large computational effort impairs its
                 practicality tremendously, though. To address this
                 limitation, we pursue the guidance of a genetic
                 algorithm for program optimization via feedback from
                 surrogate performance models. We train the models on
                 program transformations that were evaluated during
                 previous iterative optimizations. Our representation of
                 programs and program transformations refers to the
                 polyhedron model. The representation is particularly
                 meaningful for an optimization of loop programs that
                 profit from a coarse-grained parallelization for
                 execution on modern multicore-CPUs. Our evaluation
                 reveals that surrogate performance models can be used
                 to speed up the optimization of loop programs. We
                 demonstrate that we can reduce the benchmarking effort
                 required for an iterative optimization and degrade the
                 resulting speedups by an average of 15\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wu:2019:DPC,
  author =       "Song Wu and Fang Zhou and Xiang Gao and Hai Jin and
                 Jinglei Ren",
  title =        "Dual-Page Checkpointing: an Architectural Approach to
                 Efficient Data Persistence for In-Memory Applications",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "57:1--57:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291057",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Data persistence is necessary for many in-memory
                 applications. However, the disk-based data persistence
                 largely slows down in-memory applications. Emerging
                 non-volatile memory (NVM) offers an opportunity to
                 achieve in-memory data persistence at the DRAM-level
                 performance. Nevertheless, NVM typically requires a
                 software library to operate NVM data, which brings
                 significant overhead. This article demonstrates that a
                 hardware-based high-frequency checkpointing mechanism
                 can be used to achieve efficient in-memory data
                 persistence on NVM. To maintain checkpoint consistency,
                 traditional logging and copy-on-write techniques incur
                 excessive NVM writes that impair both performance and
                 endurance of NVM; recent work attempts to solve the
                 issue but requires a large amount of metadata in the
                 memory controller. Hence, we design a new dual-page
                 checkpointing system, which achieves low metadata cost
                 and eliminates most excessive NVM writes at the same
                 time. It breaks the traditional trade-off between
                 metadata space cost and extra data writes. Our solution
                 outperforms the state-of-the-art NVM software libraries
                 by 13.6$ \times $ in throughput, and leads to 34\% less
                 NVM wear-out and 1.28$ \times $ higher throughput than
                 state-of-the-art hardware checkpointing solutions,
                 according to our evaluation with OLTP, graph computing,
                 and machine-learning workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kiani:2019:ECP,
  author =       "Mohsen Kiani and Amir Rajabzadeh",
  title =        "Efficient Cache Performance Modeling in {GPUs} Using
                 Reuse Distance Analysis",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "58:1--58:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291051",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Reuse distance analysis (RDA) is a popular method for
                 calculating locality profiles and modeling cache
                 performance. The present article proposes a framework
                 to apply the RDA algorithm to obtain reuse distance
                 profiles in graphics processing unit (GPU) kernels. To
                 study the implications of hardware-related parameters
                 in RDA, two RDA algorithms were employed, including a
                 high-level cache-independent RDA algorithm, called
                 HLRDA, and a detailed RDA algorithm, called DRDA. DRDA
                 models the effects of reservation fails in cache blocks
                 and miss status holding registers to provide accurate
                 cache-related performance metrics. In this case, the
                 reuse profiles are cache-specific. In a selection of
                 GPU kernels, DRDA obtained the L1 miss-rate breakdowns
                 with an average error of 3.86\% and outperformed the
                 state-of-the-art RDA in terms of accuracy. In terms of
                 performance, DRDA is 246,000$ \times $ slower than the
                 real GPU executions and 11$ \times $ faster than
                 GPGPU-Sim. HLRDA ignores the cache-related parameters
                 and its obtained reuse profiles are general, which can
                 be used to calculate miss rates in all cache sizes.
                 Moreover, the average error incurred by HLRDA was
                 16.9\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Debrunner:2019:AAK,
  author =       "Thomas Debrunner and Sajad Saeedi and Paul H. J.
                 Kelly",
  title =        "{AUKE}: Automatic Kernel Code Generation for an
                 Analogue {SIMD} Focal-Plane Sensor-Processor Array",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "59:1--59:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291055",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Focal-plane Sensor-Processor Arrays (FPSPs) are new
                 imaging devices with parallel Single Instruction
                 Multiple Data (SIMD) computational capabilities built
                 into every pixel. Compared to traditional imaging
                 devices, FPSPs allow for massive pixel-parallel
                 execution of image processing algorithms. This enables
                 the application of certain algorithms at extreme frame
                 rates ($>$10,000 frames per second). By performing
                 some early-stage processing in-situ, systems
                 incorporating FPSPs can consume less power compared to
                 conventional approaches using standard digital cameras.
                 In this article, we explore code generation for an FPSP
                 whose 256 $ \times $ 256 processors operate on analogue
                 signal data, leading to further opportunities for power
                 reduction---and additional code synthesis challenges.
                 While rudimentary image processing algorithms have been
                 demonstrated on FPSPs before, progress with
                 higher-level computer vision algorithms has been sparse
                 due to the unique architecture and limits of the
                 devices. This article presents a code generator for
                 convolution filters for the SCAMP-5 FPSP, with
                 applications in many high-level tasks such as
                 convolutional neural networks, pose estimation, and so
                 on. The SCAMP-5 FPSP has no effective multiply
                 operator. Convolutions have to be implemented through
                 sequences of more primitive operations such as
                 additions, subtractions, and multiplications/divisions
                 by two. We present a code generation algorithm to
                 optimise convolutions by identifying common factors in
                 the different weights and by determining an optimised
                 pattern of pixel-to-pixel data movements to exploit
                 them. We present evaluation in terms of both speed and
                 energy consumption for a suite of well-known
                 convolution filters. Furthermore, an application of the
                 method is shown by the implementation of a Viola-Jones
                 face detection algorithm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhou:2019:SNS,
  author =       "You Zhou and Fei Wu and Zhonghai Lu and Xubin He and
                 Ping Huang and Changsheng Xie",
  title =        "{SCORE}: a Novel Scheme to Efficiently Cache Overlong
                 {ECCs} in {NAND} Flash Memory",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "60:1--60:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291052",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Technology scaling and program/erase cycling result in
                 an increasing bit error rate in NAND flash storage.
                 Some solid state drives (SSDs) adopt overlong error
                 correction codes (ECCs), whose redundancy size exceeds
                 the spare area limit of flash pages, to protect user
                 data for improved reliability and lifetime. However,
                 the read performance is significantly degraded, because
                 a logical data page and its ECC redundancy are stored
                 in two flash pages. In this article, we find that
                 caching ECCs has a large potential to reduce flash
                 reads by achieving higher hit rates, compared to
                 caching data. Then, we propose a novel
                 \underline{s}cheme to efficiently \underline{c}ache
                 \underline{o}ve\underline{r}long \underline{E}CCs,
                 called SCORE, to
                 improve the SSD performance. Exceeding ECC redundancy
                 (called ECC residues) of logically consecutive data
                 pages are grouped into ECC pages. SCORE partitions RAM
                 to cache both data pages and ECC pages in a
                 workload-adaptive manner. Finally, we verify SCORE
                 using extensive trace-driven simulations. The results
                 show that SCORE obtains high ECC hit rates without
                 sacrificing data hit rates, thus improving the read
                 performance by an average of 22\% under various
                 workloads, compared to the state-of-the-art schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Andujar:2019:PPA,
  author =       "Francisco J. And{\'u}jar and Salvador Coll and Marina
                 Alonso and Pedro L{\'o}pez and Juan-Miguel
                 Mart{\'\i}nez",
  title =        "{POWAR}: Power-Aware Routing in {HPC} Networks with
                 On\slash Off Links",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "61:1--61:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3293445",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In order to save energy in HPC interconnection
                 networks, one usual proposal is to switch idle links
                 into a low-power mode after a certain time without any
                 transmission, as IEEE Energy Efficient Ethernet
                 standard proposes. Extending the low-power mode
                 mechanism, we propose POWer-Aware Routing (POWAR), a
                 simple power-aware routing and selection function for
                 fat-tree and torus networks. POWAR adapts the amount of
                 network links that can be used, taking into account the
                 network load, and obtaining great energy savings in the
                 network (55\%--65\%) and the entire system (9\%--10\%)
                 with negligible performance overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mammadli:2019:AGD,
  author =       "Rahim Mammadli and Felix Wolf and Ali Jannesari",
  title =        "The Art of Getting Deep Neural Networks in Shape",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "62:1--62:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291053",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Training a deep neural network (DNN) involves
                 selecting a set of hyperparameters that define the
                 network topology and influence the accuracy of the
                 resulting network. Often, the goal is to maximize
                 prediction accuracy on a given dataset. However,
                 non-functional requirements of the trained network ---
                 such as inference speed, size, and energy consumption
                 --- can be very important as well. In this article, we
                 aim to automate the process of selecting an appropriate
                 DNN topology that fulfills both functional and
                 non-functional requirements of the application.
                 Specifically, we focus on tuning two important
                 hyperparameters, depth and width, which together define
                 the shape of the resulting network and directly affect
                 its accuracy, speed, size, and energy consumption. To
                 reduce the time needed to search the design space, we
                 train a fraction of DNNs and build a model to predict
                 the performances of the remaining ones. We are able to
                 produce tuned ResNets, which are up to 4.22 times
                 faster than original depth-scaled ResNets on a batch of
                 128 images while matching their accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tzilis:2019:EER,
  author =       "Stavros Tzilis and Pedro Trancoso and Ioannis
                 Sourdis",
  title =        "Energy-Efficient Runtime Management of Heterogeneous
                 Multicores using Online Projection",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "63:1--63:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3293446",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Heterogeneous multicores offer flexibility in the form
                 of different core types and Dynamic Voltage and
                 Frequency Scaling (DVFS), defining a vast configuration
                 space. The optimal configuration choice is not always
                 straightforward, even for single applications, and
                 becomes a very difficult problem for dynamically
                 changing scenarios of concurrent applications with
                 unpredictable spawn and termination times and
                 individual performance requirements. This article
                 proposes an integrated approach for runtime decision
                 making for energy efficiency on such systems. The
                 approach consists of a model that predicts performance
                 and power for any possible decision and low-complexity
                 heuristics that use this model to evaluate a subset of
                 possible decisions to choose the best. The model
                 predicts performance by projecting standalone
                 application profiling data to the current status of the
                 system and power by using a set of platform-specific
                 parameters that are determined only once for a given
                 system and are independent of the application mix. Our
                 approach is evaluated with a plethora of dynamic,
                 multi-application scenarios. When considering best
                 effort performance to be adequate, our runtime achieves
                 on average 3\% higher energy efficiency compared to the
                 powersave governor and 2$ \times $ better compared to
                 the other Linux governors. Moreover, when also
                 considering individual applications' performance
                 requirements, our runtime is able to satisfy them,
                 giving away 18\% of the system's energy efficiency
                 compared to the powersave, which, however, misses the
                 performance targets by 23\%; at the same time, our
                 runtime maintains an efficiency advantage of about 55\%
                 compared to the other governors, which also satisfy the
                 performance constraints.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2019:SLS,
  author =       "Matthew Kay Fei Lee and Yingnan Cui and Thannirmalai
                 Somu and Tao Luo and Jun Zhou and Wai Teng Tang and
                 Weng-Fai Wong and Rick Siow Mong Goh",
  title =        "A System-Level Simulator for {RRAM}-Based Neuromorphic
                 Computing Chips",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "64:1--64:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291054",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Advances in non-volatile resistive switching random
                 access memory (RRAM) have made it a promising memory
                 technology with potential applications in low-power and
                 embedded in-memory computing devices owing to a number
                 of advantages such as low-energy consumption, low area
                 cost and good scaling. There have been proposals to
                 employ RRAM in architecting chips for neuromorphic
                 computing and artificial neural networks where
                 matrix-vector multiplication can be computed in the
                 analog domain in a single timestep. However, it is
                 challenging to employ RRAM devices in neuromorphic
                 chips owing to the non-ideal behavior of RRAM. In this
                 article, we propose a cycle-accurate and scalable
                 system-level simulator that can be used to study the
                 effects of using RRAM devices in neuromorphic computing
                 chips. The simulator models a spatial neuromorphic chip
                 architecture containing many neural cores with RRAM
                 crossbars connected via a Network-on-Chip (NoC). We
                 focus on system-level simulation and demonstrate the
                 effectiveness of our simulator in understanding how
                 non-linear RRAM effects such as stuck-at-faults (SAFs),
                 write variability, and random telegraph noise (RTN) can
                 impact an application's behavior. By using our
                 simulator, we show that RTN and write variability can
                 have adverse effects on an application. Nevertheless,
                 we show that these effects can be mitigated through
                 proper design choices and the implementation of a
                 write-verify scheme.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Vasilakis:2019:DFC,
  author =       "Evangelos Vasilakis and Vassilis Papaefstathiou and
                 Pedro Trancoso and Ioannis Sourdis",
  title =        "Decoupled Fused Cache: Fusing a Decoupled {LLC} with a
                 {DRAM} Cache",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "65:1--65:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3293447",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "DRAM caches have shown excellent potential in
                 capturing the spatial and temporal data locality of
                 applications capitalizing on advances of 3D-stacking
                 technology; however, they are still far from their
                 ideal performance. Besides the unavoidable DRAM access
                 to fetch the requested data, tag access is in the
                 critical path, adding significant latency and energy
                 costs. Existing approaches are not able to remove these
                 overheads and in some cases limit DRAM cache design
                 options. For instance, caching DRAM cache tags adds
                 constant latency to every access; accessing the DRAM
                 cache using the TLB calls for OS support and DRAM
                 cachelines as large as a page; reusing the last-level
                 cache (LLC) tags to access the DRAM cache limits LLC
                 performance as it requires indexing the LLC using
                 higher-order address bits. In this article, we
                 introduce Decoupled Fused Cache, a DRAM cache design
                 that alleviates the cost of tag accesses by fusing DRAM
                 cache tags with the tags of the on-chip LLC without
                 affecting LLC performance. In essence, the Decoupled
                 Fused Cache relies in most cases on the LLC tag access
                 to retrieve the required information for accessing the
                 DRAM cache while avoiding additional overheads.
                 Compared to current DRAM cache designs of the same
                 cacheline size, Decoupled Fused Cache improves system
                 performance by 6\% on average and by 16\% to 18\% for
                 large cacheline sizes. Finally, Decoupled Fused Cache
                 reduces DRAM cache traffic by 18\% and DRAM cache
                 energy consumption by 7\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "65",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Pirkelbauer:2019:BTF,
  author =       "Peter Pirkelbauer and Amalee Wilson and Christina
                 Peterson and Damian Dechev",
  title =        "{Blaze-Tasks}: a Framework for Computing Parallel
                 Reductions over Tasks",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "66:1--66:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3293448",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compared to threads, tasks are a more fine-grained
                 alternative. The task parallel programming model offers
                 benefits in terms of better performance portability and
                 better load-balancing for problems that exhibit
                 nonuniform workloads. A common scenario of task
                 parallel programming is that a task is recursively
                 decomposed into smaller sub-tasks. Depending on the
                 problem domain, the number of created sub-tasks may be
                 nonuniform, thereby creating potential for significant
                 load imbalances in the system. Dynamic load-balancing
                 mechanisms will distribute the tasks across available
                 threads. The final result of a computation may be
                 modeled as a reduction over the results of all
                 sub-tasks. This article describes a simple, yet
                 effective prototype framework, Blaze-Tasks, for task
                 scheduling and task reductions on shared memory
                 architectures. The framework has been designed with
                 lock-free techniques and generic programming principles
                 in mind. Blaze-Tasks is implemented entirely in C++17
                 and is thus portable. To load-balance the computation,
                 Blaze-Tasks uses task stealing. To manage contention on
                 a task pool, the number of lock-free attempts to steal
                 a task depends on the distance between thief and pool
                 owner and the estimated number of tasks in a victim's
                 pool. This article evaluates the Blaze framework on
                 Intel and IBM dual-socket systems using nine benchmarks
                 and compares its performance with other task parallel
                 frameworks. While Cilk outperforms Blaze on Intel on
                 most benchmarks, the evaluation shows that Blaze is
                 competitive with OpenMP and other library-based
                 implementations. On IBM, the experiments show that
                 Blaze outperforms other approaches on most
                 benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "66",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sato:2019:AFS,
  author = {Yukinori Sato and Tomoya Yuki and Toshio Endo},
  title = {An Autotuning Framework for Scalable Execution of
    Tiled Code via Iterative Polyhedral Compilation},
  journal = j-TACO,
  volume = {15},
  number = {4},
  pages = {67:1--67:??},
  month = jan,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3293449},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Tue Jan 8 17:20:00 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {On modern many-core CPUs, performance tuning against
    complex memory subsystems and scalability for
    parallelism is mandatory to achieve their potential. In
    this article, we focus on loop tiling, which plays an
    important role in performance tuning, and develop a
    novel framework that analytically models the load
    balance and empirically autotunes unpredictable cache
    behaviors through iterative polyhedral compilation
    using LLVM/Polly. From an evaluation on many-core CPUs,
    we demonstrate that our autotuner achieves a
    performance superior to those that use conventional
    static approaches and well-known autotuning heuristics.
    Moreover, our autotuner achieves almost the same
    performance as a brute-force search-based approach.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {67},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Shekofteh:2019:MSG,
  author = {S.-Kazem Shekofteh and Hamid Noori and Mahmoud
    Naghibzadeh and Hadi Sadoghi Yazdi and Holger
    Fr{\"o}ning},
  title = {Metric Selection for {GPU} Kernel Classification},
  journal = j-TACO,
  volume = {15},
  number = {4},
  pages = {68:1--68:??},
  month = jan,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3295690},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Tue Jan 8 17:20:00 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/pvm.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Graphics Processing Units (GPUs) are vastly used for
    running massively parallel programs. GPU kernels
    exhibit different behavior at runtime and can usually
    be classified in a simple form as either
    ``compute-bound'' or ``memory-bound.'' Recent GPUs are
    capable of concurrently running multiple kernels, which
    raises the question of how to most appropriately
    schedule kernels to achieve higher performance. In
    particular, co-scheduling of compute-bound and
    memory-bound kernels seems promising. However, its
    benefits as well as drawbacks must be determined along
    with which kernels should be selected for a concurrent
    execution. Classifying kernels can be performed online
    by instrumentation based on performance counters. This
    work conducts a thorough analysis of the metrics
    collected from various benchmarks from Rodinia and CUDA
    SDK. The goal is to find the minimum number of
    effective metrics that enables online classification of
    kernels with a low overhead. This study employs a
    wrapper-based feature selection method based on the
    Fisher feature selection criterion. The results of
    experiments show that to classify kernels with a high
    accuracy, only three and five metrics are sufficient on
    a Kepler and a Pascal GPU, respectively. The proposed
    method is then utilized for a runtime scheduler. The
    results show an average speedup of 1.18$ \times $ and
    1.1$ \times $ compared with a serial and a random
    scheduler, respectively.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {68},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Bilas:2019:LDR,
  author = {Angelos Bilas},
  title = {List of 2018 Distinguished Reviewers {ACM TACO}},
  journal = j-TACO,
  volume = {15},
  number = {4},
  pages = {69:1--69:??},
  month = jan,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3293444},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Tue Jan 8 17:20:00 MST 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {69},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Shobaki:2019:EAC,
  author =       "Ghassan Shobaki and Austin Kerbow and Christopher
                 Pulido and William Dobson",
  title =        "Exploring an Alternative Cost Function for
                 Combinatorial Register-Pressure-Aware Instruction
                 Scheduling",
  journal =      j-TACO,
  volume =       "16",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301489",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 11 19:00:20 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multiple combinatorial algorithms have been proposed
                 for doing pre-allocation instruction scheduling with
                 the objective of minimizing register pressure or
                 balancing register pressure and instruction-level
                 parallelism. The cost function that is minimized in
                 most of these algorithms is the peak register pressure
                 (or the peak excess register pressure). In this work,
                 we explore an alternative register-pressure cost
                 function, which is the Sum of Live Interval Lengths
                 (SLIL). Unlike the peak cost function, which captures
                 register pressure only at the highest pressure point in
                 the schedule, the proposed SLIL cost function captures
                 register pressure at all points in the schedule.
                 Minimizing register pressure at all points is desirable
                 in larger scheduling regions with multiple
                 high-pressure points. This article describes a
                 Branch-and-Bound (B\&B) algorithm for minimizing the
                 SLIL cost function. The algorithm is based on two
                 SLIL-specific dynamic lower bounds as well as the
                 history utilization technique proposed in our previous
                 work. The proposed algorithm is implemented into the
                 LLVM Compiler and evaluated experimentally relative to
                 our previously proposed B\&B algorithm for minimizing
                 the peak excess register pressure. The experimental
                 results show that the proposed algorithm for minimizing
                 the SLIL cost function produces substantially less
                 spilling than the previous algorithm that minimizes the
                 peak cost function. Execution-time results on various
                 processors show that the proposed B\&B algorithm
                 significantly improves the performance of many CPU2006
                 benchmarks by up to 49\% relative to LLVM's default
                 scheduler. The geometric-mean improvement for FP2006 on
                 Intel Core i7 is 4.22\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Liu:2019:ESA,
  author = {Yu-Ping Liu and Ding-Yong Hong and Jan-Jan Wu and
    Sheng-Yu Fu and Wei-Chung Hsu},
  title = {Exploiting {SIMD} Asymmetry in {ARM}-to-x86 Dynamic
    Binary Translation},
  journal = j-TACO,
  volume = {16},
  number = {1},
  pages = {2:1--2:??},
  month = mar,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3301488},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Mar 11 19:00:20 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Single instruction multiple data (SIMD) has been
    adopted for decades because of its superior performance
    and power efficiency. The SIMD capability (i.e., width,
    number of registers, and advanced instructions) has
    diverged rapidly on different SIMD instruction-set
    architectures (ISAs). Therefore, migrating existing
    applications to another host ISA that has fewer but
    longer SIMD registers and more advanced instructions
    raises the issues of asymmetric SIMD capability. To
    date, this issue has been overlooked and the host SIMD
    capability is underutilized, resulting in suboptimal
    performance. In this article, we present a novel binary
    translation technique called spill-aware superword
    level parallelism (saSLP), which combines short ARMv8
    instructions and registers in the guest binaries to
    exploit the x86 AVX2 host's parallelism, register
    capacity, and gather instructions. Our experiment
    results show that saSLP improves the performance by
    1.6$ \times $ (2.3$ \times $) across a number of
    benchmarks and reduces spilling by 97\% (99\%) for
    ARMv8 to x86 AVX2 (AVX-512) translation. Furthermore,
    with AVX2 (AVX-512) gather instructions, saSLP speeds
    up several data-irregular applications that cannot be
    vectorized on ARMv8 NEON by up to 3.9$ \times $ (4.2$
    \times $).},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {2},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sadrosadati:2019:IIT,
  author =       "Mohammad Sadrosadati and Seyed Borna Ehsani and Hajar
                 Falahati and Rachata Ausavarungnirun and Arash Tavakkol
                 and Mojtaba Abaee and Lois Orosa and Yaohua Wang and
                 Hamid Sarbazi-Azad and Onur Mutlu",
  title =        "{ITAP}: Idle-Time-Aware Power Management for {GPU}
                 Execution Units",
  journal =      j-TACO,
  volume =       "16",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291606",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 11 19:00:20 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Graphics Processing Units (GPUs) are widely used as
                 the accelerator of choice for applications with
                 massively data-parallel tasks. However, recent studies
                 show that GPUs suffer heavily from resource
                 underutilization, which, combined with their large
                 static power consumption, imposes a significant power
                 overhead. One of the most power-hungry components of a
                 GPU---the execution units---frequently experience
                 idleness when (1) an underutilized warp is issued to
                 the execution units, leading to partial lane idleness,
                 and (2) there is no active warp to be issued for the
                 execution due to warp stalls (e.g., waiting for memory
                 access and synchronization). Although large in total,
                 the idle time of execution units actually comes from
                 short but frequent stalls, leaving little potential for
                 common power saving techniques, such as power-gating.
                 In this article, we propose ITAP, a novel
                 idle-time-aware power management technique, which aims
                 to effectively reduce the static energy consumption of
                 GPU execution units. By taking advantage of different
                 power management techniques (i.e., power-gating and
                 different levels of voltage scaling), ITAP employs
                 three static power reduction modes with different
                 overheads and capabilities of static power reduction.
                 ITAP estimates the idle period length of execution
                 units using prediction and peek-ahead techniques in a
                 synergistic way and then applies the most appropriate
                 static power reduction mode based on the estimated idle
                 period length. We design ITAP to be power-aggressive or
                 performance-aggressive, not both at the same time. Our
                 experimental results on several workloads show that the
                 power-aggressive design of ITAP outperforms the
                 state-of-the-art solution by an average of 27.6\% in
                 terms of static energy savings, with less than 2.1\%
                 performance overhead. However, the
                 performance-aggressive design of ITAP improves the
                 static energy savings by an average of 16.9\%, while
                 keeping the GPU performance almost unaffected (i.e., up
                 to 0.4\% performance overhead) compared to the
                 state-of-the-art static energy savings mechanism.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dogan:2019:ASU,
  author = {Halit Dogan and Masab Ahmad and Brian Kahne and Omer
    Khan},
  title = {Accelerating Synchronization Using Moving Compute to
    Data Model at 1,000-core Multicore Scale},
  journal = j-TACO,
  volume = {16},
  number = {1},
  pages = {4:1--4:??},
  month = mar,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3300208},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Mar 11 19:00:20 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Thread synchronization using shared memory hardware
    cache coherence paradigm is prevalent in multicore
    processors. However, as the number of cores increase on
    a chip, cache line ping-pong prevents performance
    scaling for algorithms that deploy fine-grain
    synchronization. This article proposes an in-hardware
    moving computation to data model (MC) that pins shared
    data at dedicated cores. The critical code sections are
    serialized and executed at these cores in a spatial
    setting to enable data locality optimizations.
    In-hardware messages enable non-blocking and blocking
    communication between cores, without involving the
    cache coherence protocol. The in-hardware MC model is
    implemented on Tilera Tile-Gx72 multicore platform to
    evaluate 8- to 64-core count scale. A simulated RISC-V
    multicore environment is built to further evaluate the
    performance scaling advantages of the MC model at
    1,024-cores scale. The evaluation using graph and
    machine-learning benchmarks illustrates that atomic
    instructions based synchronization scales up to 512
    cores, and the MC model at the same core count
    outperforms by 27\% in completion time and 39\% in
    dynamic energy consumption.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {4},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Azriel:2019:MSP,
  author = {Leonid Azriel and Lukas Humbel and Reto Achermann and
    Alex Richardson and Moritz Hoffmann and Avi Mendelson
    and Timothy Roscoe and Robert N. M. Watson and Paolo
    Faraboschi and Dejan Milojicic},
  title = {Memory-Side Protection With a Capability Enforcement
    Co-Processor},
  journal = j-TACO,
  volume = {16},
  number = {1},
  pages = {5:1--5:??},
  month = mar,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3302257},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Mon Mar 11 19:00:20 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Byte-addressable nonvolatile memory (NVM) blends the
    concepts of storage and memory and can radically
    improve data-centric applications, from in-memory
    databases to graph processing. By enabling
    large-capacity devices to be shared across multiple
    computing elements, fabric-attached NVM changes the
    nature of rack-scale systems and enables short-latency
    direct memory access while retaining data persistence
    properties and simplifying the software stack. An
    adequate protection scheme is paramount when addressing
    shared and persistent memory, but mechanisms that rely
    on virtual memory paging suffer from the tension
    between performance (pushing toward large pages) and
    protection granularity (pushing toward small pages). To
    address this tension, capabilities are worth revisiting
    as a more powerful protection mechanism, but the long
    time needed to introduce new CPU features hampers the
    adoption of schemes that rely on instruction-set
    architecture support. This article proposes the
    Capability Enforcement Co-Processor (CEP), a
    programmable memory controller that implements
    fine-grain protection through the capability model
    without requiring instruction-set support in the
    application CPU. CEP decouples capabilities from the
    application CPU instruction-set architecture, shortens
    time to adoption, and can rapidly evolve to embrace new
    persistent memory technologies, from NVDIMMs to native
    NVM devices, either locally connected or fabric
    attached in rack-scale configurations. CEP exposes an
    application interface based on memory handles that get
    internally converted to extended-pointer capabilities.
    This article presents a proof of concept implementation
    of a distributed object store (Redis) with CEP. It also
    demonstrates a capability-enhanced file system (FUSE)
    implementation using CEP. Our proof of concept shows
    that CEP provides fine-grain protection while enabling
    direct memory access from application clients to the
    NVM, and that by doing so opens up important
    performance optimization opportunities (up to 4$ \times
    $ reduction in latency in comparison to software-based
    security enforcement) without compromising security.
    Finally, we also sketch how a future hybrid model could
    improve the initial implementation by delegating some
    CEP functionality to a CHERI-enabled processor.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {5},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Jaleel:2019:DHP,
  author =       "Aamer Jaleel and Eiman Ebrahimi and Sam Duncan",
  title =        "{DUCATI}: High-performance Address Translation by
                 Extending {TLB} Reach of {GPU}-accelerated Systems",
  journal =      j-TACO,
  volume =       "16",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309710",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Mar 11 19:00:20 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Conventional on-chip TLB hierarchies are unable to
                 fully cover the growing application working-set sizes.
                 To make things worse, Last-Level TLB (LLT) misses
                 require multiple accesses to the page table even with
                 the use of page walk caches. Consequently, LLT misses
                 incur long address translation latency and hurt
                 performance. This article proposes two low-overhead
                 hardware mechanisms for reducing the frequency and
                 penalty of on-die LLT misses. The first, Unified CAche
                 and TLB (UCAT), enables the conventional on-die
                 Last-Level Cache to store cache lines and TLB entries
                 in a single unified structure and increases on-die TLB
                 capacity significantly. The second, DRAM-TLB, memoizes
                 virtual to physical address translations in DRAM and
                 reduces LLT miss penalty when UCAT is unable to fully
                 cover total application working-set. DRAM-TLB serves as
                 the next larger level in the TLB hierarchy that
                 significantly increases TLB coverage relative to
                 on-chip TLBs. The combination of these two mechanisms,
                 DUCATI, is an address translation architecture that
                 improves GPU performance by 81\% (up to 4.5$ \times $)
                 while requiring minimal changes to the existing system
                 design. We show that DUCATI is within 20\%, 5\%, and
                 2\% the performance of a perfect LLT system when using
                 4KB, 64KB, and 2MB pages, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Xu:2019:SSD,
  author = {Yemao Xu and Dezun Dong and Weixia Xu and Xiangke
    Liao},
  title = {{SketchDLC}: a Sketch on Distributed Deep Learning
    Communication via Trace Capturing},
  journal = j-TACO,
  volume = {16},
  number = {2},
  pages = {7:1--7:??},
  month = may,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3312570},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jul 26 14:25:54 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {With the fast development of deep learning (DL), the
    communication is increasingly a bottleneck for
    distributed workloads, and a series of optimization
    works have been done to scale out successfully.
    Nevertheless, the network behavior has not been
    investigated much yet. We intend to analyze the network
    behavior and then carry out some research through
    network simulation. Under this circumstance, an
    accurate communication measurement is necessary, as it
    is an effective way to study the network behavior and
    the basis for accurate simulation. Therefore, we
    propose to capture the deep learning communication
    (DLC) trace to achieve the measurement. To the best of
    our knowledge, we make the first attempt to capture the
    communication trace for DL training. In this article,
    we first provide detailed analyses about the
    communication mechanism of MXNet, which is a
    representative framework for distributed DL. Secondly,
    we define the DLC trace format to describe and record
    the communication behaviors. Third, we present the
    implementation of method for trace capturing. Finally,
    we make some statistics and analyses about the
    distributed DL training, including communication
    pattern, overlap ratio between computation and
    communication, computation overhead, synchronization
    overhead, update overhead, and so forth. Both the
    statistics and analyses are based on the trace files
    captured in a cluster with six machines. On the one
    hand, our trace files provide a sketch on the DLC,
    which contributes to understanding the communication
    details. On the other hand, the captured trace files
    can be used for figuring out various overheads, as they
    record the communication behaviors of each node.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {7},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Mastoras:2019:ESE,
  author = {Aristeidis Mastoras and Thomas R. Gross},
  title = {Efficient and Scalable Execution of Fine-Grained
    Dynamic Linear Pipelines},
  journal = j-TACO,
  volume = {16},
  number = {2},
  pages = {8:1--8:??},
  month = may,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3307411},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jul 26 14:25:54 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {We present Pipelite, a dynamic scheduler that exploits
    the properties of dynamic linear pipelines to achieve
    high performance for fine-grained workloads. The
    flexibility of Pipelite allows the stages and their
    data dependences to be determined at runtime. Pipelite
    unifies communication, scheduling, and synchronization
    algorithms with suitable data structures. This unified
    design introduces the local suspension mechanism and a
    wait-free enqueue operation, which allow efficient
    dynamic scheduling. The evaluation on a 44-core
    machine, using programs from three widely used
    benchmark suites, shows that Pipelite implies low
    overhead and significantly outperforms the state of the
    art in terms of speedup, scalability, and memory
    usage.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {8},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Ham:2019:EDS,
  author = {Tae Jun Ham and Juan L. Arag{\'o}n and Margaret
    Martonosi},
  title = {Efficient Data Supply for Parallel Heterogeneous
    Architectures},
  journal = j-TACO,
  volume = {16},
  number = {2},
  pages = {9:1--9:??},
  month = may,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3310332},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jul 26 14:25:54 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Decoupling techniques have been proposed to reduce the
    amount of memory latency exposed to high-performance
    accelerators as they fetch data. Although decoupled
    access-execute (DAE) and more recent decoupled data
    supply approaches offer promising single-threaded
    performance improvements, little work has considered
    how to extend them into parallel scenarios. This
    article explores the opportunities and challenges of
    designing parallel, high-performance,
    resource-efficient decoupled data supply systems. We
    propose Mercury, a parallel decoupled data supply
    system that utilizes thread-level parallelism for
    high-throughput data supply with good portability
    attributes. Additionally, we introduce some
    microarchitectural improvements for data supply units
    to efficiently handle long-latency indirect loads.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Archit. Code Optim.},
  articleno = {9},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sioutas:2019:SSH,
  author =       "Savvas Sioutas and Sander Stuijk and Luc Waeijen and
                 Twan Basten and Henk Corporaal and Lou Somers",
  title =        "Schedule Synthesis for {Halide} Pipelines through
                 Reuse Analysis",
  journal =      j-TACO,
  volume =       "16",
  number =       "2",
  pages =        "10:1--10:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3310248",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Efficient code generation for image processing
                 applications continues to pose a challenge in a domain
                 where high performance is often necessary to meet
                 real-time constraints. The inherently complex structure
                 found in most image-processing pipelines, the plethora
                 of transformations that can be applied to optimize the
                 performance of an implementation, as well as the
                 interaction of these optimizations with locality,
                 redundant computation and parallelism, can be
                 identified as the key reasons behind this issue. Recent
                 domain-specific languages (DSL) such as the Halide DSL
                 and compiler attempt to encourage high-level
                 design-space exploration to facilitate the optimization
                 process. We propose a novel optimization strategy that
                 aims to maximize producer-consumer locality by
                 exploiting reuse in image-processing pipelines. We
                 implement our analysis as a tool that can be used
                 alongside the Halide DSL to automatically generate
                 schedules for pipelines implemented in Halide and test
                 it on a variety of benchmarks. Experimental results on
                 three different multi-core architectures show an
                 average performance improvement of 40\% over the Halide
                 Auto-Scheduler and 75\% over a state-of-the-art
                 approach that targets the PolyMage DSL.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Wang:2019:SSL,
  author =       {Xiaoyuan Wang and Haikun Liu and Xiaofei Liao and Ji
                 Chen and Hai Jin and Yu Zhang and Long Zheng and
                 Bingsheng He and Song Jiang},
  title =        {Supporting Superpages and Lightweight Page Migration
                 in Hybrid Memory Systems},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {11},
  pages =        {11:1--11:??},
  doi =          {https://doi.org/10.1145/3310133},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Superpages have long been used to mitigate address
                 translation overhead in large-memory systems. However,
                 superpages often preclude lightweight page migration,
                 which is crucial for performance and energy efficiency
                 in hybrid memory systems composed of DRAM and
                 non-volatile memory (NVM). In this article, we propose
                 a novel memory management mechanism called Rainbow to
                 bridge this fundamental conflict between superpages and
                 lightweight page migration. Rainbow manages NVM at the
                 superpage granularity, and uses DRAM to cache
                 frequently accessed (hot) small pages within each
                 superpage. Correspondingly, Rainbow utilizes split TLBs
                 to support different page sizes. By introducing an
                 efficient hot page identification mechanism and a novel
                 NVM-to-DRAM address remapping mechanism, Rainbow
                 supports lightweight page migration without splintering
                 superpages. Experiment results show that Rainbow can
                 significantly reduce applications' TLB misses by
                 99.9\%, and improve application performance (in terms
                 of IPC) by up to $ 2.9 \times $ (45.3\% on average)
                 when compared to a state-of-the-art memory migration
                 policy without a superpage support.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Sargaran:2019:SSA,
  author =       {Sahar Sargaran and Naser Mohammadzadeh},
  title =        {{SAQIP}: a Scalable Architecture for Quantum
                 Information Processors},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {12},
  pages =        {12:1--12:??},
  doi =          {https://doi.org/10.1145/3311879},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Proposing an architecture that efficiently compensates
                 for the inefficiencies of physical hardware with extra
                 resources is one of the key issues in quantum computer
                 design. Although the demonstration of quantum systems
                 has been limited to some dozen qubits, scaling the
                 current small-sized lab quantum systems to large-scale
                 quantum systems that are capable of solving meaningful
                 practical problems can be the main goal of much
                 research. Focusing on this issue, in this article a
                 scalable architecture for quantum information
                 processors, called SAQIP, is proposed. Moreover, a flow
                 is presented to map and schedule a quantum circuit on
                 this architecture. Experimental results show that the
                 proposed architecture and design flow decrease the
                 average latency and the average area of quantum
                 circuits by about 81\% and 11\%, respectively, for the
                 attempted benchmarks.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Budhkar:2019:AMD,
  author =       {Prerna Budhkar and Ildar Absalyamov and Vasileios Zois
                 and Skyler Windh and Walid A. Najjar and Vassilis J.
                 Tsotras},
  title =        {Accelerating In-Memory Database Selections Using
                 Latency Masking Hardware Threads},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {13},
  pages =        {13:1--13:??},
  doi =          {https://doi.org/10.1145/3310229},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Inexpensive DRAMs have created new opportunities for
                 in-memory data analytics. However, the major bottleneck
                 in such systems is high memory access latency.
                 Traditionally, this problem is solved with large cache
                 hierarchies that only benefit regular applications.
                 Alternatively, many data-intensive applications exhibit
                 irregular behavior. Hardware multithreading can better
                 cope with high latency seen in such applications. This
                 article implements a multithreaded prototype (MTP) on
                 FPGAs for the relational selection operator that
                 exhibits control flow irregularity. On a standard TPC-H
                 query evaluation, MTP achieves a bandwidth utilization
                 of 83\%, while the CPU and the GPU implementations
                 achieve 61\% and 64\%, respectively. Besides being
                 bandwidth efficient, MTP is also $ 14.2 \times $ and $
                 4.2 \times $ more power efficient than CPU and GPU,
                 respectively.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Riebler:2019:TAH,
  author =       {Heinrich Riebler and Gavin Vaz and Tobias Kenter and
                 Christian Plessl},
  title =        {Transparent Acceleration for Heterogeneous Platforms
                 With Compilation to {OpenCL}},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {14},
  pages =        {14:1--14:??},
  doi =          {https://doi.org/10.1145/3319423},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Multi-accelerator platforms combine CPUs and different
                 accelerator architectures within a single compute node.
                 Such systems are capable of processing parallel
                 workloads very efficiently while being more energy
                 efficient than regular systems consisting of CPUs only.
                 However, the architectures of such systems are diverse,
                 forcing developers to port applications to each
                 accelerator using different programming languages,
                 models, tools, and compilers. Developers not only
                 require domain-specific knowledge but also need to
                 understand the low-level accelerator details, leading
                 to an increase in the design effort and costs. To
                 tackle this challenge, we propose a compilation
                 approach and a practical realization called HTrOP that
                 is completely transparent to the user. HTrOP is able to
                 automatically analyze a sequential CPU application,
                 detect computational hotspots, and generate parallel
                 OpenCL host and kernel code. The potential of HTrOP is
                 demonstrated by offloading hotspots to different
                 OpenCL-enabled resources (currently the CPU, the
                 general-purpose GPU, and the manycore Intel Xeon Phi)
                 for a broad set of benchmark applications. We present
                 an in-depth evaluation of our approach in terms of
                 performance gains and energy savings, taking into
                 account all static and dynamic overheads. We are able
                 to achieve speedups and energy savings of up to two
                 orders of magnitude, if an application has sufficient
                 computational intensity, when compared to a natively
                 compiled application.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Gong:2019:HAG,
  author =       {Xun Gong and Xiang Gong and Leiming Yu and David
                 Kaeli},
  title =        {{HAWS}: Accelerating {GPU} Wavefront Execution through
                 Selective Out-of-order Execution},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {15},
  pages =        {15:1--15:??},
  doi =          {https://doi.org/10.1145/3291050},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Graphics Processing Units (GPUs) have become an
                 attractive platform for accelerating challenging
                 applications on a range of platforms, from High
                 Performance Computing (HPC) to full-featured
                 smartphones. They can overcome computational barriers
                 in a wide range of data-parallel kernels. GPUs hide
                 pipeline stalls and memory latency by utilizing
                 efficient thread preemption. But given the demands on
                 the memory hierarchy due to the growth in the number of
                 computing cores on-chip, it has become increasingly
                 difficult to hide all of these stalls. In this article,
                 we propose a novel Hint-Assisted Wavefront Scheduler
                 (HAWS) to bypass long-latency stalls. HAWS starts by
                 enhancing a compiler infrastructure to identify
                 potential opportunities that can bypass memory stalls.
                 HAWS includes a wavefront scheduler that can continue
                 to execute instructions in the shadow of a memory
                 stall, executing instructions speculatively, guided by
                 compiler-generated hints. HAWS increases utilization of
                 GPU resources by aggressively fetching/executing
                 speculatively. Based on our simulation results on the
                 AMD Southern Islands GPU architecture, at an estimated
                 cost of 0.4\% total chip area, HAWS can improve
                 application performance by 14.6\% on average for memory
                 intensive applications.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Song:2019:SAR,
  author =       {Yang Song and Olivier Alavoine and Bill Lin},
  title =        {A Self-aware Resource Management Framework for
                 Heterogeneous Multicore {SoCs} with Diverse {QoS}
                 Targets},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {16},
  pages =        {16:1--16:??},
  doi =          {https://doi.org/10.1145/3319804},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {In modern heterogeneous MPSoCs, the management of
                 shared memory resources is crucial in delivering
                 end-to-end QoS. Previous frameworks have either focused
                 on singular QoS targets or the allocation of
                 partitionable resources among CPU applications at
                 relatively slow timescales. However, heterogeneous
                 MPSoCs typically require instant response from the
                 memory system where most resources cannot be
                 partitioned. Moreover, the health of different cores in
                 a heterogeneous MPSoC is often measured by diverse
                 performance objectives. In this work, we propose the
                 Self-Aware Resource Allocation framework for
                 heterogeneous MPSoCs. Priority-based adaptation allows
                 cores to use different target performance and
                 self-monitor their own intrinsic health. In response,
                 the system allocates non-partitionable resources based
                 on priorities. The proposed framework meets a diverse
                 range of QoS demands from heterogeneous cores.
                 Moreover, we present a runtime scheme to configure
                 priority-based adaptation so that distinct
                 sensitivities of heterogeneous QoS targets with respect
                 to memory allocation can be accommodated. In addition,
                 the priority of best-effort cores can also be
                 regulated.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Yebenes:2019:CSA,
  author =       {Pedro Yebenes and Jose Rocher-Gonzalez and Jesus
                 Escudero-Sahuquillo and Pedro Javier Garcia and
                 Francisco J. Alfaro and Francisco J. Quiles and
                 Crisp{\'\i}n G{\'o}mez and Jose Duato},
  title =        {Combining Source-adaptive and Oblivious Routing with
                 Congestion Control in High-performance Interconnects
                 using Hybrid and Direct Topologies},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {17},
  pages =        {17:1--17:??},
  doi =          {https://doi.org/10.1145/3319805},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Hybrid and direct topologies are cost-efficient and
                 scalable options to interconnect thousands of end nodes
                 in high-performance computing (HPC) systems. They offer
                 a rich path diversity, high bisection bandwidth, and a
                 reduced diameter guaranteeing low latency. In these
                 topologies, efficient deterministic routing algorithms
                 can be used to balance smartly the traffic flows among
                 the available routes. Unfortunately, congestion leads
                 these networks to saturation, where the HoL blocking
                 effect degrades their performance dramatically. Among
                 the proposed solutions to deal with HoL blocking, the
                 routing algorithms selecting alternative routes, such
                 as adaptive and oblivious, can mitigate the congestion
                 effects. Other techniques use queues to separate
                 congested flows from non-congested ones, thus reducing
                 the HoL blocking. In this article, we propose a new
                 approach that reduces HoL blocking in hybrid and direct
                 topologies using source-adaptive and oblivious routing.
                 This approach also guarantees deadlock-freedom as it
                 uses virtual networks to break potential cycles
                 generated by the routing policy in the topology.
                 Specifically, we propose two techniques, called
                 Source-Adaptive Solution for Head-of-Line Blocking
                 Avoidance (SASHA) and Oblivious Solution for
                 Head-of-Line Blocking Avoidance (OSHA). Experiment
                 results, carried out through simulations under
                 different traffic scenarios, show that SASHA and OSHA
                 can significantly reduce the HoL blocking.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Alshboul:2019:ECR,
  author =       {Mohammad Alshboul and Hussein Elnawawy and Reem
                 Elkhouly and Keiji Kimura and James Tuck and Yan
                 Solihin},
  title =        {Efficient Checkpointing with Recompute Scheme for
                 Non-volatile Main Memory},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        may,
  volume =       {16},
  number =       {2},
  articleno =    {18},
  pages =        {18:1--18:??},
  doi =          {https://doi.org/10.1145/3323091},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {Future main memory will likely include Non-Volatile
                 Memory. Non-Volatile Main Memory (NVMM) provides an
                 opportunity to rethink checkpointing strategies for
                 providing failure safety to applications. While there
                 are many checkpointing and logging schemes in the
                 literature, their use must be revisited as they incur
                 high execution time overheads as well as a large number
                 of additional writes to NVMM, which may significantly
                 impact write endurance. In this article, we propose a
                 novel recompute-based failure safety approach and
                 demonstrate its applicability to loop-based code.
                 Rather than keeping a fully consistent logging state,
                 we only log enough state to enable recomputation. Upon
                 a failure, our approach recovers to a consistent state
                 by determining which parts of the computation were not
                 completed and recomputing them. Effectively, our
                 approach removes the need to keep checkpoints or logs,
                 thus reducing execution time overheads and improving
                 NVMM write endurance at the expense of more complex
                 recovery. We compare our new approach against logging
                 and checkpointing on five scientific workloads,
                 including tiled matrix multiplication, on a computer
                 system model that was built on gem5 and supports Intel
                 PMEM instruction extensions. For tiled matrix
                 multiplication, our recompute approach incurs an
                 execution time overhead of only 5\%, in contrast to 8\%
                 overhead with logging and 207\% overhead with
                 checkpointing. Furthermore, recompute only adds 7\%
                 additional NVMM writes, compared to 111\% with logging
                 and 330\% with checkpointing. We also conduct
                 experiments on real hardware, allowing us to run our
                 workloads to completion while varying the number of
                 threads used for computation. These experiments
                 substantiate our simulation-based observations and
                 provide a sensitivity study and performance comparison
                 between the Recompute Scheme and Naive Checkpointing.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hadjilambrou:2019:CCO,
  author =       "Zacharias Hadjilambrou and Marios Kleanthous and
                 Georgia Antoniou and Antoni Portero and Yiannakis
                 Sazeides",
  title =        "Comprehensive Characterization of an Open Source
                 Document Search Engine",
  journal =      j-TACO,
  volume =       "16",
  number =       "2",
  pages =        "19:1--19:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3320346",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This work performs a thorough characterization and
                 analysis of the open source Lucene search library. The
                 article describes in detail the architecture,
                 functionality, and micro-architectural behavior of the
                 search engine, and investigates prominent online
                 document search research issues. In particular, we
                 study how intra-server index partitioning affects the
                 response time and throughput, explore the potential use
                 of low power servers for document search, and examine
                 the sources of performance degradation and the causes
                 of tail latencies. Some of our main conclusions are the
                 following: (a) intra-server index partitioning can
                 reduce tail latencies but with diminishing benefits as
                 incoming query traffic increases, (b) low power servers
                 given enough partitioning can provide same average and
                 tail response times as conventional high performance
                 servers, (c) index search is a CPU-intensive
                 cache-friendly application, and (d) C-states are the
                 main culprits for performance degradation in document
                 search.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Li:2019:EGC,
  author =       {Bingchao Li and Jizeng Wei and Jizhou Sun and Murali
                 Annavaram and Nam Sung Kim},
  title =        {An Efficient {GPU} Cache Architecture for Applications
                 with Irregular Memory Access Patterns},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  ajournal =     {ACM Trans. Archit. Code Optim.},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  year =         {2019},
  month =        jul,
  volume =       {16},
  number =       {3},
  articleno =    {20},
  pages =        {20:1--20:??},
  doi =          {https://doi.org/10.1145/3322127},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {GPUs provide high-bandwidth/low-latency on-chip shared
                 memory and L1 cache to efficiently service a large
                 number of concurrent memory requests. Specifically,
                 concurrent memory requests accessing contiguous memory
                 space are coalesced into warp-wide accesses. To support
                 such large accesses to L1 cache with low latency, the
                 size of L1 cache line is no smaller than that of
                 warp-wide accesses. However, such L1 cache architecture
                 cannot always be efficiently utilized when applications
                 generate many memory requests with irregular access
                 patterns especially due to branch and memory
                 divergences that make requests uncoalesced and small.
                 Furthermore, unlike L1 cache, the shared memory of GPUs
                 is not often used in many applications, which
                 essentially depends on programmers. In this article, we
                 propose Elastic-Cache, which can efficiently support
                 both fine- and coarse-grained L1 cache line management
                 for applications with both regular and irregular memory
                 access patterns to improve the L1 cache efficiency.
                 Specifically, it can store 32- or 64-byte words in
                 non-contiguous memory space to a single 128-byte cache
                 line. Furthermore, it neither requires an extra memory
                 structure nor reduces the capacity of L1 cache for tag
                 storage, since it stores auxiliary tags for
                 fine-grained L1 cache line managements in the shared
                 memory space that is not fully used in many
                 applications. To improve the bandwidth utilization of
                 L1 cache with Elastic-Cache for fine-grained accesses,
                 we further propose Elastic-Plus to issue 32-byte memory
                 requests in parallel, which can reduce the processing
                 latency of memory instructions and improve the
                 throughput of GPUs. Our experiment result shows that
                 Elastic-Cache improves the geometric-mean performance
                 of applications with irregular memory access patterns
                 by 104\% without degrading the performance of
                 applications with regular memory access patterns.
                 Elastic-Plus outperforms Elastic-Cache and improves the
                 performance of applications with irregular memory
                 access patterns by 131\%.},
  bibdate =      {Fri Jul 26 14:25:54 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Roberts:2019:POS,
  author =       "Stephen I. Roberts and Steven A. Wright and Suhaib A.
                 Fahmy and Stephen A. Jarvis",
  title =        "The Power-optimised Software Envelope",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321551",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Advances in processor design have delivered
                 performance improvements for decades. As physical
                 limits are reached, refinements to the same basic
                 technologies are beginning to yield diminishing
                 returns. Unsustainable increases in energy consumption
                 are forcing hardware manufacturers to prioritise energy
                 efficiency in their designs. Research suggests that
                 software modifications may be needed to exploit the
                 resulting improvements in current and future hardware.
                 New tools are required to capitalise on this new class
                 of optimisation. In this article, we present the Power
                 Optimised Software Envelope (POSE) model, which allows
                 developers to assess the potential benefits of power
                 optimisation for their applications. The POSE model is
                 metric agnostic and in this article, we provide
                 derivations using the established Energy-Delay Product
                 metric and the novel Energy-Delay Sum and Energy-Delay
                 Distance metrics that we believe are more appropriate
                 for energy-aware optimisation efforts. We demonstrate
                 POSE on three platforms by studying the optimisation
                 characteristics of applications from the Mantevo
                 benchmark suite. Our results show that the Pathfinder
                 application has very little scope for power
                 optimisation while TeaLeaf has the most, with all other
                 applications in the benchmark suite falling between the
                 two. Finally, we extend our POSE model with a
                 formulation known as System Summary POSE --- a
                 meta-heuristic that allows developers to assess the
                 scope a system has for energy-aware software
                 optimisation independent of the code being run.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kannan:2019:CIE,
  author =       "Ram Srivatsa Kannan and Michael Laurenzano and
                 Jeongseob Ahn and Jason Mars and Lingjia Tang",
  title =        "{Caliper}: Interference Estimator for Multi-tenant
                 Environments Sharing Architectural Resources",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "22:1--22:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3323090",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We introduce Caliper, a technique for accurately
                 estimating performance interference occurring in shared
                 servers. Caliper overcomes the limitations of prior
                 approaches by leveraging a micro-experiment-based
                 technique. In contrast to state-of-the-art approaches
                 that focus on periodically pausing co-running
                 applications to estimate slowdown, Caliper utilizes a
                 strategic phase-triggered technique to capture
                 interference due to co-location. This enables Caliper
                 to orchestrate an accurate and low-overhead
                 interference estimation technique that can be readily
                 deployed in existing production systems. We evaluate
                 Caliper for a broad spectrum of workload scenarios,
                 demonstrating its ability to seamlessly support up to
                 16 applications running simultaneously and outperform
                 the state-of-the-art approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lin:2019:CCC,
  author =       "Zhen Lin and Hongwen Dai and Michael Mantor and
                 Huiyang Zhou",
  title =        "Coordinated {CTA} Combination and Bandwidth
                 Partitioning for {GPU} Concurrent Kernel Execution",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "23:1--23:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3326124",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Contemporary GPUs support multiple kernels to run
                 concurrently on the same streaming multiprocessors
                 (SMs). Recent studies have demonstrated that such
                 concurrent kernel execution (CKE) improves both
                 resource utilization and computational throughput. Most
                 of the prior works focus on partitioning the GPU
                 resources at the cooperative thread array (CTA) level
                 or the warp scheduler level to improve CKE. However,
                 significant performance slowdown and unfairness are
                 observed when latency-sensitive kernels co-run with
                 bandwidth-intensive ones. The reason is that bandwidth
                 over-subscription from bandwidth-intensive kernels
                 leads to much aggravated memory access latency, which
                 is highly detrimental to latency-sensitive kernels.
                 Even among bandwidth-intensive kernels, more intensive
                 kernels may unfairly consume much higher bandwidth than
                 less-intensive ones. In this article, we first make a
                 case that such problems cannot be sufficiently solved
                 by managing CTA combinations alone and reveal the
                 fundamental reasons. Then, we propose a coordinated
                 approach for CTA combination and bandwidth
                 partitioning. Our approach dynamically detects
                 co-running kernels as latency sensitive or bandwidth
                 intensive. As both the DRAM bandwidth and L2-to-L1
                 Network-on-Chip (NoC) bandwidth can be the critical
                 resource, our approach partitions both bandwidth
                 resources coordinately along with selecting proper CTA
                 combinations. The key objective is to allocate more CTA
                 resources for latency-sensitive kernels and more
                 NoC/DRAM bandwidth resources to NoC-/DRAM-intensive
                 kernels. We achieve it using a variation of dominant
                 resource fairness (DRF). Compared with two
                 state-of-the-art CKE optimization schemes, SMK [52] and
                 WS [55], our approach improves the average harmonic
                 speedup by 78\% and 39\%, respectively. Even compared
                 to the best possible CTA combinations, which are
                 obtained from an exhaustive search among all possible
                 CTA combinations, our approach improves the harmonic
                 speedup by up to 51\% and 11\% on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Didier:2019:CCP,
  author =       "Keryan Didier and Dumitru Potop-Butucaru and Guillaume
                 Iooss and Albert Cohen and Jean Souyris and Philippe
                 Baufreton and Amaury Graillat",
  title =        "Correct-by-Construction Parallelization of Hard
                 Real-Time Avionics Applications on Off-the-Shelf
                 Predictable Hardware",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "24:1--24:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328799",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present the first end-to-end modeling and
                 compilation flow to parallelize hard real-time control
                 applications while fully guaranteeing the respect of
                 real-time requirements on off-the-shelf hardware. It
                 scales to thousands of dataflow nodes and has been
                 validated on two production avionics applications.
                 Unlike classical optimizing compilation, it takes as
                 input non-functional requirements (real time, resource
                 limits). To enforce these requirements, the compiler
                 follows a static resource allocation strategy, from
                 coarse-grain tasks communicating over an
                 interconnection network all the way to individual
                 variables and memory accesses. It controls timing
                 interferences resulting from mapping decisions in a
                 precise, safe, and scalable way.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zardoshti:2019:STM,
  author =       "Pantea Zardoshti and Tingzhe Zhou and Pavithra Balaji
                 and Michael L. Scott and Michael Spear",
  title =        "Simplifying Transactional Memory Support in {C++}",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "25:1--25:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328796",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "C++ has supported a provisional version of
                 Transactional Memory (TM) since 2015, via a technical
                 specification. However, TM has not seen widespread
                 adoption, and compiler vendors have been slow to
                 implement the technical specification. We conjecture
                 that the proposed TM support is too difficult for
                 programmers to use, too complex for compiler designers
                 to implement and verify, and not industry-proven enough
                 to justify final standardization in its current form.
                 To address these problems, we present a different
                 design for supporting TM in C++. By forbidding explicit
                 self-abort, and by introducing an executor-based
                 mechanism for running transactions, our approach makes
                 it easier for developers to get code up and running
                 with TM. Our proposal should also be appealing to
                 compiler developers, as it allows a spectrum of levels
                 of support for TM, with varying performance, and
                 varying reliance on hardware TM support in order to
                 provide scalability. While our design
                 does not enable some of the optimizations admitted by
                 the current technical specification, we show that it
                 enables the implementation of robust support for TM in
                 a small, orthogonal compiler extension. Our
                 implementation is able to handle a wide range of
                 transactional programs, delivering low instrumentation
                 overhead and scalability and performance on par with
                 the current state of the art. Based on this experience,
                 we believe our approach to be a viable means of
                 reinvigorating the standardization of TM in C++.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Park:2019:MCM,
  author =       "Jungwoo Park and Myoungjun Lee and Soontae Kim and
                 Minho Ju and Jeongkyu Hong",
  title =        "{MH} Cache: a Multi-retention {STT-RAM}-based
                 Low-power Last-level Cache for Mobile Hardware
                 Rendering Systems",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "26:1--26:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328520",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Mobile devices have become the most important devices
                 in our life. However, they are limited in battery
                 capacity. Therefore, low-power computing is crucial for
                 their long lifetime. A spin-transfer torque RAM
                 (STT-RAM) has become emerging memory technology because
                 of its low leakage power consumption. We herein propose
                 MH cache, a multi-retention STT-RAM-based cache
                 management scheme for last-level caches (LLC) to reduce
                 their power consumption for mobile hardware rendering
                 systems. We analyzed the memory access patterns of
                 processes and observed how rendering methods affect
                 process behaviors. We propose a cache management scheme
                 that measures write-intensity of each process
                 dynamically and exploits it to manage a power-efficient
                 multi-retention STT-RAM-based cache. Our proposed
                 scheme uses variable threshold for a process'
                 write-intensity to determine cache line placement. We
                 explain how to deal with the following issue to
                 implement our proposed scheme. Our experimental results
                 show that our techniques significantly reduce the LLC
                 power consumption by 32\% and 32.2\% in single- and
                 quad-core systems, respectively, compared to a full
                 STT-RAM LLC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Leben:2019:PCM,
  author =       "Jakob Leben and George Tzanetakis",
  title =        "Polyhedral Compilation for Multi-dimensional Stream
                 Processing",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "27:1--27:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3330999",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present a method for compilation of
                 multi-dimensional stream processing programs from
                 affine recurrence equations with unbounded domains into
                 imperative code with statically allocated memory. The
                 method involves a novel polyhedral schedule
                 transformation called periodic tiling. It accommodates
                 existing polyhedral optimizations to improve memory
                 access patterns and expose parallelism. This enables
                 efficient execution of programming languages with
                 unbounded recurrence equations, as well as optimization
                 of existing languages from which this form can be
                 derived. The method is experimentally evaluated on 5
                 DSP algorithms with large problem sizes. Results show
                 potential for improved throughput compared to
                 hand-optimized C++ (speedups on a 6-core Intel Xeon CPU
                 up to $ 10 \times $ with a geometric mean $ 3.3 \times
                 $).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sadeghi:2019:TCN,
  author =       "Mohammad Sadegh Sadeghi and Siavash Bayat Sarmadi and
                 Shaahin Hessabi",
  title =        "Toward On-chip Network Security Using Runtime
                 Isolation Mapping",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "28:1--28:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3337770",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many-cores execute a large number of diverse
                 applications concurrently. Inter-application
                 interference can lead to a security threat as timing
                 channel attack in the on-chip network. A
                 non-interference communication in the shared on-chip
                 network is a dominant necessity for secure many-core
                 platforms to leverage the concepts of the cloud and
                 embedded system-on-chip. The current non-interference
                 techniques are limited to static scheduling and need
                 router modification at micro-architecture level.
                 Mapping of applications can effectively determine the
                 interference among applications in on-chip network. In
                 this work, we explore non-interference approaches
                 through run-time mapping at software and application
                 level. We map the same group of applications in
                 isolated domain(s) to meet non-interference flows.
                 Through run-time mapping, we can maximize utilization
                 of the system without leaking information. The proposed
                 run-time mapping policy requires no router modification
                 in contrast to the best known competing schemes, and
                 the performance degradation is, on average, 16\%
                 compared to the state-of-the-art baselines.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Louise:2019:FST,
  author =       "Stephane Louise",
  title =        "A First Step Toward Using Quantum Computing for
                 Low-level {WCETs} Estimations",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "29:1--29:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3335549",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Low-Level analysis of Worst Case Execution Time (WCET)
                 is an important field for real-time system validation.
                 It stands between computer architecture and
                 mathematics, as it relies strongly on variants of
                 abstract interpretation. One of the features that
                 causes the largest uncertainty regarding WCET
                 evaluation for low-level analysis of sequential
                 execution on a single processor is taking Cache
                 Memory-related Delays (CMRD) and Cache-related
                 Preemption Delays (CRPD) correctly into account.
                 Research work from the 1990s provides a good basic
                 framework for this problem as long as a task runs
                 without preemption. But when preemption of tasks is
                 allowed, although several formalisms exist, their
                 predictive power is lower and the usual approach relies
                 on analyses of NP-hard problems. In this article, we
                 want to show some potential advantages of using a
                 formalism inspired by Quantum Computing (QC) to
                 evaluate CMRDs with preemptions while avoiding the
                 NP-hard problem underneath. The experimental results,
                 with a classic (non-quantum) numerical approach, on a
                 selection of M{\"a}lardalen benchmark programs display very
                 good accuracy, while the complexity of the evaluation
                 is a low-order polynomial of the number of memory
                 accesses. While it is not yet a fully parallel quantum
                 algorithm, we provide a first roadmap on how to reach
                 such an objective.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chikin:2019:MAA,
  author =       "Artem Chikin and Taylor Lloyd and Jos{\'e} Nelson
                 Amaral and Ettore Tiotto and Muhammad Usman",
  title =        "Memory-access-aware Safety and Profitability Analysis
                 for Transformation of Accelerator-bound {OpenMP}
                 Loops",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "30:1--30:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3333060",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iteration Point Difference Analysis is a new static
                 analysis framework that can be used to determine the
                 memory coalescing characteristics of parallel loops
                 that target GPU offloading and to ascertain safety and
                 profitability of loop transformations with the goal of
                 improving their memory access characteristics. This
                 analysis can propagate definitions through control
                 flow, works for non-affine expressions, and is capable
                 of analyzing expressions that reference conditionally
                 defined values. This analysis framework enables safe
                 and profitable loop transformations. Experimental
                 results demonstrate potential for dramatic performance
                 improvements. GPU kernel execution time across the
                 Polybench suite is improved by up to $ 25.5 \times $ on
                 an Nvidia P100 with benchmark overall improvement of up
                 to $ 3.2 \times $. An opportunity detected in a SPEC
                 ACCEL benchmark yields kernel speedup of $ 86.5 \times
                 $ with a benchmark improvement of $ 3.3 \times $. This
                 work also demonstrates how architecture-aware compilers
                 improve code portability and reduce programmer
                 effort.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cha:2019:MDC,
  author =       "Sanghoon Cha and Bokyeong Kim and Chang Hyun Park and
                 Jaehyuk Huh",
  title =        "Morphable {DRAM} Cache Design for Hybrid Memory
                 Systems",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "31:1--31:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3338505",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "DRAM caches have emerged as an efficient new layer in
                 the memory hierarchy to address the increasing
                 diversity of memory components. When a small amount of
                 fast memory is combined with slow but large memory, the
                 cache-based organization of the fast memory can provide
                 a SW-transparent solution for the hybrid memory
                 systems. In such DRAM cache designs, their
                 effectiveness is affected by the bandwidth and latency
                 of both fast and slow memory. To quantitatively assess
                 the effect of memory configurations and application
                 patterns on the DRAM cache designs, this article first
                 investigates how three prior approaches perform with
                 six hybrid memory scenarios. From the investigation, we
                 observe no single DRAM cache organization always
                 outperforms the other organizations across the diverse
                 hybrid memory configurations and memory access
                 patterns. Based on this observation, this article
                 proposes a reconfigurable DRAM cache design that can
                 adapt to different hybrid memory combinations and
                 workload patterns. Unlike the fixed tag and data arrays
                 of conventional on-chip SRAM caches, this study
                 advocates to exploit the flexibility of DRAM caches,
                 which can store tags and data to DRAM in any arbitrary
                 way. Using a sample-based mechanism, the proposed DRAM
                 cache controller dynamically finds the best
                 organization from three candidates and applies the best
                 one by reconfiguring the tags and data layout in the
                 DRAM cache. Our evaluation shows that the proposed
                 morphable DRAM cache can outperform the fixed DRAM
                 configurations across six hybrid memory
                 configurations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Luo:2019:SCT,
  author =       "Chao Luo and Yunsi Fei and David Kaeli",
  title =        "Side-channel Timing Attack of {RSA} on a {GPU}",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "32:1--32:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3341729",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:27:40 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3341729",
  abstract =     "To increase computation throughput, general purpose
                 Graphics Processing Units (GPUs) have been leveraged to
                 accelerate computationally intensive workloads. GPUs
                 have been used as cryptographic engines, improving
                 encryption/decryption throughput and leveraging the
                 GPU's Single Instruction Multiple Thread (SIMT) model.
                 RSA is a widely used public-key cipher and has been
                 ported onto GPUs for signing and decrypting large
                 files. Although performance has been significantly
                 improved, the security of RSA on GPUs is vulnerable to
                 side-channel timing attacks and is an exposure
                 overlooked in previous studies. GPUs tend to be
                 naturally resilient to side-channel attacks, given that
                 they execute a large number of concurrent threads,
                 performing many RSA operations on different data in
                 parallel. Given the degree of parallel execution on a
                 GPU, there will be a significant amount of noise
                 introduced into the timing channel given the thousands
                 of concurrent threads executing concurrently. In this
                 work, we build a timing model to capture the parallel
                 characteristics of an RSA public-key cipher implemented
                 on a GPU. We consider optimizations that include using
                 Montgomery multiplication and sliding-window
                 exponentiation to implement cryptographic operations.
                 Our timing model considers the challenges of parallel
                 execution, complications that do not occur in
                 single-threaded computing platforms. Based on our
                 timing model, we launch successful timing attacks on
                 RSA running on a GPU, extracting the private key of
                 RSA. We also present an effective error detection and
                 correction mechanism. Our results demonstrate that GPU
                 acceleration of RSA is vulnerable to side-channel
                 timing attacks. We propose several countermeasures to
                 defend against this class of attacks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yuan:2019:RTL,
  author =       "Liang Yuan and Chen Ding and Wesley Smith and Peter
                 Denning and Yunquan Zhang",
  title =        "A Relational Theory of Locality",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "33:1--33:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3341109",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:27:40 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3341109",
  abstract =     "In many areas of program and system analysis and
                 optimization, locality is a common concept and has been
                 defined and measured in many ways. This article aims to
                 formally establish relations between these previously
                 disparate types of locality. It categorizes locality
                 definitions in three groups and shows whether and how
                 they can be interconverted. For the footprint, a recent
                 metric, it gives a new measurement algorithm that is
                 asymptotically more time/space efficient than previous
                 approaches. Using the conversion relations, the new
                 algorithm derives with the same efficiency different
                 locality metrics developed and used in program
                 analysis, memory management, and cache design.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Thangamani:2020:ORC,
  author =       "Arun Thangamani and V. Krishna Nandivada",
  title =        "Optimizing Remote Communication in {X10}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "34:1--34:26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345558",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "X10 is a partitioned global address space programming
                 language that supports the notion of places; a place
                 consists of some data and some lightweight tasks called
                 activities. Each activity runs at a place and may
                 invoke a place-change operation (using the
                 at-construct) to synchronously perform some computation
                 at another place. These place-change operations can be
                 very expensive, as they need to copy all the required
                 data from the current place to the remote place.
                 However, identifying the necessary number of
                 place-change operations and the required data during
                 each place-change operation are non-trivial tasks,
                 especially in the context of irregular applications
                 (like graph applications) that contain complex code
                 with large amounts of cross-referencing objects---not all
                 of those objects may be actually required, at the
                 remote place. In this article, we present AT-Com, a
                 scheme to optimize X10 code with place-change
                 operations. AT-Com consists of two inter-related new
                 optimizations: (i) AT-Opt, which minimizes the amount
                 of data serialized and communicated during place-change
                 operations, and (ii) AT-Pruning, which
                 identifies/elides redundant place-change operations and
                 does parallel execution of place-change operations.
                 AT-Opt uses a novel abstraction, called
                 abstract-place-tree, to capture place-change operations
                 in the program. For each place-change operation, AT-Opt
                 uses a novel inter-procedural analysis to precisely
                 identify the data required at the remote place in terms
                 of the variables in the current scope. AT-Opt then
                 emits the appropriate code to copy the identified
                 data-items to the remote place. AT-Pruning introduces a
                 set of program transformation techniques to emit
                 optimized code such that it avoids the redundant
                 place-change operations. We have implemented AT-Com in
                 the x10v2.6.0 compiler and tested it over the IMSuite
                 benchmark kernels. Compared to the current X10
                 compiler, the AT-Com optimized code achieved a
                 geometric mean speedup of 18.72$ \times $ and 17.83$
                 \times $ on a four-node (32 cores per node) Intel and
                 two-node (16 cores per node) AMD system,
                 respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Srikanth:2020:MAS,
  author =       "Sriseshan Srikanth and Anirudh Jain and Joseph M.
                 Lennon and Thomas M. Conte and Erik DeBenedictis and
                 Jeanine Cook",
  title =        "{MetaStrider}: Architectures for Scalable
                 Memory-centric Reduction of Sparse Data Streams",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "35:1--35:26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3355396",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Reduction is an operation performed on the values of
                 two or more key-value pairs that share the same key.
                 Reduction of sparse data streams finds application in a
                 wide variety of domains such as data and graph
                 analytics, cybersecurity, machine learning, and HPC
                 applications. However, these applications exhibit low
                 locality of reference, rendering traditional
                 architectures and data representations inefficient.
                 This article presents MetaStrider, a significant
                 algorithmic and architectural enhancement to the
                 state-of-the-art, SuperStrider. Furthermore, these
                 enhancements enable a variety of parallel,
                 memory-centric architectures that we propose, resulting
                 in demonstrated performance that scales near-linearly
                 with available memory-level parallelism.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Koraei:2020:DSS,
  author =       "Mostafa Koraei and Omid Fatemi and Magnus Jahre",
  title =        "{DCMI}: a Scalable Strategy for Accelerating Iterative
                 Stencil Loops on {FPGAs}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "36:1--36:24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3352813",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iterative Stencil Loops (ISLs) are the key kernel
                 within a range of compute-intensive applications. To
                 accelerate ISLs with Field Programmable Gate Arrays, it
                 is critical to exploit parallelism (1) among elements
                 within the same iteration and (2) across loop
                 iterations. We propose a novel ISL acceleration scheme
                 called Direct Computation of Multiple Iterations (DCMI)
                 that improves upon prior work by pre-computing the
                 effective stencil coefficients after a number of
                 iterations at design time---resulting in accelerators
                 that use minimal on-chip memory and avoid redundant
                 computation. This enables DCMI to improve throughput by
                 up to 7.7$ \times $ compared to the state-of-the-art
                 cone-based architecture.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Peled:2020:NNP,
  author =       "Leeor Peled and Uri Weiser and Yoav Etsion",
  title =        "A Neural Network Prefetcher for Arbitrary Memory
                 Access Patterns",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "37:1--37:27",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345000",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3345000",
  abstract =     "Memory prefetchers are designed to identify and
                 prefetch specific access patterns, including
                 spatiotemporal locality (e.g., strides, streams),
                 recurring patterns (e.g., varying strides, temporal
                 correlation), and specific irregular patterns (e.g.,
                 \ldots{})",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Vasilache:2020:NAL,
  author =       "Nicolas Vasilache and Oleksandr Zinenko and Theodoros
                 Theodoridis and Priya Goyal and Zachary DeVito and
                 William S. Moses and Sven Verdoolaege and Andrew Adams
                 and Albert Cohen",
  title =        "The Next 700 Accelerated Layers: From Mathematical
                 Expressions of Network Computation Graphs to
                 Accelerated {GPU} Kernels, Automatically",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "38:1--38:26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3355606",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Deep learning frameworks automate the deployment,
                 distribution, synchronization, memory allocation, and
                 hardware acceleration of models represented as graphs
                 of computational operators. These operators wrap
                 high-performance libraries such as cuDNN or NNPACK.
                 When the computation does not match any predefined
                 library call, custom operators must be implemented,
                 often at high engineering cost and performance penalty,
                 limiting the pace of innovation. To address this
                 productivity gap, we propose and evaluate: (1) a
                 domain-specific language with a tensor notation close
                 to the mathematics of deep learning; (2) a Just-In-Time
                 optimizing compiler based on the polyhedral framework;
                 (3) carefully coordinated linear optimization and
                 evolutionary algorithms to synthesize high-performance
                 CUDA kernels; (4) the transparent integration of our
                 flow into PyTorch and Caffe2, providing the fully
                 automatic synthesis of high-performance GPU kernels
                 from simple tensor algebra. The performance is
                 comparable to, and often exceeds the performance of,
                 highly tuned libraries.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jiang:2020:LLA,
  author =       "Wenbin Jiang and Yang Ma and Bo Liu and Haikun Liu and
                 Bing Bing Zhou and Jian Zhu and Song Wu and Hai Jin",
  title =        "{Layup}: Layer-adaptive and Multi-type
                 Intermediate-oriented Memory Optimization for
                 {GPU}-based {CNNs}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "39:1--39:23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3357238",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Although GPUs have emerged as the mainstream for the
                 acceleration of convolutional neural network (CNN)
                 training processes, they usually have limited physical
                 memory, meaning that it is hard to train large-scale
                 CNN models. Many methods for memory optimization have
                 been proposed to decrease the memory consumption of
                 CNNs and to mitigate the increasing scale of these
                 networks; however, this optimization comes at the cost
                 of an obvious drop in time performance. We propose a
                 new memory optimization strategy named Layup that
                 realizes both better memory efficiency and better time
                 performance. First, a fast layer-type-specific method
                 for memory optimization is presented, based on the new
                 finding that a single memory optimization often shows
                 dramatic differences in time performance for different
                 types of layers. Second, a new memory reuse method is
                 presented in which greater attention is paid to
                 multi-type intermediate data such as convolutional
                 workspaces and cuDNN handle data. Experiments show that
                 Layup can significantly increase the scale of
                 extra-deep network models on a single GPU with lower
                 performance loss. It even can train ResNet with 2,504
                 layers using 12GB memory, outperforming the
                 state-of-the-art work of SuperNeurons with 1,920 layers
                 (batch size = 16).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Siso:2020:EAV,
  author =       "Sergi Siso and Wes Armour and Jeyarajan
                 Thiyagalingam",
  title =        "Evaluating Auto-Vectorizing Compilers through
                 Objective Withdrawal of Useful Information",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "40:1--40:23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3356842",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The need for compilers to generate highly vectorized
                 code is at an all-time high with the increasing
                 vectorization capabilities of modern processors. To
                 this end, the information that compilers have at their
                 disposal, either through code analysis or via user
                 annotations, is instrumental for auto-vectorization,
                 and hence for the overall performance. However, the
                 information that is available to compilers at compile
                 time and its accuracy varies greatly, as does the
                 resulting performance of vectorizing compilers.
                 Benchmarks like the Test Suite for Vectorizing
                 Compilers (TSVC) have been developed to evaluate the
                 vectorization capability of such compilers. The
                 overarching approach of TSVC and similar benchmarks is
                 to evaluate the compilers under the best possible
                 scenario (i.e., assuming that compilers have access to
                 all useful contextual information at compile time).
                 Although this idealistic view is useful to observe the
                 capability of compilers for auto-vectorization, it is
                 not a true reflection of the conditions found in
                 real-world applications. In this article, we propose a
                 novel method for evaluating the auto-vectorization
                 capability of compilers. Instead of assuming that
                 compilers have access to a wealth of information at
                 compile time, we formulate a method to objectively
                 supply or withdraw information that would otherwise aid
                 the compiler in the auto-vectorization process. This
                 method is orthogonal to the approach adopted by TSVC,
                 and as such, it provides the means of assessing the
                 capabilities of modern vectorizing compilers in a more
                 detailed way. Using this new method, we exhaustively
                 evaluated five industry-grade compilers (GNU, Intel,
                 Clang, PGI, and IBM) on four representative vector
                 platforms (AVX-2, AVX-512 (Skylake), AVX-512 (KNL), and
                 AltiVec) using the modified version of TSVC and
                 application-level proxy kernels. The results show the
                 impact that withdrawing information has on the
                 vectorization capabilities of each compiler and also
                 prove the validity of the presented technique.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Resch:2020:PBN,
  author =       "Salonik Resch and S. Karen Khatamifard and Zamshed
                 Iqbal Chowdhury and Masoud Zabihi and Zhengyang Zhao
                 and Jian-Ping Wang and Sachin S. Sapatnekar and Ulya R.
                 Karpuzcu",
  title =        "{PIMBALL}: Binary Neural Networks in Spintronic
                 Memory",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "41:1--41:26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3357250",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 12 15:31:26 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Neural networks span a wide range of applications of
                 industrial and commercial significance. Binary neural
                 networks (BNN) are particularly effective in trading
                 accuracy for performance, energy efficiency, or
                 hardware/software complexity. Here, we introduce a
                 spintronic, re-configurable in-memory BNN accelerator,
                 PIMBALL: Processing In Memory BNN AccL(L)erator, which
                 allows for massively parallel and energy efficient
                 computation. PIMBALL is capable of being used as a
                 standard spintronic memory (STT-MRAM) array and a
                 computational substrate simultaneously. We evaluate
                 PIMBALL using multiple image classifiers and a genomics
                 kernel. Our simulation results show that PIMBALL is
                 more energy efficient than alternative CPU-, GPU-, and
                 FPGA-based implementations while delivering higher
                 throughput.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jiang:2020:EBC,
  author =       "Zhen Hang Jiang and Yunsi Fei and David Kaeli",
  title =        "Exploiting Bank Conflict-based Side-channel Timing
                 Leakage of {GPUs}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "42:1--42:24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3361870",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3361870",
  abstract =     "To prevent information leakage during program
                 execution, modern software cryptographic
                 implementations target constant-time function, where
                 the number of instructions executed remains the same
                 when program inputs change. However, the underlying
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Daruwalla:2020:BVC,
  author =       "Kyle Daruwalla and Heng Zhuo and Rohit Shukla and
                 Mikko Lipasti",
  title =        "{BitSAD v2}: Compiler Optimization and Analysis for
                 Bitstream Computing",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "43:1--43:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364999",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3364999",
  abstract =     "Computer vision and machine learning algorithms
                 operating under a strict power budget require an
                 alternate computing paradigm. While bitstream computing
                 (BC) satisfies these constraints, creating BC systems
                 is difficult. To address the design challenges,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Mastoras:2020:CDL,
  author =       "Aristeidis Mastoras and Thomas R. Gross",
  title =        "Chunking for Dynamic Linear Pipelines",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "44:1--44:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3363815",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3363815",
  abstract =     "Dynamic scheduling and dynamic creation of the
                 pipeline structure are crucial for efficient execution
                 of pipelined programs. Nevertheless, dynamic systems
                 imply higher overhead than static systems. Therefore,
                 chunking is the key to decrease the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Selva:2020:BPR,
  author =       "Manuel Selva and Fabian Gruber and Diogo Sampaio and
                 Christophe Guillon and Louis-No{\"e}l Pouchet and
                 Fabrice Rastello",
  title =        "Building a Polyhedral Representation from an
                 Instrumented Execution: Making Dynamic Analyses of
                 Nonaffine Programs Scalable",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "45:1--45:26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3363785",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3363785",
  abstract =     "The polyhedral model has been successfully used in
                 production compilers. Nevertheless, only a very
                 restricted class of applications can benefit from it.
                 Recent proposals investigated how runtime information
                 could be used to apply polyhedral optimization
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Yasin:2020:MGM,
  author =       "Ahmad Yasin and Jawad Haj-Yahya and Yosi Ben-Asher and
                 Avi Mendelson",
  title =        "A Metric-Guided Method for Discovering Impactful
                 Features and Architectural Insights for {Skylake}-Based
                 Processors",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "46:1--46:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3369383",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3369383",
  abstract =     "The slowdown in technology scaling puts architectural
                 features at the forefront of the innovation in modern
                 processors. This article presents a Metric-Guided
                 Method (MGM) that extends Top-Down analysis with
                 carefully selected, dynamically adapted \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Zhao:2020:FTF,
  author =       "Jie Zhao and Albert Cohen",
  title =        "Flextended Tiles: a Flexible Extension of Overlapped
                 Tiles for Polyhedral Compilation",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "47:1--47:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3369382",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3369382",
  abstract =     "Loop tiling to exploit data locality and parallelism
                 plays an essential role in a variety of general-purpose
                 and domain-specific compilers. Affine transformations
                 in polyhedral frameworks implement classical forms of
                 rectangular and parallelogram tiling, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Gerzhoy:2020:NMS,
  author =       "Daniel Gerzhoy and Xiaowu Sun and Michael Zuzak and
                 Donald Yeung",
  title =        "Nested {MIMD--SIMD} Parallelization for Heterogeneous
                 Microprocessors",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "48:1--48:27",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3368304",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3368304",
  abstract =     "Heterogeneous microprocessors integrate a CPU and GPU
                 on the same chip, providing fast CPU-GPU communication
                 and enabling cores to compute on data {``in place.''}
                 This permits exploiting a finer granularity of
                 parallelism on the integrated GPUs, and enables
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Xia:2020:DAB,
  author =       "Chunwei Xia and Jiacheng Zhao and Huimin Cui and
                 Xiaobing Feng and Jingling Xue",
  title =        "{DNNTune}: Automatic Benchmarking {DNN} Models for
                 Mobile-cloud Computing",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "49:1--49:26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3368305",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/super.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3368305",
  abstract =     "Deep Neural Networks (DNNs) are now increasingly
                 adopted in a variety of Artificial Intelligence (AI)
                 applications. Meantime, more and more DNNs are moving
                 from cloud to the mobile devices, as emerging AI chips
                 are integrated into mobiles. Therefore, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Briggs:2020:FRT,
  author =       "Ian Briggs and Arnab Das and Mark Baranowski and
                 Vishal Sharma and Sriram Krishnamoorthy and Zvonimir
                 Rakamari{\'c} and Ganesh Gopalakrishnan",
  title =        "{FailAmp}: Relativization Transformation for Soft
                 Error Detection in Structured Address Generation",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "50:1--50:21",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3369381",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3369381",
  abstract =     "We present FailAmp, a novel LLVM program
                 transformation algorithm that makes programs employing
                 structured index calculations more robust against soft
                 errors. Without FailAmp, an offset error can go
                 undetected; with FailAmp, all subsequent offsets are
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ahmad:2020:DDM,
  author =       "Khalid Ahmad and Hari Sundar and Mary Hall",
  title =        "Data-driven Mixed Precision Sparse Matrix Vector
                 Multiplication for {GPUs}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "51:1--51:24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3371275",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3371275",
  abstract =     "We optimize Sparse Matrix Vector multiplication (SpMV)
                 using a mixed precision strategy (MpSpMV) for Nvidia
                 V100 GPUs. The approach has three benefits: (1) It
                 reduces computation time, (2) it reduces the size of
                 the input matrix and therefore reduces data movement,
                 and (3) it provides an opportunity for increased
                 parallelism. MpSpMV's decision to lower to single
                 precision is data driven, based on individual nonzero
                 values of the sparse matrix. On all real-valued
                 matrices from the Sparse Matrix Collection, we obtain a
                 maximum speedup of $ 2.61 \times $ and average speedup
                 of $ 1.06 \times $ over double precision, while
                 maintaining higher accuracy compared to single
                 precision.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Stoltzfus:2020:TOS,
  author =       "Larisa Stoltzfus and Bastian Hagedorn and Michel
                 Steuwer and Sergei Gorlatch and Christophe Dubach",
  title =        "Tiling Optimizations for Stencil Computations Using
                 Rewrite Rules in Lift",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "52:1--52:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3368858",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3368858",
  abstract =     "Stencil computations are a widely used type of
                 algorithm, found in applications from physical
                 simulations to machine learning. Stencils are
                 embarrassingly parallel, therefore fit on modern
                 hardware such as Graphic Processing Units perfectly.
                 Although \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{vanderVlag:2020:ECB,
  author =       "Michiel A. van der Vlag and Georgios Smaragdos and
                 Zaid Al-Ars and Christos Strydis",
  title =        "Exploring Complex Brain-Simulation Workloads on
                 Multi-{GPU} Deployments",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "53:1--53:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3371235",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3371235",
  abstract =     "In-silico brain simulations are the de-facto tools
                 computational neuroscientists use to understand
                 large-scale and complex brain-function dynamics.
                 Current brain simulators do not scale efficiently
                 enough to large-scale problem sizes (e.g., $ > 100{,}000
                 $ \ldots{})",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Elkhouly:2020:CSC,
  author =       "Reem Elkhouly and Mohammad Alshboul and Akihiro
                 Hayashi and Yan Solihin and Keiji Kimura",
  title =        "Compiler-support for Critical Data Persistence in
                 {NVM}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "54:1--54:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3371236",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3371236",
  abstract =     "Non-volatile Main Memories (NVMs) offer a promising
                 way to preserve data persistence and enable computation
                 recovery in case of failure. While the use of NVMs can
                 significantly reduce the overhead of failure recovery,
                 which is the case with High-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Chelini:2020:DLT,
  author =       "Lorenzo Chelini and Oleksandr Zinenko and Tobias
                 Grosser and Henk Corporaal",
  title =        "Declarative Loop Tactics for Domain-specific
                 Optimization",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "55:1--55:25",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3372266",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3372266",
  abstract =     "Increasingly complex hardware makes the design of
                 effective compilers difficult. To reduce this problem,
                 we introduce Declarative Loop Tactics, which is a novel
                 framework of composable program transformations based
                 on an internal tree-like program \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Khan:2020:SMS,
  author =       "Asif Ali Khan and Fazal Hameed and Robin Bl{\"a}sing
                 and Stuart S. P. Parkin and Jeronimo Castrillon",
  title =        "{ShiftsReduce}: Minimizing Shifts in {Racetrack Memory
                 4.0}",
  journal =      j-TACO,
  volume =       "16",
  number =       "4",
  pages =        "56:1--56:23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3372489",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 11 07:11:45 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3372489",
  abstract =     "Racetrack memories (RMs) have significantly evolved
                 since their conception in 2008, making them a serious
                 contender in the field of emerging memory technologies.
                 Despite key technological advancements, the access
                 latency and energy consumption of an RM-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Li:2020:DCP,
  author =       "Yuhao Li and Dan Sun and Benjamin C. Lee",
  title =        "Dynamic Colocation Policies with Reinforcement
                 Learning",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "1:1--1:25",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3375714",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3375714",
  abstract =     "We draw on reinforcement learning frameworks to design
                 and implement an adaptive controller for managing
                 resource contention. During runtime, the controller
                 observes the dynamic system conditions and optimizes
                 control policies that satisfy latency \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tampouratzis:2020:NHI,
  author =       "Nikolaos Tampouratzis and Ioannis Papaefstathiou and
                 Antonios Nikitakis and Andreas Brokalakis and Stamatis
                 Andrianakis and Apostolos Dollas and Marco Marcon and
                 Emanuele Plebani",
  title =        "A Novel, Highly Integrated Simulator for Parallel and
                 Distributed Systems",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "2:1--2:28",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3378934",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3378934",
  abstract =     "In an era of complex networked parallel heterogeneous
                 systems, simulating independently only parts,
                 components, or attributes of a system-under-design is a
                 cumbersome, inaccurate, and inefficient approach.
                 Moreover, by considering each part of a system
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jiang:2020:EHE,
  author =       "Lijuan Jiang and Chao Yang and Wenjing Ma",
  title =        "Enabling Highly Efficient Batched Matrix
                 Multiplications on {SW26010} Many-core Processor",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "3:1--3:23",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3378176",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3378176",
  abstract =     "We present a systematic methodology for optimizing
                 batched matrix multiplications on SW26010 many-core
                 processor of the Sunway TaihuLight supercomputer. Five
                 surrogate algorithms and a machine learning-based
                 algorithm selector are proposed to fully \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Cavus:2020:IPI,
  author =       "Mustafa Cavus and Resit Sendag and Joshua J. Yi",
  title =        "Informed Prefetching for Indirect Memory Accesses",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "4:1--4:29",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3374216",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3374216",
  abstract =     "Indirect memory accesses have irregular access
                 patterns that limit the performance of conventional
                 software and hardware-based prefetchers. To address
                 this problem, we propose the Array Tracking Prefetcher
                 (ATP), which tracks array-based indirect memory
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Uguen:2020:ASA,
  author =       "Yohann Uguen and Florent {De Dinechin} and Victor
                 Lezaud and Steven Derrien",
  title =        "Application-Specific Arithmetic in High-Level
                 Synthesis Tools",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "5:1--5:23",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3377403",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3377403",
  abstract =     "This work studies hardware-specific optimization
                 opportunities currently unexploited by high-level
                 synthesis compilers. Some of these optimizations are
                 specializations of floating-point operations that
                 respect the usual semantics of the input program
                 without changing the numerical result. Some other
                 optimizations, locally triggered by the programmer
                 thanks to a pragma, assume a different semantics, where
                 floating-point code is interpreted as the specification
                 of computation with real numbers. The compiler is then
                 in charge to ensure an application-level accuracy
                 constraint expressed in the pragma and has the freedom
                 to use non-standard arithmetic hardware when more
                 efficient. These two classes of optimizations are
                 prototyped in the GeCoS source-to-source compiler and
                 evaluated on the Polybench and EEMBC benchmark suites.
                 Latency is reduced by up to 93\%, and resource usage is
                 reduced by up to 58\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Song:2020:IME,
  author =       "Yang Song and Bill Lin",
  title =        "Improving Memory Efficiency in Heterogeneous {MPSoCs}
                 through Row-Buffer Locality-aware Forwarding",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "6:1--6:26",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3377149",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3377149",
  abstract =     "In heterogeneous multicore systems, the memory
                 subsystem plays a critical role, since most
                 core-to-core communications are conducted through the
                 main memory. Memory efficiency has a substantial impact
                 on system performance. Although memory traffic from
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wu:2020:MBS,
  author =       "Hao Wu and Weizhi Liu and Huanxin Lin and Cho-Li
                 Wang",
  title =        "A Model-Based Software Solution for Simultaneous
                 Multiple Kernels on {GPUs}",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "7:1--7:26",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3377138",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3377138",
  abstract =     "As a critical computing resource in multiuser systems
                 such as supercomputers, data centers, and cloud
                 services, a GPU contains multiple compute units (CUs).
                 GPU Multitasking is an intuitive solution to
                 underutilization in GPGPU computing. Recently
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Shi:2020:OSB,
  author =       "Xuanhua Shi and Wei Liu and Ligang He and Hai Jin and
                 Ming Li and Yong Chen",
  title =        "Optimizing the {SSD} Burst Buffer by Traffic
                 Detection",
  journal =      j-TACO,
  volume =       "17",
  number =       "1",
  pages =        "8:1--8:26",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3377705",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Mar 10 08:30:23 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3377705",
  abstract =     "Currently, HPC storage systems still use hard disk
                 drive (HDD) as their dominant storage device. Solid
                 state drive (SSD) is widely deployed as the buffer to
                 HDDs. Burst buffer has also been proposed to manage the
                 SSD buffering of bursty write requests.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kalra:2020:ACB,
  author =       "Charu Kalra and Fritz Previlon and Norm Rubin and
                 David Kaeli",
  title =        "{ArmorAll}: Compiler-based Resilience Targeting {GPU}
                 Applications",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "9:1--9:24",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3382132",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3382132",
  abstract =     "The vulnerability of GPUs to soft errors has become a
                 first-class design concern as they are increasingly
                 being used in accuracy-sensitive and safety-critical
                 domains. Existing solutions used to enhance the
                 reliability of GPUs come with significant overhead in
                 terms of area, power, and/or performance. In this
                 article, we propose ArmorAll, a light-weight, adaptive,
                 selective, and portable software solution to protect
                 GPUs against soft errors. ArmorAll consists of a set of
                 purely compiler-based redundancy schemes designed to
                 optimize instruction duplication on GPUs, thereby
                 enabling much more reliable execution. The choice of
                 the scheme determines the subset of instructions that
                 must be duplicated in an application, allowing
                 adaptable fault coverage for different applications.
                 ArmorAll can intelligently select a redundancy scheme
                 that provides the best coverage to an application with
                 an accuracy of 91.7\%. The high coverage provided by
                 ArmorAll comes at an average improvement of 64.5\% in
                 runtime \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Cherubin:2020:DPA,
  author =       "Stefano Cherubin and Daniele Cattaneo and Michele
                 Chiari and Giovanni Agosta",
  title =        "Dynamic Precision Autotuning with {TAFFO}",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "10:1--10:26",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3388785",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3388785",
  abstract =     "Many classes of applications, both in the embedded and
                 high performance domains, can trade off the accuracy of
                 the computed results for computation performance. One
                 way to achieve such a trade-off is precision
                 tuning---that is, to modify the data types used for the
                 computation by reducing the bit width, or by changing
                 the representation from floating point to fixed point.
                 We present a methodology for high-accuracy dynamic
                 precision tuning based on the identification of input
                 classes (i.e., classes of input datasets that benefit
                 from similar optimizations). When a new input region is
                 detected, the application kernels are re-compiled on
                 the fly with the appropriate selection of parameters.
                 In this way, we obtain a continuous optimization
                 approach that enables the exploitation of the reduced
                 precision computation while progressively exploring the
                 solution space, thus reducing the time required by
                 compilation overheads. We provide tools to support the
                 automation of the runtime part of the solution, leaving
                 to the user only the task of identifying the input
                 classes. Our approach provides a significant
                 performance boost (up to 320\%) on the typical
                 approximate computing benchmarks, without meaningfully
                 affecting the accuracy of the result, since the error
                 remains always below 3\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Erdem:2020:RDS,
  author =       "Ahmet Erdem and Cristina Silvano and Thomas Boesch and
                 Andrea Carlo Ornstein and Surinder-Pal Singh and
                 Giuseppe Desoli",
  title =        "Runtime Design Space Exploration and Mapping of
                 {DCNNs} for the Ultra-Low-Power {Orlando SoC}",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "11:1--11:25",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3379933",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3379933",
  abstract =     "Recent trends in deep convolutional neural networks
                 (DCNNs) impose hardware accelerators as a viable
                 solution for computer vision and speech recognition.
                 The Orlando SoC architecture from STMicroelectronics
                 targets exactly this class of problems by \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Sabet:2020:RAU,
  author =       "Amir Hossein Nodehi Sabet and Junqiao Qiu and Zhijia
                 Zhao and Sriram Krishnamoorthy",
  title =        "Reliability Analysis for Unreliable {FSM}
                 Computations",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "12:1--12:23",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3377456",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3377456",
  abstract =     "Finite State Machines (FSMs) are fundamental in both
                 hardware design and software development. However, the
                 reliability of FSM computations remains poorly
                 understood. Existing reliability analyses are mainly
                 designed for generic computations and are \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Xue:2020:NIA,
  author =       "Jiachen Xue and T. N. Vijaykumar and Mithuna
                 Thottethodi",
  title =        "Network Interface Architecture for Remote Indirect
                 Memory Access {(RIMA)} in Datacenters",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "13:1--13:22",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3374215",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3374215",
  abstract =     "Remote Direct Memory Access (RDMA) fabrics such as
                 InfiniBand and Converged Ethernet report latency
                 shorter by a factor of 50 than TCP. As such, RDMA is a
                 potential replacement for TCP in datacenters (DCs)
                 running low-latency applications, such as Web
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2020:CFS,
  author =       "Qinggang Wang and Long Zheng and Jieshan Zhao and
                 Xiaofei Liao and Hai Jin and Jingling Xue",
  title =        "A Conflict-free Scheduler for High-performance Graph
                 Processing on Multi-pipeline {FPGAs}",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "14:1--14:26",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3390523",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3390523",
  abstract =     "FPGA-based graph processing accelerators are nowadays
                 equipped with multiple pipelines for hardware
                 acceleration of graph computations. However, their
                 multi-pipeline efficiency can suffer greatly from the
                 considerable overheads caused by the read/write
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tino:2020:SXE,
  author =       "Anita Tino and Caroline Collange and Andr{\'e}
                 Seznec",
  title =        "{SIMT-X}: Extending Single-Instruction Multi-Threading
                 to Out-of-Order Cores",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "15:1--15:23",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3392032",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3392032",
  abstract =     "This work introduces Single Instruction Multi-Thread
                 Express (SIMT-X), a general-purpose Central Processing
                 Unit (CPU) microarchitecture that enables Graphics
                 Processing Units (GPUs)-style SIMT execution across
                 multiple threads of the same program for high
                 throughput, while retaining the latency benefits of
                 out-of-order execution, and the programming convenience
                 of homogeneous multi-thread processors. SIMT-X
                 leverages the existing Single Instruction Multiple Data
                 (SIMD) back-end to provide CPU/GPU-like processing on a
                 single core with minimal overhead. We demonstrate that
                 although SIMT-X invokes a restricted form of
                 Out-of-Order (OoO), the microarchitecture successfully
                 captures a majority of the benefits of aggressive OoO
                 execution using at most two concurrent register
                 mappings per architectural register, while addressing
                 issues of partial dependencies and supporting a
                 general-purpose Instruction Set Architecture (ISA).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kaeli:2020:EME,
  author =       "Dave Kaeli",
  title =        "Editorial: a Message from the {Editor-in-Chief}",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "16:1--16:2",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3409369",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3409369",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Rangan:2020:ZEZ,
  author =       "Ram Rangan and Mark W. Stephenson and Aditya Ukarande
                 and Shyam Murthy and Virat Agarwal and Marc
                 Blackstein",
  title =        "{Zeroploit}: Exploiting Zero Valued Operands in
                 Interactive Gaming Applications",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "17:1--17:26",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394284",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3394284",
  abstract =     "In this article, we first characterize register
                 operand value locality in shader programs of modern
                 gaming applications and observe that there is a high
                 likelihood of one of the register operands of several
                 multiply, logical-and, and similar operations
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Adamek:2020:GFC,
  author =       "Karel Ad{\'a}mek and Sofia Dimoudi and Mike Giles and
                 Wesley Armour",
  title =        "{GPU} Fast Convolution via the Overlap-and-Save Method
                 in Shared Memory",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "18:1--18:20",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394116",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3394116",
  abstract =     "We present an implementation of the overlap-and-save
                 method, a method for the convolution of very long
                 signals with short response functions, which is
                 tailored to GPUs. We have implemented several FFT
                 algorithms (using the CUDA programming language),
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Das:2020:FER,
  author =       "Arnab Das and Sriram Krishnamoorthy and Ian Briggs and
                 Ganesh Gopalakrishnan and Ramakrishna Tipireddy",
  title =        "{FPDetect}: Efficient Reasoning About Stencil Programs
                 Using Selective Direct Evaluation",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "19:1--19:27",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3402451",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3402451",
  abstract =     "We present FPDetect, a low-overhead approach for
                 detecting logical errors and soft errors affecting
                 stencil computations without generating false
                 positives. We develop an offline analysis that tightly
                 estimates the number of floating-point bits \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Abdelrahman:2020:CSH,
  author =       "Tarek S. Abdelrahman",
  title =        "Cooperative Software-hardware Acceleration of
                 {$K$}-means on a Tightly Coupled {CPU--FPGA} System",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "20:1--20:24",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3406114",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3406114",
  abstract =     "We consider software-hardware acceleration of K-means
                 clustering on the Intel Xeon+FPGA platform. We design a
                 pipelined accelerator for K-means and combine it with
                 CPU threads to assess performance benefits of (1)
                 acceleration when data are only \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Lee:2020:SBP,
  author =       "Jaekyu Lee and Yasuo Ishii and Dam Sunwoo",
  title =        "Securing Branch Predictors with Two-Level Encryption",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "21:1--21:25",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3404189",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3404189",
  abstract =     "Modern processors rely on various speculative
                 mechanisms to meet performance demand. Branch
                 predictors are one of the most important
                 micro-architecture components to deliver performance.
                 However, they have been under heavy scrutiny because of
                 recent \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Cerina:2020:EDO,
  author =       "L. Cerina and M. D. Santambrogio and G. Franco and C.
                 Gallicchio and A. Micheli",
  title =        "{EchoBay}: Design and Optimization of Echo State
                 Networks under Memory and Time Constraints",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "22:1--22:24",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3404993",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3404993",
  abstract =     "The increase in computational power of embedded
                 devices and the latency demands of novel applications
                 brought a paradigm shift on how and where the
                 computation is performed. Although AI inference is
                 slowly moving from the cloud to end-devices with
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Sioutas:2020:SSH,
  author =       "Savvas Sioutas and Sander Stuijk and Twan Basten and
                 Henk Corporaal and Lou Somers",
  title =        "Schedule Synthesis for {Halide} Pipelines on {GPUs}",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "23:1--23:25",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3406117",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3406117",
  abstract =     "The Halide DSL and compiler have enabled
                 high-performance code generation for image processing
                 pipelines targeting heterogeneous architectures through
                 the separation of algorithmic description and
                 optimization schedule. However, automatic schedule
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Huzaifa:2020:IKR,
  author =       "Muhammad Huzaifa and Johnathan Alsop and Abdulrahman
                 Mahmoud and Giordano Salvador and Matthew D. Sinclair
                 and Sarita V. Adve",
  title =        "Inter-kernel Reuse-aware Thread Block Scheduling",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "24:1--24:27",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3406538",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3406538",
  abstract =     "As GPUs have become more programmable, their
                 performance and energy benefits have made them
                 increasingly popular. However, while GPU compute units
                 continue to improve in performance, on-chip memories
                 lag behind and data accesses are becoming \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jafri:2021:RTC,
  author =       "Syed M. A. H. Jafri and Hasan Hassan and Ahmed Hemani
                 and Onur Mutlu",
  title =        "Refresh Triggered Computation: Improving the Energy
                 Efficiency of Convolutional Neural Network
                 Accelerators",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "2:1--2:29",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3417708",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3417708",
  abstract =     "To employ a Convolutional Neural Network (CNN) in an
                 energy-constrained embedded system, it is critical for
                 the CNN implementation to be highly energy efficient.
                 Many recent studies propose CNN accelerator
                 architectures with custom computation units \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Abera:2021:PET,
  author =       "Solomon Abera and M. Balakrishnan and Anshul Kumar",
  title =        "Performance-Energy Trade-off in Modern {CMPs}",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "3:1--3:26",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3427092",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3427092",
  abstract =     "Chip multiprocessors (CMPs) are ubiquitous in all
                 computing systems ranging from high-end servers to
                 mobile devices. In these systems, energy consumption is
                 a critical design constraint as it constitutes the most
                 significant operating cost for computing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Mehrabi:2021:BOE,
  author =       "Atefeh Mehrabi and Aninda Manocha and Benjamin C. Lee
                 and Daniel J. Sorin",
  title =        "{Bayesian} Optimization for Efficient Accelerator
                 Synthesis",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "4:1--4:25",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3427377",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3427377",
  abstract =     "Accelerator design is expensive due to the effort
                 required to understand an algorithm and optimize the
                 design. Architects have embraced two technologies to
                 reduce costs. High-level synthesis automatically
                 generates hardware from code. Reconfigurable \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kim:2021:IRA,
  author =       "Minsu Kim and Jeong-Keun Park and Soo-Mook Moon",
  title =        "Irregular Register Allocation for Translation of
                 Test-pattern Programs",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "5:1--5:23",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3427378",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3427378",
  abstract =     "Test-pattern programs are for testing DRAM memory
                 chips. They run on a special embedded system called
                 automated test equipment (ATE). Each ATE manufacturer
                 provides its own programming language, which is mostly
                 low level, thus accessing the registers in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Nematollahi:2021:ENN,
  author =       "Negin Nematollahi and Mohammad Sadrosadati and Hajar
                 Falahati and Marzieh Barkhordar and Mario Paulo Drumond
                 and Hamid Sarbazi-Azad and Babak Falsafi",
  title =        "Efficient Nearest-Neighbor Data Sharing in {GPUs}",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "6:1--6:26",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3429981",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3429981",
  abstract =     "Stencil codes (a.k.a. nearest-neighbor computations)
                 are widely used in image processing, machine learning,
                 and scientific applications. Stencil codes incur
                 nearest-neighbor data exchange because the value of
                 each point in the structured grid is \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Braun:2021:SMP,
  author =       "Lorenz Braun and Sotirios Nikas and Chen Song and
                 Vincent Heuveline and Holger Fr{\"o}ning",
  title =        "A Simple Model for Portable and Fast Prediction of
                 Execution Time and Power Consumption of {GPU} Kernels",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "7:1--7:25",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3431731",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3431731",
  abstract =     "Characterizing compute kernel execution behavior on
                 GPUs for efficient task scheduling is a non-trivial
                 task. We address this with a simple model enabling
                 portable and fast predictions among different GPUs
                 using only hardware-independent features. This
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Mettler:2021:DHM,
  author =       "Marcel Mettler and Daniel Mueller-Gritschneder and Ulf
                 Schlichtmann",
  title =        "A Distributed Hardware Monitoring System for Runtime
                 Verification on Multi-Tile {MPSoCs}",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "8:1--8:25",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3430699",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3430699",
  abstract =     "Exhaustive verification techniques do not scale with
                 the complexity of today's multi-tile Multi-processor
                 Systems-on-chip (MPSoCs). Hence, runtime verification
                 (RV) has emerged as a complementary method, which
                 verifies the correct behavior of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2021:EPO,
  author =       "Yu Emma Wang and Carole-Jean Wu and Xiaodong Wang and
                 Kim Hazelwood and David Brooks",
  title =        "Exploiting Parallelism Opportunities with Deep
                 Learning Frameworks",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "9:1--9:23",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3431388",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3431388",
  abstract =     "State-of-the-art machine learning frameworks support a
                 wide variety of design features to enable a flexible
                 machine learning programming interface and to ease the
                 programmability burden on machine learning developers.
                 Identifying and using a performance-optimal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tavarageri:2021:PPO,
  author =       "Sanket Tavarageri and Alexander Heinecke and Sasikanth
                 Avancha and Bharat Kaul and Gagandeep Goyal and
                 Ramakrishna Upadrasta",
  title =        "{PolyDL}: Polyhedral Optimizations for Creation of
                 High-performance {DL} Primitives",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "11:1--11:27",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3433103",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3433103",
  abstract =     "Deep Neural Networks (DNNs) have revolutionized many
                 aspects of our lives. The use of DNNs is becoming
                 ubiquitous, including in software for image
                 recognition, speech recognition, speech synthesis,
                 language translation, to name a few. The training of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Yadalam:2021:SXS,
  author =       "Sujay Yadalam and Vinod Ganapathy and Arkaprava Basu",
  title =        "{SGXL}: Security and Performance for Enclaves Using
                 Large Pages",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "12:1--12:25",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3433983",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3433983",
  abstract =     "Intel's SGX architecture offers clients of public
                 cloud computing platforms the ability to create
                 hardware-protected enclaves whose contents are
                 protected from privileged system software. However, SGX
                 relies on system software for enclave memory
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kalaitzidis:2021:LVE,
  author =       "Kleovoulos Kalaitzidis and Andr{\'e} Seznec",
  title =        "Leveraging Value Equality Prediction for Value
                 Speculation",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "13:1--13:20",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3436821",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3436821",
  abstract =     "Value Prediction (VP) has recently been gaining
                 interest in the research community, since prior work
                 has established practical solutions for its
                 implementation that provide meaningful performance
                 gains. A constant challenge of contemporary
                 context-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Singh:2021:SSM,
  author =       "Abhishek Singh and Shail Dave and Pantea Zardoshti and
                 Robert Brotzman and Chao Zhang and Xiaochen Guo and
                 Aviral Shrivastava and Gang Tan and Michael Spear",
  title =        "{SPX64}: a Scratchpad Memory for General-purpose
                 Microprocessors",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "14:1--14:26",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3436730",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3436730",
  abstract =     "General-purpose computing systems employ memory
                 hierarchies to provide the appearance of a single
                 large, fast, coherent memory. In special-purpose CPUs,
                 programmers manually manage distinct, non-coherent
                 scratchpad memories. In this article, we combine
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Labini:2021:APM,
  author =       "Paolo {Sylos Labini} and Marco Cianfriglia and
                 Damiano Perri and Osvaldo Gervasi and Grigori Fursin
                 and Anton Lokhmotov and Cedric Nugteren and Bruno
                 Carpentieri and Fabiana Zollo and Flavio Vella",
  title =        "On the Anatomy of Predictive Models for Accelerating
                 {GPU} Convolution Kernels and Beyond",
  journal =      j-TACO,
  volume =       "18",
  number =       "1",
  pages =        "16:1--16:24",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434402",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 16 06:46:44 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434402",
  abstract =     "Efficient HPC libraries often expose multiple tunable
                 parameters, algorithmic implementations, or a
                 combination of them, to provide optimized routines. The
                 optimal parameters and algorithmic choices may depend
                 on input properties such as the shapes of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Voss:2021:PRS,
  author =       "Nils Voss and Bastiaan Kwaadgras and Oskar Mencer and
                 Wayne Luk and Georgi Gaydadjiev",
  title =        "On Predictable Reconfigurable System Design",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "17:1--17:28",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3436995",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3436995",
  abstract =     "We propose a design methodology to facilitate rigorous
                 development of complex applications targeting
                 reconfigurable hardware. Our methodology relies on
                 analytical estimation of system performance and area
                 utilisation for a given specific application and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kaushik:2021:GHP,
  author =       "Anirudh Mohan Kaushik and Gennady Pekhimenko and Hiren
                 Patel",
  title =        "{Gretch}: a Hardware Prefetcher for Graph Analytics",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "18:1--18:25",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3439803",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3439803",
  abstract =     "Data-dependent memory accesses (DDAs) pose an
                 important challenge for high-performance graph
                 analytics (GA). This is because such memory accesses do
                 not exhibit enough temporal and spatial locality
                 resulting in low cache performance. Prior efforts that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ho:2021:GFD,
  author =       "Nhut-Minh Ho and Himeshi {De Silva} and Weng-Fai
                 Wong",
  title =        "{GRAM}: a Framework for Dynamically Mixing Precisions
                 in {GPU} Applications",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "19:1--19:24",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3441830",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3441830",
  abstract =     "This article presents GRAM (GPU-based Runtime Adaption
                 for Mixed-precision), a framework for the effective use
                 of mixed precision arithmetic for CUDA programs. Our
                 method provides a fine-grain tradeoff between output
                 error and performance. It can create many variants that
                 satisfy different accuracy requirements by assigning
                 different groups of threads to different precision
                 levels adaptively at runtime. To widen the range of
                 applications that can benefit from its approximation,
                 GRAM comes with an optional half-precision approximate
                 math library. Using GRAM, we can trade off precision
                 for any performance improvement of up to 540\%,
                 depending on the application and accuracy
                 requirement.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Biswas:2021:CSI,
  author =       "Arnab Kumar Biswas",
  title =        "Cryptographic Software {IP} Protection without
                 Compromising Performance or Timing Side-channel
                 Leakage",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "20:1--20:20",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3443707",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3443707",
  abstract =     "Program obfuscation is a widely used cryptographic
                 software intellectual property (IP) protection
                 technique against reverse engineering attacks in
                 embedded systems. However, very few works have studied
                 the impact of combining various obfuscation techniques
                 on the obscurity (difficulty of reverse engineering)
                 and performance (execution time) of obfuscated
                 programs. In this article, we propose a Genetic
                 Algorithm (GA)-based framework that not only optimizes
                 obscurity and performance of obfuscated cryptographic
                 programs, but it also ensures very low timing
                 side-channel leakage. Our proposed Timing Side Channel
                 Sensitive Program Obfuscation Optimization Framework
                 (TSC-SPOOF) determines the combination of obfuscation
                 transformation functions that produce optimized
                 obfuscated programs with preferred optimization
                 parameters. In particular, TSC-SPOOF employs normalized
                 compression distance (NCD) and channel capacity to
                 measure obscurity and timing side-channel leakage,
                 respectively. We also use RISC-V rocket core running on
                 a Xilinx Zynq FPGA device as part of our framework to
                 obtain realistic results. The experimental results
                 clearly show that our proposed solution leads to
                 cryptographic programs with lower execution time,
                 higher obscurity, and lower timing side-channel leakage
                 than unguided obfuscation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{France-Pillois:2021:NIT,
  author =       "Maxime France-Pillois and J{\'e}r{\^o}me Martin and
                 Fr{\'e}d{\'e}ric Rousseau",
  title =        "A Non-Intrusive Tool Chain to Optimize {MPSoC}
                 End-to-End Systems",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "21:1--21:22",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3445030",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3445030",
  abstract =     "Multi-core systems are now found in many electronic
                 devices. But does current software design fully
                 leverage their capabilities? The complexity of the
                 hardware and software stacks in these platforms
                 requires software optimization with end-to-end
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2021:GTU,
  author =       "Pengyu Wang and Jing Wang and Chao Li and Jianzong
                 Wang and Haojin Zhu and Minyi Guo",
  title =        "{Grus}: Toward Unified-memory-efficient
                 High-performance Graph Processing on {GPU}",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "22:1--22:25",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3444844",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3444844",
  abstract =     "Today's GPU graph processing frameworks face
                 scalability and efficiency issues as the graph size
                 exceeds GPU-dedicated memory limit. Although recent
                 GPUs can over-subscribe memory with Unified Memory
                 (UM), they incur significant overhead when handling
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Izadpanah:2021:PPT,
  author =       "Ramin Izadpanah and Christina Peterson and Yan Solihin
                 and Damian Dechev",
  title =        "{PETRA}: Persistent Transactional Non-blocking Linked
                 Data Structures",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "23:1--23:26",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446391",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446391",
  abstract =     "Emerging byte-addressable Non-Volatile Memories (NVMs)
                 enable persistent memory where process state can be
                 recovered after crashes. To enable applications to rely
                 on persistent data, durable data structures with
                 failure-atomic operations have been \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Hassan:2021:RCM,
  author =       "Muhammad Hassan and Chang Hyun Park and David
                 Black-Schaffer",
  title =        "A Reusable Characterization of the Memory System
                 Behavior of {SPEC2017} and {SPEC2006}",
  journal =      j-TACO,
  volume =       "18",
  number =       "2",
  pages =        "24:1--24:20",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446200",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Mar 20 17:25:10 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446200",
  abstract =     "The SPEC CPU Benchmarks are used extensively for
                 evaluating and comparing improvements to computer
                 systems. This ubiquity makes characterization critical
                 for researchers to understand the bottlenecks the
                 benchmarks do and do not expose and where new designs
                 should and should not be expected to show impact.
                 However, in characterization there is a tradeoff
                 between accuracy and reusability: The more precisely we
                 characterize a benchmark's performance on a given
                 system, the less usable it is across different
                 micro-architectures and varying memory configurations.
                 For SPEC, most existing characterizations include
                 system-specific effects (e.g., via performance
                 counters) and/or only look at aggregate behavior (e.g.,
                 averages over the full application execution). While
                 such approaches simplify characterization, they make it
                 difficult to separate the applications' intrinsic
                 behavior from the system-specific effects and/or lose
                 the diverse phase-based behaviors.\par

                 In this work we focus on characterizing the
                 applications' intrinsic memory behaviour by isolating
                 them from micro-architectural configuration specifics.
                 We do this by providing a simplified generic system
                 model that evaluates the applications' memory behavior
                 across multiple cache sizes, with and without
                 prefetching, and over time. The resulting
                 characterization can be reused across a range of
                 systems to understand application behavior and allow us
                 to see how frequently different behaviors occur. We use
                 this approach to compare the SPEC 2006 and 2017 suites,
                 providing insight into their memory system behaviour
                 beyond previous system-specific and/or aggregate
                 results. We demonstrate the ability to use this
                 characterization in different contexts by showing a
                 portion of the SPEC 2017 benchmark suite that could
                 benefit from giga-scale caches, despite aggregate
                 results indicating otherwise.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tiwari:2021:PCP,
  author =       "Sugandha Tiwari and Neel Gala and Chester Rebeiro and
                 V. Kamakoti",
  title =        "{PERI}: a Configurable Posit Enabled {RISC-V} Core",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "25:1--25:26",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446210",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446210",
  abstract =     "Owing to the failure of Dennard's scaling, the past
                 decade has seen a steep growth of prominent new
                 paradigms leveraging opportunities in computer
                 architecture. Two technologies of interest are Posit
                 and RISC-V. Posit was introduced in mid-2017 as a
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Charitopoulos:2021:MDC,
  author =       "George Charitopoulos and Dionisios N. Pnevmatikatos
                 and Georgi Gaydadjiev",
  title =        "{MC-DeF}: Creating Customized {CGRAs} for Dataflow
                 Applications",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "26:1--26:25",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447970",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447970",
  abstract =     "Executing complex scientific applications on
                 Coarse-Grain Reconfigurable Arrays (CGRAs) promises
                 improvements in execution time and/or energy
                 consumption compared to optimized software
                 implementations or even fully customized hardware
                 solutions. Typical \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Borbon:2021:APB,
  author =       "Jose M. Rodriguez Borbon and Junjie Huang and Bryan M.
                 Wong and Walid Najjar",
  title =        "Acceleration of Parallel-Blocked {$ Q R $}
                 Decomposition of Tall-and-Skinny Matrices on {FPGAs}",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "27:1--27:25",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447775",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447775",
  abstract =     "$ Q R $ decomposition is one of the most useful
                 factorization kernels in modern numerical linear
                 algebra algorithms. In particular, the decomposition of
                 tall-and-skinny matrices (TSMs) has major applications
                 in areas including scientific computing, machine
                 learning, image processing, wireless networks, and
                 numerical methods. Traditionally, CPUs and GPUs have
                 achieved better throughput on these applications by
                 using large cache hierarchies and compute cores running
                 at a high frequency, leading to high power consumption.
                 With the advent of heterogeneous platforms, however,
                 FPGAs are emerging as a promising viable alternative.
                 In this work, we propose a high-throughput FPGA-based
                 engine that has a very high computational efficiency
                 (ratio of achieved to peak throughput) compared to
                 similar $ Q R $ solvers running on FPGAs. Although
                 comparable $ Q R $ solvers achieve an efficiency of
                 36\%, our design exhibits an efficiency of 54\%. For
                 TSMs, our experimental results show that our design can
                 outperform highly optimized $ Q R $ solvers running on
                 CPUs and GPUs. For TSMs with more than 50K rows, our
                 design outperforms the Intel MKL solver running on an
                 Intel quad-core processor by a factor of $ 1.5 \times
                 $. For TSMs containing 256 columns or less, our design
                 outperforms the NVIDIA CUBLAS solver running on a K40
                 GPU by a factor of $ 3.0 \times $. In addition to being
                 fast, our design is energy efficient---competing
                 platforms execute up to 0.6 GFLOPS/Joule, whereas our
                 design executes more than 1.0 GFLOPS/Joule.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Stokes:2021:DMR,
  author =       "Michael Stokes and David Whalley and Soner Onder",
  title =        "Decreasing the Miss Rate and Eliminating the
                 Performance Penalty of a Data Filter Cache",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "28:1--28:22",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3449043",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3449043",
  abstract =     "While data filter caches (DFCs) have been shown to be
                 effective at reducing data access energy, they have not
                 been adopted in processors due to the associated
                 performance penalty caused by high DFC miss rates. In
                 this article, we present a design that \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Akram:2021:PEI,
  author =       "Shoaib Akram",
  title =        "Performance Evaluation of {Intel Optane} Memory for
                 Managed Workloads",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "29:1--29:26",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3451342",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3451342",
  abstract =     "Intel Optane memory offers non-volatility, byte
                 addressability, and high capacity. It suits managed
                 workloads that prefer large main memory heaps. We
                 investigate Optane as the main memory for managed
                 (Java) workloads, focusing on performance scalability.
                 As the workload (core count) increases, we note
                 Optane's performance relative to DRAM. A few workloads
                 incur a slight slowdown on Optane memory, which helps
                 conserve limited DRAM capacity. Unfortunately, other
                 workloads scale poorly beyond a few core
                 counts.\par

                 This article investigates scaling bottlenecks for Java
                 workloads on Optane memory, analyzing the application,
                 runtime, and microarchitectural interactions. Poorly
                 scaling workloads allocate objects rapidly and access
                 objects in Optane memory frequently. These
                 characteristics slow down the mutator and substantially
                 slow down garbage collection (GC). At the
                 microarchitecture level, load, store, and instruction
                 miss penalties rise. To regain performance, we
                 partition heaps across DRAM and Optane memory, a hybrid
                 that scales considerably better than Optane alone. We
                 exploit state-of-the-art GC approaches to partition
                 heaps. Unfortunately, existing GC approaches needlessly
                 waste DRAM capacity because they ignore runtime
                 behavior.\par

                 This article also introduces performance impact-guided
                 memory allocation (PIMA) for hybrid memories. PIMA
                 maximizes Optane utilization, allocating in DRAM only
                 if it improves performance. It estimates the
                 performance impact of allocating heaps in either memory
                 type by sampling. We target PIMA at graph analytics
                 workloads, offering a novel performance estimation
                 method and detailed evaluation. PIMA identifies
                 workload phases that benefit from DRAM with high
                 (94.33\%) accuracy, incurring only a 2\% sampling
                 overhead. PIMA operates stand-alone or combines with
                 prior approaches to offer new performance versus DRAM
                 capacity trade-offs. This work opens up Optane memory
                 to a real-life role as the main memory for Java
                 workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Lu:2021:GAG,
  author =       "Yashuai L{\"u} and Hui Guo and Libo Huang and Qi Yu
                 and Li Shen and Nong Xiao and Zhiying Wang",
  title =        "{GraphPEG}: Accelerating Graph Processing on {GPUs}",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "30:1--30:24",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3450440",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3450440",
  abstract =     "Due to massive thread-level parallelism, GPUs have
                 become an attractive platform for accelerating
                 large-scale data parallel computations, such as graph
                 processing. However, achieving high performance for
                 graph processing with GPUs is non-trivial. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Omar:2021:PSH,
  author =       "Hamza Omar and Omer Khan",
  title =        "{PRISM}: Strong Hardware Isolation-based Soft-Error
                 Resilient Multicore Architecture with High Performance
                 and Availability at Low Hardware Overheads",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "31:1--31:25",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3450523",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3450523",
  abstract =     "Multicores increasingly deploy safety-critical
                 parallel applications that demand resiliency against
                 soft-errors to satisfy the safety standards. However,
                 protection against these errors is challenging due to
                 complex communication and data access \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tripathy:2021:PLG,
  author =       "Devashree Tripathy and Amirali Abdolrashidi and Laxmi
                 Narayan Bhuyan and Liang Zhou and Daniel Wong",
  title =        "{PAVER}: Locality Graph-Based Thread Block Scheduling
                 for {GPUs}",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "32:1--32:26",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3451164",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3451164",
  abstract =     "The massive parallelism present in GPUs comes at the
                 cost of reduced L1 and L2 cache sizes per thread,
                 leading to serious cache contention problems such as
                 thrashing. Hence, the data access locality of an
                 application should be considered during thread
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Heirman:2021:ASE,
  author =       "Wim Heirman and Stijn Eyerman and Kristof {Du Bois}
                 and Ibrahim Hur",
  title =        "Automatic Sublining for Efficient Sparse Memory
                 Accesses",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "33:1--33:23",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3452141",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3452141",
  abstract =     "Sparse memory accesses, which are scattered accesses
                 to single elements of a large data structure, are a
                 challenge for current processor architectures. Their
                 lack of spatial and temporal locality and their
                 irregularity makes caches and traditional \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Cavus:2021:FKV,
  author =       "Mustafa Cavus and Mohammed Shatnawi and Resit Sendag
                 and Augustus K. Uht",
  title =        "Fast Key-Value Lookups with Node Tracker",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "34:1--34:26",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3452099",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3452099",
  abstract =     "Lookup operations for in-memory databases are heavily
                 memory bound, because they often rely on
                 pointer-chasing linked data structure traversals. They
                 also have many branches that are hard-to-predict due to
                 random key lookups. In this study, we show that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Song:2021:CRE,
  author =       "Weijia Song and Christina Delimitrou and Zhiming Shen
                 and Robbert {Van Renesse} and Hakim Weatherspoon and
                 Lotfi Benmohamed and Frederic {De Vaulx} and Charif
                 Mahmoudi",
  title =        "{CacheInspector}: Reverse Engineering Cache Resources
                 in Public Clouds",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "35:1--35:25",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3457373",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3457373",
  abstract =     "Infrastructure-as-a-Service cloud providers sell
                 virtual machines that are only specified in terms of
                 number of CPU cores, amount of memory, and I/O
                 throughput. Performance-critical aspects such as cache
                 sizes and memory latency are missing or reported in
                 ways that make them hard to compare across cloud
                 providers. It is difficult for users to adapt their
                 application's behavior to the available resources. In
                 this work, we aim to increase the visibility that cloud
                 users have into shared resources on public clouds.
                 Specifically, we present CacheInspector, a lightweight
                 runtime that determines the performance and allocated
                 capacity of shared caches on multi-tenant public
                 clouds. We validate CacheInspector's accuracy in a
                 controlled environment, and use it to study the
                 characteristics and variability of cache resources in
                 the cloud, across time, instances, availability
                 regions, and cloud providers. We show that
                 CacheInspector's output allows cloud users to tailor
                 their application's behavior, including their output
                 quality, to avoid suboptimal performance when resources
                 are scarce.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{RodriguesCarvalho:2021:UCC,
  author          = {Daniel {Rodrigues Carvalho} and Andr{\'e} Seznec},
  title           = {Understanding Cache Compression},
  journal         = j-TACO,
  volume          = {18},
  number          = {3},
  pages           = {36:1--36:27},
  month           = jun,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3457207},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jun 29 08:21:11 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3457207},
  abstract        = {Hardware cache compression derives from
                     software-compression research; yet, its implementation
                     is not a straightforward translation, since it must
                     abide by multiple restrictions to comply with area,
                     power, and latency constraints. This study sheds light
                     on the challenges of adopting compression in cache
                     design ---from the shrinking of the data until its
                     physical placement. The goal of this article is not to
                     summarize proposals but to put in evidence the
                     solutions they employ to handle those challenges. An
                     in-depth description of the main characteristics of
                     multiple methods is provided, as well as criteria that
                     can be used as a basis for the assessment of such
                     schemes. It is expected that this article will ease the
                     understanding of decisions to be taken for the design
                     of compressed systems and provide directions for future
                     work.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Thuerck:2021:FRA,
  author          = {Daniel Thuerck and Nicolas Weber and Roberto Bifulco},
  title           = {{Flynn}'s Reconciliation: Automating the Register
                     Cache Idiom for Cross-accelerator Programming},
  journal         = j-TACO,
  volume          = {18},
  number          = {3},
  pages           = {37:1--37:26},
  month           = jun,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3458357},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jun 29 08:21:11 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3458357},
  abstract        = {A large portion of the recent performance increase in
                     the High Performance Computing (HPC) and Machine
                     Learning (ML) domains is fueled by accelerator cards.
                     Many popular ML frameworks support accelerators by
                     organizing computations as a computational \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Carvalho:2021:KRN,
  author          = {Jo{\~a}o P. L. {De Carvalho} and Braedy Kuzma and Ivan
                     Korostelev and Jos{\'e} Nelson Amaral and Christopher
                     Barton and Jos{\'e} Moreira and Guido Araujo},
  title           = {{KernelFaRer}: Replacing Native-Code Idioms with
                     High-Performance Library Calls},
  journal         = j-TACO,
  volume          = {18},
  number          = {3},
  pages           = {38:1--38:22},
  month           = jun,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3459010},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Tue Jun 29 08:21:11 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3459010},
  abstract        = {Well-crafted libraries deliver much higher performance
                     than code generated by sophisticated application
                     programmers using advanced optimizing compilers. When a
                     code pattern for which a well-tuned library
                     implementation exists is found in the source code
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Alves:2021:EAP,
  author =       "Ricardo Alves and Stefanos Kaxiras and David
                 Black-Schaffer",
  title =        "Early Address Prediction: Efficient Pipeline Prefetch
                 and Reuse",
  journal =      j-TACO,
  volume =       "18",
  number =       "3",
  pages =        "39:1--39:22",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458883",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jun 29 08:21:11 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458883",
  abstract =     "Achieving low load-to-use latency with low energy and
                 storage overheads is critical for performance. Existing
                 techniques either prefetch into the pipeline (via
                 address prediction and validation) or provide data
                 reuse in the pipeline (via register sharing or L0
                 caches). These techniques provide a range of tradeoffs
                 between latency, reuse, and overhead.\par

                 In this work, we present a pipeline prefetching
                 technique that achieves state-of-the-art performance
                 and data reuse without additional data storage, data
                 movement, or validation overheads by adding address
                 tags to the register file. Our addition of register
                 file tags allows us to forward (reuse) load data from
                 the register file with no additional data movement,
                 keep the data alive in the register file beyond the
                 instruction's lifetime to increase temporal reuse, and
                 coalesce prefetch requests to achieve spatial reuse.
                 Further, we show that we can use the existing memory
                 order violation detection hardware to validate
                 prefetches and data forwards without additional
                 overhead.\par

                 Our design achieves the performance of existing
                 pipeline prefetching while also forwarding 32\% of the
                 loads from the register file (compared to 15\% in
                 state-of-the-art register sharing), delivering a 16\%
                 reduction in L1 dynamic energy (1.6\% total processor
                 energy), with an area overhead of less than 0.5\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Goswami:2021:TES,
  author          = {Kaustav Goswami and Dip Sankar Banerjee and Shirshendu
                     Das},
  title           = {Towards Enhanced System Efficiency while Mitigating
                     Row Hammer},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {40:1--40:26},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3458749},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3458749},
  abstract        = {In recent years, DRAM-based main memories have become
                     susceptible to the Row Hammer (RH) problem, which
                     causes bits to flip in a row without accessing them
                     directly. Frequent activation of a row, called an
                     aggressor row, causes its adjacent rows' (victim) bits
                     to flip. The state-of-the-art solution is to refresh
                     the victim rows explicitly to prevent bit flipping.
                     There have been several proposals made to detect RH
                     attacks. These include both probabilistic as well as
                     deterministic counter-based methods. The technique of
                     handling RH attacks, however, remains the same. In this
                     work, we propose an efficient technique for handling
                     the RH problem. We show that the mechanism is agnostic
                     of the detection mechanism. Our RH handling technique
                     omits the necessity of refreshing the victim rows.
                     Instead, we use a small non-volatile Spin-Transfer
                     Torque Magnetic Random Access Memory (STTRAM) that
                     ensures no unnecessary refreshes of the victim rows on
                     the DRAM device and thus allowing more time for normal
                     applications in the same DRAM device. Our model relies
                     on the migration of the aggressor rows. This accounts
                     for removing blocking of the DRAM operations due to the
                     refreshing of victim rows incurred in the previous
                     solution. After extensive evaluation, we found that,
                     compared to the conventional RH mitigation techniques,
                     our model minimizes the blocking time of the memory
                     that is imposed due to explicit refreshing by an
                     average of 80.72\% in the worst-case scenario and
                     provides energy savings of about 15.82\% on average,
                     across different types of RH-based workloads. A lookup
                     table is necessary to pinpoint the location of a
                     particular row, which, when combined with the STTMRAM,
                     limits the storage overhead to 0.39\% of a 2 GB DRAM.
                     Our proposed model prevents repeated refreshing of the
                     same victim rows in different refreshing windows on the
                     DRAM device and leads to an efficient RH handling
                     technique.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Proficz:2021:AGA,
  author          = {Jerzy Proficz},
  title           = {All-gather Algorithms Resilient to Imbalanced Process
                     Arrival Patterns},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {41:1--41:22},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3460122},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3460122},
  abstract        = {Two novel algorithms for the all-gather operation
                     resilient to imbalanced process arrival patterns (PATs)
                     are presented. The first one, Background Disseminated
                     Ring (BDR), is based on the regular parallel ring
                     algorithm often supplied in MPI implementations and
                     exploits an auxiliary background thread for early data
                     exchange from faster processes to accelerate the
                     performed all-gather operation. The other algorithm,
                     Background Sorted Linear synchronized tree with
                     Broadcast (BSLB), is built upon the already existing
                     PAP-aware gather algorithm, that is, Background Sorted
                     Linear Synchronized tree (BSLS), followed by a regular
                     broadcast distributing gathered data to all
                     participating processes. The background of the
                     imbalanced PAP subject is described, along with the PAP
                     monitoring and evaluation topics. An experimental
                     evaluation of the algorithms based on a proposed
                     mini-benchmark is presented. The mini-benchmark was
                     performed over 2,000 times in a typical HPC cluster
                     architecture with homogeneous compute nodes. The
                     obtained results are analyzed according to different
                     PATs, data sizes, and process numbers, showing that the
                     proposed optimization works well for various
                     configurations, is scalable, and can significantly
                     reduce the all-gather elapsed times, in our case, up to
                     factor 1.9 or 47\% in comparison with the best
                     state-of-the-art solution.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {41},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Xu:2021:CMD,
  author          = {Rui Xu and Sheng Ma and Yaohua Wang and Xinhai Chen
                     and Yang Guo},
  title           = {Configurable Multi-directional Systolic Array
                     Architecture for Convolutional Neural Networks},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {42:1--42:24},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3460776},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3460776},
  abstract        = {The systolic array architecture is one of the most
                     popular choices for convolutional neural network
                     hardware accelerators. The biggest advantage of the
                     systolic array architecture is its simple and efficient
                     design principle. Without complicated control
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {42},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Seo:2021:SAI,
  author =       "Wonik Seo and Sanghoon Cha and Yeonjae Kim and Jaehyuk
                 Huh and Jongse Park",
  title =        "{SLO}-Aware Inference Scheduler for Heterogeneous
                 Processors in Edge Platforms",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "43:1--43:26",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460352",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460352",
  abstract =     "With the proliferation of applications with machine
                 learning (ML), the importance of edge platforms has
                 been growing to process streaming sensor data locally
                 without resorting to remote servers. Such edge
                 platforms are commonly equipped with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Qureshi:2021:GXM,
  author          = {Yasir Mahmood Qureshi and William Andrew Simon and
                     Marina Zapater and Katzalin Olcoz and David Atienza},
  title           = {{Gem5-X}: a Many-core Heterogeneous Simulation
                     Platform for Architectural Exploration and
                     Optimization},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {44:1--44:27},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3461662},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3461662},
  abstract        = {The increasing adoption of smart systems in our daily
                     life has led to the development of new applications
                     with varying performance and energy constraints, and
                     suitable computing architectures need to be developed
                     for these new applications. In this \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Jung:2021:PPB,
  author          = {Tina Jung and Fabian Ritter and Sebastian Hack},
  title           = {{PICO}: a {Presburger} In-bounds Check Optimization
                     for Compiler-based Memory Safety Instrumentations},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {45:1--45:27},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3460434},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3460434},
  abstract        = {Memory safety violations such as buffer overflows are
                     a threat to security to this day. A common solution to
                     ensure memory safety for C is code instrumentation.
                     However, this often causes high execution-time overhead
                     and is therefore rarely used in \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {45},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Sha:2021:LIA,
  author          = {Zhibing Sha and Jun Li and Lihao Song and Jiewen Tang
                     and Min Huang and Zhigang Cai and Lianju Qian and
                     Jianwei Liao and Zhiming Liu},
  title           = {Low {I/O} Intensity-aware Partial {GC} Scheduling to
                     Reduce Long-tail Latency in {SSDs}},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {46:1--46:25},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3460433},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3460433},
  abstract        = {This article proposes a low I/O intensity-aware
                     scheduling scheme on garbage collection (GC) in SSDs
                     for minimizing the I/O long-tail latency to ensure I/O
                     responsiveness. The basic idea is to assemble partial
                     GC operations by referring to several \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {46},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Alam:2021:LPL,
  author =       "Syed Asad Alam and James Garland and David Gregg",
  title =        "Low-precision Logarithmic Number Systems: Beyond
                 Base-2",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "47:1--47:25",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3461699",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3461699",
  abstract =     "Logarithmic number systems (LNS) are used to represent
                 real numbers in many applications using a constant base
                 raised to a fixed-point exponent making its
                 distribution exponential. This greatly simplifies
                 hardware multiply, divide, and square root. LNS with
                 base-2 is most common, but in this article, we show
                 that for low-precision LNS the choice of base has a
                 significant impact.\par

                 We make four main contributions. First, LNS is not
                 closed under addition and subtraction, so the result is
                 approximate. We show that choosing a suitable base can
                 manipulate the distribution to reduce the average
                 error. Second, we show that low-precision LNS addition
                 and subtraction can be implemented efficiently in logic
                 rather than commonly used ROM lookup tables, the
                 complexity of which can be reduced by an appropriate
                 choice of base. A similar effect is shown where the
                 result of arithmetic has greater precision than the
                 input. Third, where input data from external sources is
                 not expected to be in LNS, we can reduce the conversion
                 error by selecting a LNS base to match the expected
                 distribution of the input. Thus, there is no one base
                 that gives the global optimum, and base selection is a
                 trade-off between different factors. Fourth, we show
                 that circuits realized in LNS require lower area and
                 power consumption for short word lengths.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Walden:2021:MIN,
  author          = {Candace Walden and Devesh Singh and Meenatchi
                     Jagasivamani and Shang Li and Luyi Kang and Mehdi
                     Asnaashari and Sylvain Dubois and Bruce Jacob and
                     Donald Yeung},
  title           = {Monolithically Integrating Non-Volatile Main Memory
                     over the Last-Level Cache},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {48:1--48:26},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3462632},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3462632},
  abstract        = {Many emerging non-volatile memories are compatible
                     with CMOS logic, potentially enabling their integration
                     into a CPU's die. This article investigates such
                     monolithically integrated CPU-main memory chips. We
                     exploit non-volatile memories employing 3D \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {48},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Tomei:2021:BSC,
  author          = {Matthew Tomei and Shomit Das and Mohammad Seyedzadeh
                     and Philip Bedoukian and Bradford Beckmann and Rakesh
                     Kumar and David Wood},
  title           = {Byte-Select Compression},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {49:1--49:27},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3462209},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3462209},
  abstract        = {Cache-block compression is a highly effective
                     technique for both reducing accesses to lower levels in
                     the memory hierarchy (cache compression) and minimizing
                     data transfers (link compression). While many effective
                     cache-block compression algorithms have been proposed,
                     the design of these algorithms is largely ad hoc and
                     manual and relies on human recognition of patterns. In
                     this article, we take an entirely different approach.
                     We introduce a class of ``byte-select'' compression
                     algorithms, as well as an automated methodology for
                     generating compression algorithms in this class. We
                     argue that, based on upper bounds within the class, the
                     study of this class of byte-select algorithms has
                     potential to yield algorithms with better performance
                     than existing cache-block compression algorithms. The
                     upper bound we establish on the compression ratio is 2X
                     that of any existing algorithm. We then offer a
                     generalized representation of a subset of byte-select
                     compression algorithms and search through the resulting
                     space guided by a set of training data traces. Using
                     this automated process, we find efficient and effective
                     algorithms for various hardware applications. We find
                     that the resulting algorithms exploit novel patterns
                     that can inform future algorithm designs. The generated
                     byte-select algorithms are evaluated against a separate
                     set of traces and evaluations show that Byte-Select has
                     a 23\% higher compression ratio on average. While no
                     previous algorithm performs best for all our data sets
                     which include CPU and GPU applications, our generated
                     algorithms do. Using an automated hardware generator
                     for these algorithms, we show that their decompression
                     and compression latency is one and two cycles
                     respectively, much lower than any existing algorithm
                     with a competitive compression ratio.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {49},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Li:2021:CHC,
  author          = {Cunlu Li and Dezun Dong and Shazhou Yang and Xiangke
                     Liao and Guangyu Sun and Yongheng Liu},
  title           = {{CIB-HIER}: Centralized Input Buffer Design in
                     Hierarchical High-radix Routers},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {50:1--50:21},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3468062},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3468062},
  abstract        = {Hierarchical organization is widely used in high-radix
                     routers to enable efficient scaling to higher switch
                     port count. A general-purpose hierarchical router must
                     be symmetrically designed with the same input buffer
                     depth, resulting in a large amount of \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {50},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Gysi:2021:DSM,
  author          = {Tobias Gysi and Christoph M{\"u}ller and Oleksandr
                     Zinenko and Stephan Herhut and Eddie Davis and Tobias
                     Wicky and Oliver Fuhrer and Torsten Hoefler and Tobias
                     Grosser},
  title           = {Domain-Specific Multi-Level {IR} Rewriting for {GPU}:
                     The {Open Earth} Compiler for {GPU}-accelerated Climate
                     Simulation},
  journal         = j-TACO,
  volume          = {18},
  number          = {4},
  pages           = {51:1--51:23},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3469030},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 4 07:14:07 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3469030},
  abstract        = {Most compilers have a single core intermediate
                     representation (IR) (e.g., LLVM) sometimes complemented
                     with vaguely defined IR-like data structures. This IR
                     is commonly low-level and close to machine
                     instructions. As a result, optimizations relying on
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {51},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Zou:2021:SLE,
  author =       "An Zou and Huifeng Zhu and Jingwen Leng and Xin He and
                 Vijay Janapa Reddi and Christopher D. Gill and Xuan
                 Zhang",
  title =        "System-level Early-stage Modeling and Evaluation of
                 {IVR}-assisted Processor Power Delivery System",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "52:1--52:27",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3468145",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468145",
  abstract =     "Despite being employed in numerous efforts to improve
                 power delivery efficiency, the integrated voltage
                 regulator (IVR) approach has yet to be evaluated
                 rigorously and quantitatively in a full power delivery
                 system (PDS) setting. To fulfill this need, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Manocha:2021:GOD,
  author =       "Aninda Manocha and Tyler Sorensen and Esin Tureci and
                 Opeoluwa Matthews and Juan L. Arag{\'o}n and Margaret
                 Martonosi",
  title =        "{GraphAttack}: Optimizing Data Supply for Graph
                 Applications on In-Order Multicore Architectures",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "53:1--53:26",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3469846",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3469846",
  abstract =     "Graph structures are a natural representation of
                 important and pervasive data. While graph applications
                 have significant parallelism, their characteristic
                 pointer indirect loads to neighbor data hinder
                 scalability to large datasets on multicore systems.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Benz:2021:SAP,
  author =       "Joscha Benz and Oliver Bringmann",
  title =        "Scenario-Aware Program Specialization for Timing
                 Predictability",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "54:1--54:26",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473333",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473333",
  abstract =     "The successful application of static program analysis
                 strongly depends on flow facts of a program such as
                 loop bounds, control-flow constraints, and operating
                 modes. This problem heavily affects the design of
                 real-time systems, since static program \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Chakraborty:2021:WGC,
  author =       "Shounak Chakraborty and Magnus Sj{\"a}lander",
  title =        "{WaFFLe}: Gated Cache-Ways with Per-Core Fine-Grained
                 {DVFS} for Reduced On-Chip Temperature and Leakage
                 Consumption",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "55:1--55:25",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471908",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471908",
  abstract =     "Managing thermal imbalance in contemporary chip
                 multi-processors (CMPs) is crucial in assuring
                 functional correctness of modern mobile as well as
                 server systems. Localized regions with high activity,
                 e.g., register files, ALUs, FPUs, and so on, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Srikanth:2021:SIC,
  author =       "Sriseshan Srikanth and Anirudh Jain and Thomas M.
                 Conte and Erik P. DeBenedictis and Jeanine Cook",
  title =        "{SortCache}: Intelligent Cache Management for
                 Accelerating Sparse Data Workloads",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "56:1--56:24",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473332",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473332",
  abstract =     "Sparse data applications have irregular access
                 patterns that stymie modern memory architectures.
                 Although hyper-sparse workloads have received
                 considerable attention in the past, moderately-sparse
                 workloads prevalent in machine learning applications,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Metzger:2021:DHT,
  author =       "Paul Metzger and Volker Seeker and Christian Fensch
                 and Murray Cole",
  title =        "Device Hopping: Transparent Mid-Kernel Runtime
                 Switching for Heterogeneous Systems",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "57:1--57:25",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471909",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471909",
  abstract =     "Existing OS techniques for homogeneous many-core
                 systems make it simple for single and multithreaded
                 applications to migrate between cores. Heterogeneous
                 systems do not benefit so fully from this flexibility,
                 and applications that cannot migrate in mid-
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Zhang:2021:LED,
  author =       "Yu Zhang and Da Peng and Xiaofei Liao and Hai Jin and
                 Haikun Liu and Lin Gu and Bingsheng He",
  title =        "{LargeGraph}: an Efficient Dependency-Aware
                 {GPU}-Accelerated Large-Scale Graph Processing",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "58:1--58:24",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477603",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477603",
  abstract =     "Many out-of-GPU-memory systems are recently designed
                 to support iterative processing of large-scale graphs.
                 However, these systems still suffer from long time to
                 converge because of inefficient propagation of active
                 vertices' new states along graph \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Cilasun:2021:SNN,
  author =       "H{\"u}srev Cilasun and Salonik Resch and Zamshed I.
                 Chowdhury and Erin Olson and Masoud Zabihi and
                 Zhengyang Zhao and Thomas Peterson and Keshab K. Parhi
                 and Jian-Ping Wang and Sachin S. Sapatnekar and Ulya R.
                 Karpuzcu",
  title =        "Spiking Neural Networks in Spintronic Computational
                 {RAM}",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "59:1--59:21",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3475963",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3475963",
  abstract =     "Spiking Neural Networks (SNNs) represent a
                 biologically inspired computation model capable of
                 emulating neural computation in human brain and
                 brain-like structures. The main promise is very low
                 energy consumption. Classic Von Neumann architecture
                 based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ukarande:2022:LAC,
  author =       "Aditya Ukarande and Suryakant Patidar and Ram Rangan",
  title =        "Locality-Aware {CTA} Scheduling for Gaming
                 Applications",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "1:1--1:26",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477497",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477497",
  abstract =     "The compute work rasterizer or the GigaThread Engine
                 of a modern NVIDIA GPU focuses on maximizing compute
                 work occupancy across all streaming multiprocessors in
                 a GPU while retaining design simplicity. In this
                 article, we identify the operational aspects \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Liu:2022:ICO,
  author =       "Hongzhi Liu and Jie Luo and Ying Li and Zhonghai Wu",
  title =        "Iterative Compilation Optimization Based on Metric
                 Learning and Collaborative Filtering",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "2:1--2:25",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3480250",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3480250",
  abstract =     "Pass selection and phase ordering are two critical
                 compiler auto-tuning problems. Traditional heuristic
                 methods cannot effectively address these NP-hard
                 problems especially given the increasing number of
                 compiler passes and diverse hardware architectures.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Sasongko:2022:RFY,
  author =       "Muhammad Aditya Sasongko and Milind Chabbi and Mandana
                 Bagheri Marzijarani and Didem Unat",
  title =        "{ReuseTracker}: Fast Yet Accurate Multicore Reuse
                 Distance Analyzer",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "3:1--3:25",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3484199",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3484199",
  abstract =     "One widely used metric that measures data locality is
                 reuse distance---the number of unique memory locations
                 that are accessed between two consecutive accesses to a
                 particular memory location. State-of-the-art techniques
                 that measure reuse distance in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Fu:2022:GDS,
  author =       "Yaosheng Fu and Evgeny Bolotin and Niladrish
                 Chatterjee and David Nellans and Stephen W. Keckler",
  title =        "{GPU} Domain Specialization via Composable On-Package
                 Architecture",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "4:1--4:23",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3484505",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3484505",
  abstract =     "As GPUs scale their low-precision matrix math
                 throughput to boost deep learning (DL) performance,
                 they upset the balance between math throughput and
                 memory system capabilities. We demonstrate that a
                 converged GPU design trying to address diverging
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Lee:2022:SBC,
  author =       "Daeyeal Lee and Bill Lin and Chung-Kuan Cheng",
  title =        "{SMT}-Based Contention-Free Task Mapping and
                 Scheduling on {$2$D\slash $3$D SMART NoC} with Mixed
                 Dimension-Order Routing",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "5:1--5:21",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3487018",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3487018",
  abstract =     "SMART NoCs achieve ultra-low latency by enabling
                 single-cycle multiple-hop transmission via bypass
                 channels. However, contention along bypass channels can
                 seriously degrade the performance of SMART NoCs by
                 breaking the bypass paths. Therefore, contention-
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Chatarasi:2022:MDC,
  author =       "Prasanth Chatarasi and Hyoukjun Kwon and Angshuman
                 Parashar and Michael Pellauer and Tushar Krishna and
                 Vivek Sarkar",
  title =        "{Marvel}: a Data-Centric Approach for Mapping Deep
                 Learning Operators on Spatial Accelerators",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "6:1--6:26",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485137",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485137",
  abstract =     "A spatial accelerator's efficiency depends heavily on
                 both its mapper and cost models to generate optimized
                 mappings for various operators of DNN models. However,
                 existing cost models lack a formal boundary over their
                 input programs (operators) for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Rieber:2022:JPL,
  author =       "Dennis Rieber and Axel Acosta and Holger Fr{\"o}ning",
  title =        "Joint Program and Layout Transformations to Enable
                 Convolutional Operators on Specialized Hardware Based
                 on Constraint Programming",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "7:1--7:26",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3487922",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3487922",
  abstract =     "The success of Deep Artificial Neural Networks (DNNs)
                 in many domains created a rich body of research
                 concerned with hardware accelerators for
                 compute-intensive DNN operators. However, implementing
                 such operators efficiently with complex hardware
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Lei:2022:SEW,
  author =       "Mengya Lei and Fan Li and Fang Wang and Dan Feng and
                 Xiaomin Zou and Renzhi Xiao",
  title =        "{SecNVM}: an Efficient and Write-Friendly Metadata
                 Crash Consistency Scheme for Secure {NVM}",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "8:1--8:26",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3488724",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3488724",
  abstract =     "Data security is an indispensable part of non-volatile
                 memory (NVM) systems. However, implementing data
                 security efficiently on NVM is challenging, since we
                 have to guarantee the consistency of user data and the
                 related security metadata. Existing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Di:2022:TPM,
  author =       "Bang Di and Daokun Hu and Zhen Xie and Jianhua Sun and
                 Hao Chen and Jinkui Ren and Dong Li",
  title =        "{TLB}-pilot: Mitigating {TLB} Contention Attack on
                 {GPUs} with Microarchitecture-Aware Scheduling",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "9:1--9:23",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491218",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491218",
  abstract =     "Co-running GPU kernels on a single GPU can provide
                 high system throughput and improve hardware
                 utilization, but this raises concerns on application
                 security. We reveal that translation lookaside buffer
                 (TLB) attack, one of the common attacks on CPU, can
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Saileshwar:2022:HLC,
  author =       "Gururaj Saileshwar and Rick Boivie and Tong Chen and
                 Benjamin Segal and Alper Buyuktosunoglu",
  title =        "{HeapCheck}: Low-cost Hardware Support for Memory
                 Safety",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "10:1--10:24",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3495152",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3495152",
  abstract =     "Programs written in C/C++ are vulnerable to
                 memory-safety errors like buffer-overflows and
                 use-after-free. While several mechanisms to detect such
                 errors have been previously proposed, they suffer from
                 a variety of drawbacks, including poor performance,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Azhar:2022:TRR,
  author =       "M. Waqar Azhar and Miquel Peric{\`a}s and Per
                 Stenstr{\"o}m",
  title =        "{Task-RM}: a Resource Manager for Energy Reduction in
                 Task-Parallel Applications under Quality of Service
                 Constraints",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "11:1--11:26",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3494537",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494537",
  abstract =     "Improving energy efficiency is an important goal of
                 computer system design. This article focuses on a
                 general model of task-parallel applications under
                 quality-of-service requirements on the completion time.
                 Our technique, called Task-RM, exploits the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Gomes:2022:CCA,
  author =       "Cesar Gomes and Maziar Amiraski and Mark Hempstead",
  title =        "{CASHT}: Contention Analysis in Shared Hierarchies
                 with Thefts",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "12:1--12:27",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3494538",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494538",
  abstract =     "Cache management policies should consider workloads'
                 contention behavior when managing a shared cache. Prior
                 art makes estimates about shared cache behavior by
                 adding extra logic or time to isolate per workload
                 cache statistics. These approaches provide \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2022:OSS,
  author =       "Yufei Wang and Xiaoshe Dong and Longxiang Wang and
                 Weiduo Chen and Xingjun Zhang",
  title =        "Optimizing Small-Sample Disk Fault Detection Based on
                 {LSTM-GAN} Model",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "13:1--13:24",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3500917",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3500917",
  abstract =     "In recent years, researches on disk fault detection
                 based on SMART data combined with different machine
                 learning algorithms have been proven to be effective.
                 However, these methods require a large amount of data.
                 In the early stages of the establishment \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Silfa:2022:BEE,
  author =       "Franyell Silfa and Jose Maria Arnau and Antonio
                 Gonz{\'a}lez",
  title =        "{E-BATCH}: Energy-Efficient and High-Throughput {RNN}
                 Batching",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "14:1--14:23",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3499757",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3499757",
  abstract =     "Recurrent Neural Network (RNN) inference exhibits low
                 hardware utilization due to the strict data
                 dependencies across time-steps. Batching multiple
                 requests can increase throughput. However, RNN batching
                 requires a large amount of padding since the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ding:2022:CCA,
  author =       "Chen Ding and Dong Chen and Fangzhou Liu and Benjamin
                 Reber and Wesley Smith",
  title =        "{CARL}: Compiler Assigned Reference Leasing",
  journal =      j-TACO,
  volume =       "19",
  number =       "1",
  pages =        "15:1--15:28",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3498730",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 18 06:51:06 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3498730",
  abstract =     "Data movement is a common performance bottleneck, and
                 its chief remedy is caching. Traditional cache
                 management is transparent to the workload: data that
                 should be kept in cache are determined by the recency
                 information only, while the program information,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Schlaak:2022:MAF,
  author = {Christof Schlaak and Tzung-Han Juang and Christophe Dubach},
  title = {Memory-Aware Functional {IR} for Higher-Level Synthesis of
    Accelerators},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {16:1--16:26},
  articleno = {16},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3501768},
  url = {https://dl.acm.org/doi/10.1145/3501768},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Specialized accelerators deliver orders of a magnitude of higher
    performance than general-purpose processors. The ever-changing nature of
    modern workloads is pushing the adoption of Field Programmable Gate Arrays
    (FPGAs) as the substrate of choice. \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Lakshminarasimhan:2022:FSC,
  author = {Kartik Lakshminarasimhan and Ajeya Naithani and Josu{\'e} Feliu
    and Lieven Eeckhout},
  title = {The Forward Slice Core: a High-Performance, Yet Low-Complexity
    Microarchitecture},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {17:1--17:25},
  articleno = {17},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3499424},
  url = {https://dl.acm.org/doi/10.1145/3499424},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Superscalar out-of-order cores deliver high performance at the
    cost of increased complexity and power budget. In-order cores, in
    contrast, are less complex and have a smaller power budget, but offer low
    performance. A processor architecture should ideally \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@Article{Srikanthan:2022:MMA,
  author =       "Sharanyan Srikanthan and Sayak Chakraborti and
                 Princeton Ferro and Sandhya Dwarkadas",
  title =        "{MAPPER}: Managing Application Performance via
                 Parallel Efficiency Regulation",
  journal =      j-TACO,
  volume =       "19",
  number =       "2",
  pages =        "18:1--18:26",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501767",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 25 07:03:00 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501767",
  abstract =     "State-of-the-art systems, whether in servers or
                 desktops, provide ample computational and storage
                 resources to allow multiple simultaneously executing
                 potentially parallel applications. However, performance
                 tends to be unpredictable, being a function of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Athanasios:2022:LPN,
  author =       "Athanasios Tziouvaras and Georgios Dimitriou and
                 Georgios Stamoulis",
  title =        "Low-power Near-data Instruction Execution Leveraging
                 Opcode-based Timing Analysis",
  journal =      j-TACO,
  volume =       "19",
  number =       "2",
  pages =        "19:1--19:26",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3504005",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 25 07:03:00 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3504005",
  abstract =     "Traditional processor architectures utilize an
                 external DRAM for data storage, while they also operate
                 under worst-case timing constraints. Such designs are
                 heavily constrained by the delay costs of the data
                 transfer between the core pipeline and the DRAM,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Jia:2022:GND,
  author = {Xingguo Jia and Jin Zhang and Boshi Yu and Xingyue Qian and
    Zhengwei Qi and Haibing Guan},
  title = {{GiantVM}: a Novel Distributed Hypervisor for Resource Aggregation
    with {DSM-aware} Optimizations},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {20:1--20:27},
  articleno = {20},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3505251},
  url = {https://dl.acm.org/doi/10.1145/3505251},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {We present GiantVM, an open-source distributed hypervisor that
    provides the many-to-one virtualization to aggregate resources from
    multiple physical machines. We propose techniques to enable distributed
    CPU and I/O virtualization and distributed shared \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Nejat:2022:CSM,
  author = {Mehrzad Nejat and Madhavan Manivannan and Miquel Peric{\`a}s and
    Per Stenstr{\"o}m},
  title = {Cooperative Slack Management: Saving Energy of Multicore Processors
    by Trading Performance Slack Between {QoS}-Constrained Applications},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {21:1--21:27},
  articleno = {21},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3505559},
  url = {https://dl.acm.org/doi/10.1145/3505559},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Processor resources can be adapted at runtime according to the
    dynamic behavior of applications to reduce the energy consumption of
    multicore processors without affecting the Quality-of-Service (QoS). To
    achieve this, an online resource management scheme \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@Article{Pompougnac:2022:WSR,
  author =       "Hugo Pompougnac and Ulysse Beaugnon and Albert Cohen
                 and Dumitru {Potop Butucaru}",
  title =        "Weaving Synchronous Reactions into the Fabric of
                 {SSA}-form Compilers",
  journal =      j-TACO,
  volume =       "19",
  number =       "2",
  pages =        "22:1--22:25",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3506706",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 25 07:03:00 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3506706",
  abstract =     "We investigate the programming of reactive systems
                 combining closed-loop control with
                 performance-intensive components such as Machine
                 Learning (ML). Reactive control systems are often
                 safety-critical and associated with real-time execution
                 requirements, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Shobaki:2022:RPA,
  author = {Ghassan Shobaki and Vahl Scott Gordon and Paul McHugh and
    Theodore Dubois and Austin Kerbow},
  title = {Register-Pressure-Aware Instruction Scheduling Using Ant Colony
    Optimization},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {23:1--23:23},
  articleno = {23},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3505558},
  url = {https://dl.acm.org/doi/10.1145/3505558},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {This paper describes a new approach to register-pressure-aware
    instruction scheduling, using Ant Colony Optimization (ACO). ACO is a
    nature-inspired optimization technique that researchers have successfully
    applied to NP-hard sequencing problems like the \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Wang:2022:MOG,
  author = {Qihan Wang and Zhen Peng and Bin Ren and Jie Chen and Robert G.
    Edwards},
  title = {{MemHC}: an Optimized {GPU} Memory Management Framework for
    Accelerating Many-body Correlation},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {24:1--24:26},
  articleno = {24},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3506705},
  url = {https://dl.acm.org/doi/10.1145/3506705},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {The many-body correlation function is a fundamental computation
    kernel in modern physics computing applications, e.g., Hadron Contractions
    in Lattice quantum chromodynamics (QCD). This kernel is both computation
    and memory intensive, involving a series of \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Kumar:2022:DAS,
  author = {Rakesh Kumar and Mehdi Alipour and David Black-Schaffer},
  title = {Dependence-aware Slice Execution to Boost {MLP} in
    Slice-out-of-order Cores},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {25:1--25:28},
  articleno = {25},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3506704},
  url = {https://dl.acm.org/doi/10.1145/3506704},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Exploiting memory-level parallelism (MLP) is crucial to hide
    long memory and last-level cache access latencies. While out-of-order
    (OoO) cores, and techniques building on them, are effective at exploiting
    MLP, they deliver poor energy efficiency due to \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Vijaykumar:2022:MPO,
  author = {Nandita Vijaykumar and Ataberk Olgun and Konstantinos
    Kanellopoulos and F. Nisa Bostanci and Hasan Hassan and Mehrshad Lotfi
    and Phillip B. Gibbons and Onur Mutlu},
  title = {\pkg{MetaSys}: a Practical Open-source Metadata Management System
    to Implement and Evaluate Cross-layer Optimizations},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {26:1--26:29},
  articleno = {26},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3505250},
  url = {https://dl.acm.org/doi/10.1145/3505250},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {This article introduces the first open-source FPGA-based
    infrastructure, MetaSys, with a prototype in a RISC-V system, to enable
    the rapid implementation and evaluation of a wide range of cross-layer
    techniques in real hardware. Hardware-software \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/gnu.bib;
    https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@Article{Chen:2022:EEE,
  author =       "Jing Chen and Madhavan Manivannan and Mustafa
                 Abduljabbar and Miquel Peric{\`a}s",
  title =        "\pkg{ERASE}: Energy Efficient Task Mapping and
                 Resource Management for Work Stealing Runtimes",
  journal =      j-TACO,
  volume =       "19",
  number =       "2",
  pages =        "27:1--27:29",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3510422",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 25 07:03:00 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3510422",
  abstract =     "Parallel applications often rely on work stealing
                 schedulers in combination with fine-grained tasking to
                 achieve high performance and scalability. However,
                 reducing the total energy consumption in the context of
                 work stealing runtimes is still challenging,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Ye:2022:PAU,
  author = {Chencheng Ye and Yuanchao Xu and Xipeng Shen and Hai Jin and
    Xiaofei Liao and Yan Solihin},
  title = {Preserving Addressability Upon {GC}-Triggered Data Movements on
    Non-Volatile Memory},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {28:1--28:26},
  articleno = {28},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3511706},
  url = {https://dl.acm.org/doi/10.1145/3511706},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {This article points out an important threat that
    application-level Garbage Collection (GC) creates to the use of
    non-volatile memory (NVM). Data movements incurred by GC may invalidate
    the pointers to objects on NVM and, hence, harm the reusability of
    \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Michelogiannakis:2022:CIR,
  author = {George Michelogiannakis and Benjamin Klenk and Brandon Cook and
    Min Yee Teh and Madeleine Glick and Larry Dennison and Keren Bergman and
    John Shalf},
  title = {A Case For Intra-rack Resource Disaggregation in {HPC}},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {29:1--29:26},
  articleno = {29},
  month = jun,
  year = {2022},
  doi = {https://doi.org/10.1145/3514245},
  url = {https://dl.acm.org/doi/10.1145/3514245},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {The expected halt of traditional technology scaling is
    motivating increased heterogeneity in high-performance computing (HPC)
    systems with the emergence of numerous specialized accelerators. As
    heterogeneity increases, so does the risk of underutilizing \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Wang:2022:SMS,
  author = {Ping Wang and Fei Wen and Paul V. Gratz and Alex Sprintson},
  title = {{SIMD-Matcher}: a {SIMD}-based Arbitrary Matching Framework},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {30:1--30:20},
  articleno = {30},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3514246},
  url = {https://dl.acm.org/doi/10.1145/3514246},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Packet classification methods rely upon matching packet
    content/header against pre-defined rules, which are generated by network
    applications and their configurations. With the rapid development of
    network technology and the fast-growing network \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Mettler:2022:FBA,
  author = {Marcel Mettler and Martin Rapp and Heba Khdr and Daniel
    Mueller-Gritschneder and J{\"o}rg Henkel and Ulf Schlichtmann},
  title = {An {FPGA}-based Approach to Evaluate Thermal and Resource
    Management Strategies of Many-core Processors},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {31:1--31:24},
  articleno = {31},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3516825},
  url = {https://dl.acm.org/doi/10.1145/3516825},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {The continuous technology scaling of integrated circuits results
    in increasingly higher power densities and operating temperatures. Hence,
    modern many-core processors require sophisticated thermal and resource
    management strategies to mitigate these \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Mpeis:2022:OIC,
  author = {Paschalis Mpeis and Pavlos Petoumenos and Kim Hazelwood and Hugh
    Leather},
  title = {Object Intersection Captures on Interactive Apps to Drive a
    Crowd-sourced Replay-based Compiler Optimization},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {32:1--32:25},
  articleno = {32},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3517338},
  url = {https://dl.acm.org/doi/10.1145/3517338},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Traditional offline optimization frameworks rely on
    representative hardware, software, and inputs to compare different
    optimizations on. With application-specific optimization for mobile
    systems though, the idea of a representative testbench is \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Li:2022:MRM,
  author = {Cunlu Li and Dezun Dong and Xiangke Liao},
  title = {{MUA-Router}: Maximizing the Utility-of-Allocation for On-chip
    Pipelining Routers},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {33:1--33:23},
  articleno = {33},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3519027},
  url = {https://dl.acm.org/doi/10.1145/3519027},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {As an important pipeline stage in the router of
    Network-on-Chips, switch allocation assigns output ports to input ports
    and allows flits to transit through the switch without conflicts.
    Previous work designed efficient switch allocation strategies by
    \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Choudhury:2022:FOC,
  author = {Ziaul Choudhury and Shashwat Shrivastava and Lavanya Ramapantulu
    and Suresh Purini},
  title = {An {FPGA} Overlay for {CNN} Inference with Fine-grained Flexible
    Parallelism},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {34:1--34:26},
  articleno = {34},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3519598},
  url = {https://dl.acm.org/doi/10.1145/3519598},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Increasingly, pre-trained convolutional neural networks (CNNs)
    are being deployed for inference in various computer vision applications,
    both on the server-side in the data centers and at the edge. CNN
    inference is a very compute-intensive task. It is a \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Moolchandani:2022:PPP,
  author = {Diksha Moolchandani and Anshul Kumar and Smruti R. Sarangi},
  title = {Performance and Power Prediction for Concurrent Execution on
    {GPUs}},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {35:1--35:27},
  articleno = {35},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3522712},
  url = {https://dl.acm.org/doi/10.1145/3522712},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {The unprecedented growth of edge computing and 5G has led to an
    increased offloading of mobile applications to cloud servers or edge
    cloudlets. The most prominent workloads comprise computer vision
    applications. Conventional wisdom suggests that computer \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Jahanshahi:2022:PQA,
  author = {Ali Jahanshahi and Nanpeng Yu and Daniel Wong},
  title = {{PowerMorph}: {QoS}-Aware Server Power Reshaping for Data Center
    Regulation Service},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {36:1--36:27},
  articleno = {36},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3524129},
  url = {https://dl.acm.org/doi/10.1145/3524129},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Adoption of renewable energy in power grids introduces stability
    challenges in regulating the operation frequency of the electricity grid.
    Thus, electrical grid operators call for provisioning of frequency
    regulation services from end-user customers, such \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Xu:2022:BFE,
  author = {Peng Xu and Nannan Zhao and Jiguang Wan and Wei Liu and Shuning
    Chen and Yuanhui Zhou and Hadeel Albahar and Hanyang Liu and Liu Tang and
    Zhihu Tan},
  title = {Building a Fast and Efficient {LSM}-tree Store by Integrating Local
    Storage with Cloud Storage},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {37:1--37:26},
  articleno = {37},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3527452},
  url = {https://dl.acm.org/doi/10.1145/3527452},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {The explosive growth of modern web-scale applications has made
    cost-effectiveness a primary design goal for their underlying databases.
    As a backbone of modern databases, LSM-tree based key-value stores (LSM
    store) face limited storage options. They are \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Huang:2022:AVC,
  author = {Horng-Ruey Huang and Ding-Yong Hong and Jan-Jan Wu and Kung-Fu
    Chen and Pangfeng Liu and Wei-Chung Hsu},
  title = {Accelerating Video Captioning on Heterogeneous System
    Architectures},
  journal = j-TACO,
  volume = {19},
  number = {3},
  pages = {38:1--38:25},
  articleno = {38},
  month = sep,
  year = {2022},
  doi = {https://doi.org/10.1145/3527609},
  url = {https://dl.acm.org/doi/10.1145/3527609},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  coden = {????},
  abstract = {Video captioning is a core technology to many important
    applications, such as AI-assisted medical diagnosis, video question
    answering, storytelling through videos, and lip-reading. Video captioning
    employs a hybrid CNN + RNN model. Accelerating such a \ldots{}},
  ajournal = {ACM Trans. Archit. Code Optim.},
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {https://dl.acm.org/loi/taco},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@Article{Corbalan-Navarro:2022:TDO,
  author =       "David Corbal{\'a}n-Navarro and Juan L. Arag{\'o}n and
                 Mart{\'\i} Anglada and Joan-Manuel Parcerisa and
                 Antonio Gonz{\'a}lez",
  title =        "Triangle Dropping: an Occluded-geometry Predictor for
                 Energy-efficient Mobile {GPUs}",
  journal =      j-TACO,
  volume =       "19",
  number =       "3",
  pages =        "39:1--39:20",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3527861",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Sep 2 10:07:01 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3527861",
  abstract =     "This article proposes a novel micro-architecture
                 approach for mobile GPUs aimed at early removing the
                 occluded geometry in a scene by leveraging
                 frame-to-frame coherence, thus reducing the overall
                 energy consumption. Mobile GPUs commonly implement a
                 Tile-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kundan:2022:PAP,
  author          = {Shivam Kundan and Theodoros Marinakis and Iraklis
                     Anagnostopoulos and Dimitri Kagaris},
  title           = {A Pressure-Aware Policy for Contention Minimization on
                     Multicore Systems},
  journal         = j-TACO,
  volume          = {19},
  number          = {3},
  pages           = {40:1--40:26},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3524616},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3524616},
  abstract        = {Modern Chip Multiprocessors (CMPs) are integrating an
                     increasing amount of cores to address the continually
                     growing demand for high-application performance. The
                     cores of a CMP share several components of the memory
                     hierarchy, such as Last-Level Cache (LLC). \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Alsop:2022:CFG,
  author          = {Johnathan Alsop and Weon Taek Na and Matthew D.
                     Sinclair and Samuel Grayson and Sarita Adve},
  title           = {A Case for Fine-grain Coherence Specialization in
                     Heterogeneous Systems},
  journal         = j-TACO,
  volume          = {19},
  number          = {3},
  pages           = {41:1--41:26},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3530819},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3530819},
  abstract        = {Hardware specialization is becoming a key enabler of
                     energy-efficient performance. Future systems will be
                     increasingly heterogeneous, integrating multiple
                     specialized and programmable accelerators, each with
                     different memory demands. Traditionally, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {41},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Soltaniyeh:2022:ASC,
  author =       "Mohammadreza Soltaniyeh and Richard P. Martin and
                 Santosh Nagarakatte",
  title =        "An Accelerator for Sparse Convolutional Neural
                 Networks Leveraging Systolic General Matrix--matrix
                 Multiplication",
  journal =      j-TACO,
  volume =       "19",
  number =       "3",
  pages =        "42:1--42:26",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3532863",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Sep 2 10:07:01 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532863",
  abstract =     "This article proposes a novel hardware accelerator for
                 the inference task with sparse convolutional neural
                 networks (CNNs) by building a hardware unit to perform
                 Image to Column (Im2Col) transformation of the input
                 feature map coupled with a systolic-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Dang:2022:LAP,
  author          = {Dharanidhar Dang and Bill Lin and Debashis Sahoo},
  title           = {{LiteCON}: an All-photonic Neuromorphic Accelerator
                     for Energy-efficient Deep Learning},
  journal         = j-TACO,
  volume          = {19},
  number          = {3},
  pages           = {43:1--43:22},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3531226},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3531226},
  abstract        = {Deep learning is highly pervasive in today's
                     data-intensive era. In particular, convolutional neural
                     networks (CNNs) are being widely adopted in a variety
                     of fields for superior accuracy. However, computing
                     deep CNNs on traditional CPUs and GPUs brings
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {43},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Siddhu:2022:CII,
  author          = {Lokesh Siddhu and Rajesh Kedia and Shailja Pandey and
                     Martin Rapp and Anuj Pathania and J{\"o}rg Henkel and
                     Preeti Ranjan Panda},
  title           = {{CoMeT}: an Integrated Interval Thermal Simulation
                     Toolchain for {$2$D}, {2.5D}, and {$3$D}
                     Processor-Memory Systems},
  journal         = j-TACO,
  volume          = {19},
  number          = {3},
  pages           = {44:1--44:25},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3532185},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3532185},
  abstract        = {Processing cores and the accompanying main memory
                     working in tandem enable modern processors. Dissipating
                     heat produced from computation remains a significant
                     problem for processors. Therefore, the thermal
                     management of processors continues to be an \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Olson:2022:OAG,
  author          = {M. Ben Olson and Brandon Kammerdiener and Michael R.
                     Jantz and Kshitij A. Doshi and Terry Jones},
  title           = {Online Application Guidance for Heterogeneous Memory
                     Systems},
  journal         = j-TACO,
  volume          = {19},
  number          = {3},
  pages           = {45:1--45:27},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3533855},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3533855},
  abstract        = {As scaling of conventional memory devices has stalled,
                     many high-end computing systems have begun to
                     incorporate alternative memory technologies to meet
                     performance goals. Since these technologies present
                     distinct advantages and tradeoffs compared to
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {45},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Honorio:2022:UBE,
  author          = {Bruno {Chinelato Honorio} and Jo{\~a}o P. L. {De
                     Carvalho} and Catalina {Munoz Morales} and Alexandro
                     Baldassin and Guido Araujo},
  title           = {Using Barrier Elision to Improve Transactional Code
                     Generation},
  journal         = j-TACO,
  volume          = {19},
  number          = {3},
  pages           = {46:1--46:23},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3533318},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Sep 2 10:07:01 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3533318},
  abstract        = {With chip manufacturers such as Intel, IBM, and ARM
                     offering native support for transactional memory in
                     their instruction set architectures, memory
                     transactions are on the verge of being considered a
                     genuine application tool rather than just an \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {46},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Li:2022:AOM,
  author          = {Jiansong Li and Xueying Wang and Xiaobing Chen and
                     Guangli Li and Xiao Dong and Peng Zhao and Xianzhi Yu
                     and Yongxin Yang and Wei Cao and Lei Liu and Xiaobing
                     Feng},
  title           = {An Application-oblivious Memory Scheduling System for
                     {DNN} Accelerators},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {47:1--47:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3535355},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3535355},
  abstract        = {Deep Neural Networks (DNNs) tend to go deeper and
                     wider, which poses a significant challenge to the
                     training of DNNs, due to the limited memory capacity of
                     DNN \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {47},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Narayan:2022:AOC,
  author          = {Aditya Narayan and Yvain Thonnart and Pascal Vivet and
                     Ayse Coskun and Ajay Joshi},
  title           = {Architecting Optically Controlled Phase Change
                     Memory},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {48:1--48:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3533252},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3533252},
  abstract        = {Phase Change Memory (PCM) is an attractive candidate
                     for main memory, as it offers non-volatility and zero
                     leakage power while providing higher cell densities,
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {48},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Zhang:2022:AAS,
  author          = {Chao Zhang and Maximilian Bremer and Cy Chan and John
                     Shalf and Xiaochen Guo},
  title           = {{ASA}: Accelerating Sparse Accumulation in Column-wise
                     {SpGEMM}},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {49:1--49:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3543068},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3543068},
  abstract        = {Sparse linear algebra is an important kernel in many
                     different applications. Among various sparse general
                     matrix-matrix multiplication (SpGEMM) algorithms,
                     Gustavson's column-wise SpGEMM has good locality when
                     reading input matrix and can be easily parallelized by
                     distributing the computation of different columns of an
                     output matrix to different processors. However, the
                     sparse accumulation (SPA) step in column-wise SpGEMM,
                     which merges partial sums from each of the
                     multiplications by the row indices, is still a
                     performance bottleneck. The state-of-the-art software
                     implementation uses a hash table for partial sum search
                     in the SPA, which makes SPA the largest contributor to
                     the execution time of SpGEMM. There are three reasons
                     that cause the SPA to become the bottleneck: (1) hash
                     probing requires data-dependent branches that are
                     difficult for a branch predictor to predict correctly;
                     (2) the accumulation of partial sum is dependent on the
                     results of the hash probing, which makes it difficult
                     to hide the hash probing latency; and (3) hash
                     collision requires time-consuming linear search and
                     optimizations to reduce these collisions require an
                     accurate estimation of the number of non-zeros in each
                     column of the output matrix.

                     This work proposes ASA architecture to accelerate the
                     SPA. ASA overcomes the challenges of SPA by (1)
                     executing the partial sum search and accumulate with a
                     single instruction through ISA extension to eliminate
                     data-dependent branches in hash probing, (2) using a
                     dedicated on-chip cache to perform the search and
                     accumulation in a pipelined fashion, (3) relying on the
                     parallel search capability of a set-associative cache
                     to reduce search latency, and (4) delaying the merging
                     of overflowed entries. As a result, ASA achieves an
                     average of 2.25$ \times $ and 5.05$ \times $ speedup as
                     compared to the state-of-the-art software
                     implementation of a Markov clustering application and
                     its SpGEMM kernel, respectively. As compared to a
                     state-of-the-art hashing accelerator design, ASA
                     achieves an average of 1.95$ \times $ speedup in the
                     SpGEMM kernel.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {49},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Bik:2022:CSS,
  author          = {Aart Bik and Penporn Koanantakool and Tatiana
                     Shpeisman and Nicolas Vasilache and Bixia Zheng and
                     Fredrik Kjolstad},
  title           = {Compiler Support for Sparse Tensor Computations in
                     {MLIR}},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {50:1--50:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3544559},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3544559},
  abstract        = {Sparse tensors arise in problems in science,
                     engineering, machine learning, and data analytics.
                     Programs that operate on such tensors can exploit
                     sparsity to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {50},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Michaud:2022:HHA,
  author          = {Pierre Michaud and Anis Peysieux},
  title           = {{HAIR}: Halving the Area of the Integer Register File
                     with Odd\slash Even Banking},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {51:1--51:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3544838},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3544838},
  abstract        = {This article proposes a new microarchitectural scheme
                     for reducing the hardware complexity of the integer
                     register file of a superscalar processor. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {51},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Yousefzadeh:2022:EEM,
  author          = {Amirreza Yousefzadeh and Jan Stuijt and Martijn Hijdra
                     and Hsiao-Hsuan Liu and Anteneh Gebregiorgis and
                     Abhairaj Singh and Said Hamdioui and Francky Catthoor},
  title           = {Energy-efficient In-Memory Address Calculation},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {52:1--52:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3546071},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3546071},
  abstract        = {Computation-in-Memory (CIM) is an emerging computing
                     paradigm to address memory bottleneck challenges in
                     computer architecture. A CIM unit cannot \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {52},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{So:2022:EES,
  author          = {Hwisoo So and Moslem Didehban and Yohan Ko and Aviral
                     Shrivastava and Kyoungwoo Lee},
  title           = {{EXPERTISE}: an Effective Software-level Redundant
                     Multithreading Scheme against Hardware Faults},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {53:1--53:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3546073},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3546073},
  abstract        = {Error resilience is the primary design concern for
                     safety- and mission-critical applications. Redundant
                     MultiThreading (RMT) is one of the most promising soft
                     and hard \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {53},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Hartley:2022:JTC,
  author          = {Tim Hartley and Foivos S. Zakkak and Andy Nisbet and
                     Christos Kotselidis and Mikel Luj{\'a}n},
  title           = {Just-In-Time Compilation on {ARM} --- a Closer Look at
                     Call-Site Code Consistency},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {54:1--54:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3546568},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3546568},
  abstract        = {The increase in computational capability of low-power
                     Arm architectures has seen them diversify from their
                     more traditional domain of portable battery powered
                     devices into data center servers, personal computers,
                     and even Supercomputers. Thus, managed languages (Java,
                     Javascript, etc.) that require a managed runtime
                     environment (MRE) need to be ported to the Arm
                     architecture, requiring an understanding of different
                     design tradeoffs.

                     This article studies how the lack of strong hardware
                     support for Self Modifying Code (SMC) in low-power
                     architectures (e.g., absence of cache coherence between
                     instruction cache and data caches), affects
                     Just-In-Time (JIT) compilation and runtime behavior in
                     MREs. Specifically, we focus on the implementation and
                     treatment of call-sites, that must maintain code
                     consistency in the face of concurrent execution and
                     modification to redirect control (patching) by the MRE.
                     The lack of coherence, is compounded with the maximum
                     distance (reach of) a call-site can jump to as the
                     reach is more constrained (smaller distance) in Arm
                     when compared with Intel/AMD. We present four different
                     robust implementations for call-sites and discuss their
                     advantages and disadvantages in the absence of strong
                     hardware support for SMC. Finally, we evaluate each
                     approach using a microbenchmark, further evaluating the
                     best three techniques using three JVM benchmark suites
                     and the open source MaxineVM showcasing performance
                     differences up to 12\%. Based on these observations, we
                     propose extending code-cache partitioning strategies
                     for JIT compiled code to encourage more efficient local
                     branching for architectures with limited direct branch
                     ranges.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {54},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Jellum:2022:SSA,
  author          = {Erling Jellum and Milica Orlandi{\'c} and Edmund
                     Brekke and Tor Johansen and Torleiv Bryne},
  title           = {Solving Sparse Assignment Problems on {FPGAs}},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {55:1--55:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3546072},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3546072},
  abstract        = {The assignment problem is a fundamental optimization
                     problem and a crucial part of many systems. For
                     example, in multiple object tracking, the assignment
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {55},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Li:2022:PEP,
  author          = {Yuhao Li and Benjamin C. Lee},
  title           = {{Phronesis}: Efficient Performance Modeling for
                     High-dimensional Configuration Tuning},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {56:1--56:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3546868},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3546868},
  abstract        = {We present Phronesis, a learning framework for
                     efficiently modeling the performance of data analytic
                     workloads as a function of their high-dimensional
                     software \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {56},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Tirumalasetty:2022:RMP,
  author          = {Chandrahas Tirumalasetty and Chih Chieh Chou and
                     Narasimha Reddy and Paul Gratz and Ayman Abouelwafa},
  title           = {Reducing Minor Page Fault Overheads through Enhanced
                     Page Walker},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {57:1--57:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3547142},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3547142},
  abstract        = {Application virtual memory footprints are growing
                     rapidly in all systems from servers down to
                     smartphones. To address this growing demand, system
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {57},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Gao:2022:ACM,
  author          = {Lan Gao and Jing Wang and Weigong Zhang},
  title           = {Adaptive Contention Management for Fine-Grained
                     Synchronization on Commodity {GPUs}},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {58:1--58:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3547301},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3547301},
  abstract        = {As more emerging applications are moving to GPUs,
                     fine-grained synchronization has become imperative.
                     However, their performance can be severely \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {58},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {https://dl.acm.org/loi/taco},
}

@Article{Han:2022:CEC,
  author =       "Ruobing Han and Jaewon Lee and Jaewoong Sim and
                 Hyesoon Kim",
  title =        "{COX}: Exposing {CUDA} Warp-level Functions to
                 {CPUs}",
  journal =      j-TACO,
  volume =       "19",
  number =       "4",
  pages =        "59:1--59:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3554736",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 8 06:39:05 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3554736",
  abstract =     "As CUDA becomes the de facto programming language
                 among data parallel applications such as
                 high-performance computing or machine learning
                 applications, running CUDA on other platforms becomes a
                 compelling option. Although several efforts have
                 attempted to support CUDA on devices other than NVIDIA
                 GPUs, due to extra steps in the translation, the
                 support is always a few years behind CUDA's latest
                 features. In particular, the new CUDA programming model
                 exposes the warp concept in the programming language,
                 which greatly changes the way the CUDA code should be
                 mapped to CPU programs. In this article, hierarchical
                 collapsing that correctly supports CUDA warp-level
                 functions on CPUs is proposed. To verify hierarchical
                 collapsing, we build a framework, COX, that supports
                 executing CUDA source code on the CPU backend. With
                 hierarchical collapsing, 90\% of kernels in CUDA SDK
                 samples can be executed on CPUs, much higher than
                 previous works (68\%). We also evaluate the performance
                 with benchmarks for real applications and show that
                 hierarchical collapsing can generate CPU programs with
                 comparable or even higher performance than previous
                 projects in general.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Liu:2022:DAS,
  author =       "Yiding Liu and Xingyao Zhang and Donglin Zhuang and
                 Xin Fu and Shuaiwen Song",
  title =        "{DynamAP}: Architectural Support for Dynamic Graph
                 Traversal on the {Automata Processor}",
  journal =      j-TACO,
  volume =       "19",
  number =       "4",
  pages =        "60:1--60:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3556976",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 8 06:39:05 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3556976",
  abstract =     "Dynamic graph traversals (DGTs) currently are widely
                 used in many important application domains, especially
                 in this big-data era that urgently demands \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Zou:2022:PSB,
  author          = {Changwei Zou and Yaoqing Gao and Jingling Xue},
  title           = {Practical Software-Based Shadow Stacks on x86-64},
  journal         = j-TACO,
  volume          = {19},
  number          = {4},
  pages           = {61:1--61:??},
  articleno       = {61},
  month           = dec,
  year            = {2022},
  doi             = {https://doi.org/10.1145/3556977},
  url             = {https://dl.acm.org/doi/10.1145/3556977},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Control-Flow Integrity (CFI) techniques focus often on
                     protecting forward edges and assume that backward edges
                     are protected by shadow stacks. However, software-based
                     shadow stacks that can provide performance, security,
                     and compatibility are still hard to obtain, leaving an
                     important security gap on x86-64. In this article, we
                     introduce a simple, efficient, and effective parallel
                     shadow stack design (based on LLVM), FlashStack, for
                     protecting return addresses in single- and
                     multi-threaded programs running under 64-bit Linux on
                     x86-64, with three distinctive features. First, we
                     introduce a novel dual-prologue approach to enable a
                     protected function to thwart the TOCTTOU attacks, which
                     are constructed by Microsoft's red team and lead to the
                     deprecation of Microsoft's RFG. Second, we design a new
                     mapping mechanism, Segment+Rsp-S, to allow the parallel
                     shadow stack to be accessed efficiently while
                     satisfying the constraints of arch\_prctl() and ASLR in
                     64-bit Linux. Finally, we introduce a lightweight
                     inspection mechanism, SideChannel-K, to harden
                     FlashStack further by detecting entropy-reduction
                     attacks efficiently and protecting the parallel shadow
                     stack effectively with a 10-ms shuffling policy. Our
                     evaluation on SPEC CPU2006, Nginx, and Firefox shows
                     that FlashStack can provide high performance,
                     meaningful security, and reasonable compatibility for
                     server- and client-side programs on x86-64.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Thu Dec 8 06:39:05 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Luinaud:2023:SAD,
  author          = {Thomas Luinaud and J. M. Pierre Langlois and Yvon
                     Savaria},
  title           = {Symbolic Analysis for Data Plane Programs
                     Specialization},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {1:1--1:??},
  articleno       = {1},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3557727},
  url             = {https://dl.acm.org/doi/10.1145/3557727},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Programmable network data planes have extended the
                     capabilities of packet processing in network devices by
                     allowing custom processing pipelines and agnostic
                     packet processing. While a variety of applications can
                     be implemented on current programmable data \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Shah:2023:BSA,
  author          = {Nilesh Rajendra Shah and Ashitabh Misra and Antoine
                     Min{\'e} and Rakesh Venkat and Ramakrishna Upadrasta},
  title           = {{BullsEye}: Scalable and Accurate Approximation
                     Framework for Cache Miss Calculation},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {2:1--2:??},
  articleno       = {2},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3558003},
  url             = {https://dl.acm.org/doi/10.1145/3558003},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {For Affine Control Programs or Static Control Programs
                     (SCoP), symbolic counting of reuse distances could
                     induce polynomials for each reuse pair. These
                     polynomials along with cache capacity constraints lead
                     to non-affine (semi-algebraic) sets; and \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Soni:2023:AC,
  author =       "Mitali Soni and Asmita Pal and Joshua {San Miguel}",
  title =        "{As-Is} Approximate Computing",
  journal =      j-TACO,
  volume =       "20",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3559761",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 17 06:54:21 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3559761",
  abstract =     "Although approximate computing promises better
                 performance for applications allowing marginal errors,
                 dearth of hardware support and lack of run-time
                 accuracy guarantees makes it difficult to adopt. We
                 present As-Is, an Anytime Speculative Interruptible
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Shah:2023:TDS,
  author          = {Parth Shah and Ranjal Gautham Shenoy and Vaidyanathan
                     Srinivasan and Pradip Bose and Alper Buyuktosunoglu},
  title           = {{TokenSmart}: Distributed, Scalable Power Management
                     in the Many-core Era},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {4:1--4:??},
  articleno       = {4},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3559762},
  url             = {https://dl.acm.org/doi/10.1145/3559762},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Centralized power management control systems are
                     hitting a scalability limit. In particular, enforcing a
                     power cap in a many-core system in a
                     performance-friendly manner is quite challenging.
                     Today's on-chip controller reduces the clock speed of
                     compute \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Chen:2023:LFH,
  author =       "Zhangyu Chen and Yu Hua and Luochangqi Ding and Bo
                 Ding and Pengfei Zuo and Xue Liu",
  title =        "Lock-Free High-performance Hashing for Persistent
                 Memory via {PM}-aware Holistic Optimization",
  journal =      j-TACO,
  volume =       "20",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3561651",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 17 06:54:21 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3561651",
  abstract =     "Persistent memory (PM) provides large-scale
                 non-volatile memory (NVM) with DRAM-comparable
                 performance. The non-volatility and other unique
                 characteristics of PM architecture bring new
                 opportunities and challenges for the efficient storage
                 system design. For example, some recent
                 crash-consistent and write-friendly hashing schemes are
                 proposed to provide fast queries for PM systems.
                 However, existing PM hashing indexes suffer from the
                 concurrency bottleneck due to the blocking resizing and
                 expensive lock-based concurrency control for queries.
                 Moreover, the lack of PM awareness and systematical
                 design further increases the query latency. To address
                 the concurrency bottleneck of lock contention in PM
                 hashing, we propose clevel hashing, a lock-free
                 concurrent level hashing scheme that provides
                 non-blocking resizing via background threads and
                 lock-free search/insertion/update/deletion using atomic
                 primitives to enable high concurrency for PM hashing.
                 By exploiting the PM characteristics, we present a
                 holistic approach to building clevel hashing for high
                 throughput and low tail latency via the PM-aware
                 index/allocator co-design. The proposed volatile
                 announcement array with a helping mechanism coordinates
                 lock-free insertions and guarantees a strong
                 consistency model. Our experiments using real-world
                 YCSB workloads on Intel Optane DC PMM show that clevel
                 hashing, respectively, achieves up to $ 5.7 \times $ and
                 $ 1.6 \times $ higher throughput than state-of-the-art
                 P-CLHT and Dash while guaranteeing low tail latency,
                 e.g., $ 1.9 \times $--$ 7.2 \times $ speedup for the p99
                 latency with the insert-only workload.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Mastoras:2023:DIN,
  author          = {Aristeidis Mastoras and Sotiris Anagnostidis and
                     Albert-Jan N. Yzelman},
  title           = {Design and Implementation for Nonblocking Execution in
                     {GraphBLAS}: Tradeoffs and Performance},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {6:1--6:??},
  articleno       = {6},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3561652},
  url             = {https://dl.acm.org/doi/10.1145/3561652},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {GraphBLAS is a recent standard that allows the
                     expression of graph algorithms in the language of
                     linear algebra and enables automatic code
                     parallelization and optimization. GraphBLAS operations
                     are memory bound and may benefit from data locality
                     \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Xu:2023:SSC,
  author          = {Yemao Xu and Dezun Dong and Dongsheng Wang and Shi Xu
                     and Enda Yu and Weixia Xu and Xiangke Liao},
  title           = {{SSD-SGD}: Communication Sparsification for
                     Distributed Deep Learning Training},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {7:1--7:??},
  articleno       = {7},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3563038},
  url             = {https://dl.acm.org/doi/10.1145/3563038},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Intensive communication and synchronization cost for
                     gradients and parameters is the well-known bottleneck
                     of distributed deep learning training. Based on the
                     observations that Synchronous SGD (SSGD) obtains good
                     convergence accuracy while asynchronous \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Olgun:2023:PHE,
  author =       "Ataberk Olgun and G{\'o}mez Luna, Juan and Konstantinos
                 Kanellopoulos and Behzad Salami and Hasan Hassan and
                 Oguz Ergin and Onur Mutlu",
  title =        "{PiDRAM}: a Holistic End-to-end {FPGA}-based Framework
                 for Processing-in-{DRAM}",
  journal =      j-TACO,
  volume =       "20",
  number =       "1",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3563697",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 17 06:54:21 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3563697",
  abstract =     "Commodity DRAM-based processing-using-memory (PuM)
                 techniques that are supported by off-the-shelf DRAM
                 chips present an opportunity for alleviating the data
                 movement bottleneck at low cost. However, system
                 integration of these techniques imposes non-trivial
                 challenges \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Sakalis:2023:DSS,
  author          = {Christos Sakalis and Stefanos Kaxiras and Magnus
                     Sj{\"a}lander},
  title           = {Delay-on-Squash: Stopping Microarchitectural Replay
                     Attacks in Their Tracks},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {9:1--9:??},
  articleno       = {9},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3563695},
  url             = {https://dl.acm.org/doi/10.1145/3563695},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {MicroScope and other similar microarchitectural replay
                     attacks take advantage of the characteristics of
                     speculative execution to trap the execution of the
                     victim application in a loop, enabling the attacker to
                     amplify a side-channel attack by executing it
                     \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Liang:2023:QRC,
  author          = {Yi Liang and Shaokang Zeng and Lei Wang},
  title           = {Quantifying Resource Contention of Co-located
                     Workloads with the System-level Entropy},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {10:1--10:??},
  articleno       = {10},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3563696},
  url             = {https://dl.acm.org/doi/10.1145/3563696},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {The workload co-location, such as deploying offline
                     analysis workloads with online service workloads on the
                     same node, has become common for modern data centers.
                     Workload co-location deployment improves data center
                     resource utilization significantly. \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the citation label below is kept unchanged so that
%%% existing citations still resolve, although the first author's
%%% family name appears to be Hur rather than Suyeon.
@Article{Suyeon:2023:FFF,
  author =       "Suyeon Hur and Seongmin Na and Dongup Kwon and Joonsung
                 Kim and Andrew Boutros and Eriko Nurvitadhi and
                 Jangwoo Kim",
  title =        "A Fast and Flexible {FPGA}-based Accelerator for
                 Natural Language Processing Neural Networks",
  journal =      j-TACO,
  volume =       "20",
  number =       "1",
  pages =        "11:1--11:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3564606",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 17 06:54:21 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3564606",
  abstract =     "Deep neural networks (DNNs) have become key solutions
                 in the natural language processing (NLP) domain.
                 However, the existing accelerators customized for their
                 narrow target models cannot support diverse NLP models.
                 Therefore, naively running complex NLP \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@article{Gondimalla:2023:OOD,
  author          = {Ashish Gondimalla and Jianqiao Liu and Mithuna
                     Thottethodi and T. N. Vijaykumar},
  title           = {{Occam}: Optimal Data Reuse for Convolutional Neural
                     Networks},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {12:1--12:??},
  articleno       = {12},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3566052},
  url             = {https://dl.acm.org/doi/10.1145/3566052},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Convolutional neural networks (CNNs) are emerging as
                     powerful tools for image processing in important
                     commercial applications. We focus on the important
                     problem of improving the latency of image recognition.
                     While CNNs are highly amenable to prefetching
                     \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Peng:2023:FPS,
  author          = {Bo Peng and Yaozu Dong and Jianguo Yao and Fengguang
                     Wu and Haibing Guan},
  title           = {{FlexHM}: a Practical System for Heterogeneous Memory
                     with Flexible and Efficient Performance Optimizations},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {13:1--13:??},
  articleno       = {13},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3565885},
  url             = {https://dl.acm.org/doi/10.1145/3565885},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {With the rapid development of cloud computing,
                     numerous cloud services, containers, and virtual
                     machines have been bringing tremendous demands on
                     high-performance memory resources to modern data
                     centers. Heterogeneous memory, especially the newly
                     released \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhang:2023:RRB,
  author          = {Qiang Zhang and Lei Xu and Baowen Xu},
  title           = {{RegCPython}: a Register-based {Python} Interpreter
                     for Better Performance},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {14:1--14:??},
  articleno       = {14},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3568973},
  url             = {https://dl.acm.org/doi/10.1145/3568973},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {Interpreters are widely used in the implementation of
                     many programming languages, such as Python, Perl, and
                     Java. Even though various JIT compilers emerge in an
                     endless stream, interpretation efficiency still plays a
                     critical role in program performance. Does a
                     stack-based interpreter or a register-based interpreter
                     perform better? The pros and cons of the pair of
                     architectures have long been discussed. The stack
                     architecture is attractive for its concise model and
                     compact bytecode, but our study finds that the
                     register-based interpreter can also be implemented
                     easily and that its bytecode size only grows by a small
                     margin. Moreover, the latter turns out to be
                     appreciably faster. Specifically, we implemented an
                     open source Python interpreter named RegCPython based
                     on CPython v3.10.1. The former is register based, while
                     the latter is stack based. Without changes in syntax,
                     Application Programming Interface, and Application
                     Binary Interface, RegCPython is excellently compatible
                     with CPython, as it does not break existing syntax or
                     interfaces. It achieves a speedup of 1.287 on the most
                     favorable benchmark and 0.977 even on the most
                     unfavorable benchmark. For all Python-intensive
                     benchmarks, the average speedup reaches 1.120 on x86
                     and 1.130 on ARM. Our evaluation work, which also
                     serves as an empirical study, provides a detailed
                     performance survey of both interpreters on modern
                     hardware. It points out that the register-based
                     interpreters are more efficient mainly due to the
                     elimination of machine instructions needed, while
                     changes in branch mispredictions and cache misses have
                     a limited impact on performance. Additionally, it
                     confirms that the register-based implementation is also
                     satisfactory in terms of memory footprint, compilation
                     cost, and implementation complexity.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/python.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Jin:2023:SBS,
  author          = {Hai Jin and Zhuo He and Weizhong Qiang},
  title           = {{SpecTerminator}: Blocking Speculative Side Channels
                     Based on Instruction Classes on {RISC-V}},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {15:1--15:??},
  articleno       = {15},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3566053},
  url             = {https://dl.acm.org/doi/10.1145/3566053},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {In modern processors, speculative execution has
                     significantly improved the performance of processors,
                     but it has also introduced speculative execution
                     vulnerabilities. Recent defenses are based on the
                     delayed execution to block various speculative side
                     channels, but we show that several of the current
                     state-of-the-art defenses fail to block some of the
                     available speculative side channels, and the current
                     most secure defense introduces a performance overhead
                     of up to 24.5\%.\par

                     We propose SpecTerminator, the first defense framework
                     based on instruction classes that can comprehensively
                     and precisely block all existing speculative side
                     channels. In SpecTerminator, a novel speculative side
                     channel classification scheme based on the features of
                     secret transmission is proposed, and the sensitive
                     instructions in the speculative window are classified
                     and identified using optimized hardware taint tracking
                     and instruction masking techniques to accurately
                     determine the scope of leakage. Then, according to the
                     execution characteristics of these instructions,
                     dedicated delayed execution strategies, such as TLB
                     request ignoring, selective issue, and extended
                     delay-on-miss, are designed for each type of sensitive
                     instruction to precisely control that these
                     instructions are delayed only in pipeline stages that
                     are at risk of leakage. In contrast to previous
                     defenses based on the Gem5 simulator, we have
                     innovatively implemented defenses against Spectre
                     attacks based on the open-source instruction set RISC-V
                     on an FPGA-accelerated simulation platform that is more
                     similar to real hardware. To evaluate the security of
                     SpecTerminator, we have replicated various existing
                     x86-based Spectre variants on RISC-V. On SPEC 2006,
                     SpecTerminator defends against Spectre attacks based on
                     memory hierarchy side channels with a performance
                     overhead of 2.6\% and against all existing Spectre
                     attacks with a performance overhead of 6.0\%.},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhao:2023:PSC,
  author          = {Tuowen Zhao and Tobi Popoola and Mary Hall and
                     Catherine Olschanowsky and Michelle Strout},
  title           = {Polyhedral Specification and Code Generation of Sparse
                     Tensor Contraction with Co-iteration},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {16:1--16:??},
  articleno       = {16},
  month           = mar,
  year            = {2023},
  doi             = {https://doi.org/10.1145/3566054},
  url             = {https://dl.acm.org/doi/10.1145/3566054},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  coden           = {????},
  abstract        = {This article presents a code generator for sparse
                     tensor contraction computations. It leverages a
                     mathematical representation of loop nest computations
                     in the sparse polyhedral framework (SPF), which extends
                     the polyhedral model to support non-affine \ldots{}},
  ajournal        = {ACM Trans. Archit. Code Optim.},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% TACO vol. 20, no. 1 (March 2023), article 17.
@article{Schuler:2023:XOT,
  author          = {Manuela Schuler and Richard Membarth and Philipp
                     Slusallek},
  title           = {{XEngine}: Optimal Tensor Rematerialization for Neural
                     Networks in Heterogeneous Environments},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {17:1--17:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3568956},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3568956},
  abstract        = {Memory efficiency is crucial in training deep learning
                     networks on resource-restricted devices. During
                     backpropagation, forward tensors are used to calculate
                     gradients. Despite the option of keeping those
                     dependencies in memory until they are reused in
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 1 (March 2023), article 18.
@article{Korostelev:2023:YCL,
  author          = {Ivan Korostelev and Jo{\~a}o P. L. {De Carvalho} and
                     Jos{\'e} Moreira and Jos{\'e} Nelson Amaral},
  title           = {{YaConv}: Convolution with Low Cache Footprint},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {18:1--18:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3570305},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3570305},
  abstract        = {This article introduces YaConv, a new algorithm to
                     compute convolution using GEMM microkernels from a
                     Basic Linear Algebra Subprograms library that is
                     efficient for multiple CPU architectures. Previous
                     approaches either create a copy of each image element
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 1 (March 2023), article 19.
@article{Eris:2023:PRF,
  author          = {Furkan Eris and Marcia Louis and Kubra Eris and
                     Jos{\'e} Abell{\'a}n and Ajay Joshi},
  title           = {{Puppeteer}: a Random Forest Based Manager for
                     Hardware Prefetchers Across the Memory Hierarchy},
  journal         = j-TACO,
  volume          = {20},
  number          = {1},
  pages           = {19:1--19:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3570304},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Fri Feb 17 06:54:21 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3570304},
  abstract        = {Over the years, processor throughput has steadily
                     increased. However, the memory throughput has not
                     increased at the same rate, which has led to the memory
                     wall problem in turn increasing the gap between
                     effective and theoretical peak processor \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 20.
@article{Tollenaere:2023:ACE,
  author          = {Nicolas Tollenaere and Guillaume Iooss and
                     St{\'e}phane Pouget and Hugo Brunie and Christophe
                     Guillon and Albert Cohen and P. Sadayappan and Fabrice
                     Rastello},
  title           = {Autotuning Convolutions Is Easier Than You Think},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {20:1--20:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3570641},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3570641},
  abstract        = {A wide range of scientific and machine learning
                     applications depend on highly optimized implementations
                     of tensor computations. Exploiting the full capacity of
                     a given processor architecture remains a challenging
                     task, due to the complexity of the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 21.
@article{Perez:2023:UDO,
  author          = {V{\'\i}ctor P{\'e}rez and Lukas Sommer and Victor
                     Lom{\"u}ller and Kumudha Narasimhan and Mehdi Goli},
  title           = {User-driven Online Kernel Fusion for {SYCL}},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {21:1--21:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3571284},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3571284},
  abstract        = {Heterogeneous programming models are becoming
                     increasingly popular to support the ever-evolving
                     hardware architectures, especially for new and emerging
                     specialized accelerators optimizing specific tasks.
                     While such programs provide performance portability
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 22.
@article{Espindola:2023:SMR,
  author          = {Vinicius Espindola and Luciano Zago and Herv{\'e}
                     Yviquel and Guido Araujo},
  title           = {Source Matching and Rewriting for {MLIR} Using
                     String-Based Automata},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {22:1--22:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3571283},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3571283},
  abstract        = {A typical compiler flow relies on a uni-directional
                     sequence of translation/optimization steps that lower
                     the program abstract representation, making it hard to
                     preserve higher-level program information across each
                     transformation step. On the other hand, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 23.
@article{Ma:2023:OFM,
  author          = {Wenjing Ma and Fangfang Liu and Daokun Chen and
                     Qinglin Lu and Yi Hu and Hongsen Wang and Xinhui Yuan},
  title           = {An Optimized Framework for Matrix Factorization on the
                     New {Sunway} Many-core Platform},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {23:1--23:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3571856},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3571856},
  abstract        = {Matrix factorization functions are used in many areas
                     and often play an important role in the overall
                     performance of the applications. In the LAPACK library,
                     matrix factorization functions are implemented with
                     blocked factorization algorithm, shifting \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {23},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 24.
@Article{Singh:2023:HHP,
  author =       "Sarabjeet Singh and Neelam Surana and Kailash Prasad
                 and Pranjali Jain and Joycee Mekie and Manu Awasthi",
  title =        "{HyGain}: High-performance, Energy-efficient Hybrid
                 Gain Cell-based Cache Hierarchy",
  journal =      j-TACO,
  volume =       "20",
  number =       "2",
  pages =        "24:1--24:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572839",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 10 08:08:06 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572839",
  abstract =     "In this article, we propose a ``full-stack'' solution
                 to designing high-capacity and low-latency on-chip
                 cache hierarchies by starting at the circuit level of
                 the hardware design stack. We propose a novel half
                 $V_{DD}$ precharge 2T Gain Cell (GC) design for the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% TACO vol. 20, no. 2 (June 2023), article 25.
@article{Mummidi:2023:AAC,
  author          = {Chandra Sekhar Mummidi and Sandip Kundu},
  title           = {{ACTION}: Adaptive Cache Block Migration in
                     Distributed Cache Architectures},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {25:1--25:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3572911},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3572911},
  abstract        = {Chip multiprocessors (CMP) with more cores have more
                     traffic to the last-level cache (LLC). Without a
                     corresponding increase in LLC bandwidth, such traffic
                     cannot be sustained, resulting in performance
                     degradation. Previous research focused on data
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 26.
@article{Liu:2023:UBC,
  author          = {Qiaoyi Liu and Jeff Setter and Dillon Huff and Maxwell
                     Strange and Kathleen Feng and Mark Horowitz and
                     Priyanka Raina and Fredrik Kjolstad},
  title           = {Unified Buffer: Compiling Image Processing and Machine
                     Learning Applications to Push-Memory Accelerators},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {26:1--26:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3572908},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3572908},
  abstract        = {Image processing and machine learning applications
                     benefit tremendously from hardware acceleration.
                     Existing compilers target either FPGAs, which sacrifice
                     power and performance for programmability, or ASICs,
                     which become obsolete as applications change.
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {26},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 27.
@Article{Yuzuguler:2023:SSA,
  author =       "Ahmet Caner Y{\"u}z{\"u}g{\"u}ler and Canberk
                 S{\"o}nmez and Mario Drumond and Yunho Oh and Babak
                 Falsafi and Pascal Frossard",
  title =        "Scale-out Systolic Arrays",
  journal =      j-TACO,
  volume =       "20",
  number =       "2",
  pages =        "27:1--27:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572917",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 10 08:08:06 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572917",
  abstract =     "Multi-pod systolic arrays are emerging as the
                 architecture of choice in DNN inference accelerators.
                 Despite their potential, designing multi-pod systolic
                 arrays to maximize effective throughput/Watt---i.e.,
                 throughput/Watt adjusted when accounting for array
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% TACO vol. 20, no. 2 (June 2023), article 28.
@article{Minervini:2023:VAE,
  author          = {Francesco Minervini and Oscar Palomar and Osman Unsal
                     and Enrico Reggiani and Josue Quiroga and Joan Marimon
                     and Carlos Rojas and Roger Figueras and Abraham Ruiz
                     and Alberto Gonzalez and Jonnatan Mendoza and Ivan
                     Vargas and C{\'e}sar Hernandez and Joan Cabre and Lina
                     Khoirunisya and Mustapha Bouhali and Julian Pavon and
                     Francesc Moll and Mauro Olivieri and Mario Kovac and
                     Mate Kovac and Leon Dragic and Mateo Valero and Adrian
                     Cristal},
  title           = {{Vitruvius+}: an Area-Efficient {RISC-V} Decoupled
                     Vector Coprocessor for High Performance Computing
                     Applications},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {28:1--28:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3575861},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3575861},
  abstract        = {The maturity level of RISC-V and the availability of
                     domain-specific instruction set extensions, like vector
                     processing, make RISC-V a good candidate for supporting
                     the integration of specialized hardware in processor
                     cores for the High Performance \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {28},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 29.
@article{Benmeziane:2023:MOH,
  author          = {Hadjer Benmeziane and Hamza Ouarnoughi and Kaoutar {El
                     Maghraoui} and Smail Niar},
  title           = {Multi-objective Hardware-aware Neural Architecture
                     Search with {Pareto} Rank-preserving Surrogate Models},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {29:1--29:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3579853},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3579853},
  abstract        = {Deep learning (DL) models such as convolutional neural
                     networks (ConvNets) are being deployed to solve various
                     computer vision and natural language processing tasks
                     at the edge. It is a challenge to find the right DL
                     architecture that simultaneously meets \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {29},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 30.
@article{Chen:2023:FFA,
  author          = {Dongwei Chen and Dong Tong and Chun Yang and Jiangfang
                     Yi and Xu Cheng},
  title           = {{FlexPointer}: Fast Address Translation Based on Range
                     {TLB} and Tagged Pointers},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {30:1--30:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3579854},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3579854},
  abstract        = {Page-based virtual memory relies on TLBs to accelerate
                     the address translation. Nowadays, the gap between
                     application workloads and the capacity of TLB continues
                     to grow, bringing many costly TLB misses and making the
                     TLB a performance bottleneck. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {30},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 2 (June 2023), article 31.
@article{Du:2023:FOS,
  author          = {Jingwen Du and Fang Wang and Dan Feng and Changchen
                     Gan and Yuchao Cao and Xiaomin Zou and Fan Li},
  title           = {Fast One-Sided {RDMA}-Based State Machine Replication
                     for Disaggregated Memory},
  journal         = j-TACO,
  volume          = {20},
  number          = {2},
  pages           = {31:1--31:??},
  month           = jun,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587096},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Jun 10 08:08:06 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587096},
  abstract        = {Disaggregated memory architecture has risen in
                     popularity for large datacenters with the advantage of
                     improved resource utilization, failure isolation, and
                     elasticity. Replicated state machines (RSMs) have been
                     extensively used for reliability and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {31},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 32.
@article{Sahni:2023:AAS,
  author          = {Abdul Rasheed Sahni and Hamza Omar and Usman Ali and
                     Omer Khan},
  title           = {{ASM}: an Adaptive Secure Multicore for Co-located
                     Mutually Distrusting Processes},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {32:1--32:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587480},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587480},
  abstract        = {With the ever-increasing virtualization of software
                     and hardware, the privacy of user-sensitive data is a
                     fundamental concern in computation outsourcing. Secure
                     processors enable a trusted execution environment to
                     guarantee security properties based on \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {32},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 33.
@article{Puthoor:2023:TBS,
  author          = {Sooraj Puthoor and Mikko H. Lipasti},
  title           = {Turn-based Spatiotemporal Coherence for {GPUs}},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {33:1--33:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3593054},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3593054},
  abstract        = {This article introduces turn-based spatiotemporal
                     coherence. Spatiotemporal coherence is a novel
                     coherence implementation that assigns write permission
                     to epochs (or turns) as opposed to a processor core.
                     This paradigm shift in the assignment of write
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {33},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 34.
@article{Chen:2023:JOJ,
  author          = {Ruobing Chen and Haosen Shi and Jinping Wu and Yusen
                     Li and Xiaoguang Liu and Gang Wang},
  title           = {Jointly Optimizing Job Assignment and Resource
                     Partitioning for Improving System Throughput in Cloud
                     Datacenters},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {34:1--34:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3593055},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3593055},
  abstract        = {Colocating multiple jobs on the same server has been
                     widely applied for improving resource utilization in
                     cloud datacenters. However, the colocated jobs would
                     contend for the shared resources, which could lead to
                     significant performance degradation. An \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {34},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 35.
@article{Ravi:2023:TMA,
  author          = {Gokul Subramanian Ravi and Tushar Krishna and Mikko
                     Lipasti},
  title           = {{TNT}: a Modular Approach to Traversing Physically
                     Heterogeneous {NOCs} at Bare-wire Latency},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {35:1--35:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3597611},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3597611},
  abstract        = {The ideal latency for on-chip network traversal would
                     be the delay incurred from wire traversal alone.
                     Unfortunately, in a realistic modular network, the
                     latency for a packet to traverse the network is
                     significantly higher than this wire delay. The main
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {35},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 36.
@article{Xu:2023:ACN,
  author          = {Weizhi Xu and Yintai Sun and Shengyu Fan and Hui Yu
                     and Xin Fu},
  title           = {Accelerating Convolutional Neural Network by
                     Exploiting Sparsity on {GPUs}},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {36:1--36:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3600092},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3600092},
  abstract        = {The convolutional neural network (CNN) is an important
                     deep learning method, which is widely used in many
                     fields. However, it is very time consuming to implement
                     the CNN where convolution usually takes most of the
                     time. There are many zero values in \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 37.
@article{Zhao:2023:GED,
  author          = {Jin Zhao and Yu Zhang and Ligang He and Qikun Li and
                     Xiang Zhang and Xinyu Jiang and Hui Yu and Xiaofei Liao
                     and Hai Jin and Lin Gu and Haikun Liu and Bingsheng He
                     and Ji Zhang and Xianzheng Song and Lin Wang and Jun
                     Zhou},
  title           = {{GraphTune}: an Efficient Dependency-Aware Substrate
                     to Alleviate Irregularity in Concurrent Graph
                     Processing},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {37:1--37:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3600091},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3600091},
  abstract        = {With the increasing need for graph analysis, massive
                     Concurrent iterative Graph Processing (CGP) jobs are
                     usually performed on the common large-scale real-world
                     graph. Although several solutions have been proposed,
                     these CGP jobs are not coordinated with \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% TACO vol. 20, no. 3 (September 2023), article 38.
@Article{Zhou:2023:IPS,
  author =       "Yufeng Zhou and Alan L. Cox and Sandhya Dwarkadas and
                 Xiaowan Dong",
  title =        "The Impact of Page Size and Microarchitecture on
                 Instruction Address Translation Overhead",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "38:1--38:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3600089",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3600089",
  abstract =     "As the volume of data processed by applications has
                 increased, considerable attention has been paid to data
                 address translation overheads, leading to the
                 widespread use of larger page sizes (``superpages'')
                 and multi-level translation lookaside buffers (TLBs).
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

%%% TACO vol. 20, no. 3 (September 2023), article 39.
@article{Reber:2023:CPS,
  author          = {Benjamin Reber and Matthew Gould and Alexander H.
                     Kneipp and Fangzhou Liu and Ian Prechtl and Chen Ding
                     and Linlin Chen and Dorin Patru},
  title           = {Cache Programming for Scientific Loops Using Leases},
  journal         = j-TACO,
  volume          = {20},
  number          = {3},
  pages           = {39:1--39:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3600090},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Thu Aug 10 07:14:56 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3600090},
  abstract        = {Cache management is important in exploiting locality
                     and reducing data movement. This article studies a new
                     type of programmable cache called the lease cache. By
                     assigning leases, software exerts the primary control
                     on when and how long data stays in the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Archit. Code Optim.},
  articleno       = {39},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

@Article{Xie:2023:MMC,
  author =       "Xinfeng Xie and Peng Gu and Yufei Ding and Dimin Niu
                 and Hongzhong Zheng and Yuan Xie",
  title =        "{MPU}: Memory-centric {SIMT} Processor via In-{DRAM}
                 Near-bank Computing",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "40:1--40:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603113",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603113",
  abstract =     "With the growing number of data-intensive workloads,
                 GPU, which is the state-of-the-art
                 single-instruction-multiple-thread (SIMT) processor, is
                 hindered by the memory bandwidth wall. To alleviate
                 this bottleneck, previously proposed 3D-stacking
                 near-bank \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Krolik:2023:RFQ,
  author =       "Alexander Krolik and Clark Verbrugge and Laurie
                 Hendren",
  title =        "{rNdN}: Fast Query Compilation for {NVIDIA GPUs}",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "41:1--41:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603503",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603503",
  abstract =     "GPU database systems are an effective solution to
                 query optimization, particularly with compilation and
                 data caching. They fall short, however, in end-to-end
                 workloads, as existing compiler toolchains are too
                 expensive for use with short-running queries.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jiang:2023:HMP,
  author =       "Jiazhi Jiang and Zijian Huang and Dan Huang and
                 Jiangsu Du and Lin Chen and Ziguan Chen and Yutong Lu",
  title =        "Hierarchical Model Parallelism for Optimizing
                 Inference on Many-core Processor via Decoupled
                 {$3$D-CNN} Structure",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "42:1--42:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605149",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605149",
  abstract =     "The tremendous success of convolutional neural network
                 (CNN) has made it ubiquitous in many fields of human
                 endeavor. Many applications such as biomedical analysis
                 and scientific data analysis involve analyzing
                 volumetric data. This spawns huge demand for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Zhao:2023:MGA,
  author =       "Yuwen Zhao and Fangfang Liu and Wenjing Ma and Huiyuan
                 Li and Yuanchi Peng and Cui Wang",
  title =        "{MFFT}: a {GPU} Accelerated Highly Efficient
                 Mixed-Precision Large-Scale {FFT} Framework",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "43:1--43:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605148",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605148",
  abstract =     "Fast Fourier transform (FFT) is widely used in
                 computing applications in large-scale parallel
                 programs, and data communication is the main
                 performance bottleneck of FFT and seriously affects its
                 parallel efficiency. To tackle this problem, we propose
                 a new large-scale FFT framework, MFFT, which optimizes
                 parallel FFT with a new mixed-precision optimization
                 technique, adopting the ``high precision computation,
                 low precision communication'' strategy. To enable ``low
                 precision communication'', we propose a shared-exponent
                 floating-point number compression technique, which
                 reduces the volume of data communication, while
                 maintaining higher accuracy. In addition, we apply a
                 two-phase normalization technique to further reduce the
                 round-off error. Based on the mixed-precision MFFT
                 framework, we apply several optimization techniques to
                 improve the performance, such as streaming of GPU
                 kernels, MPI message combination, kernel optimization,
                 and memory optimization. We evaluate MFFT on a system
                 with 4,096 GPUs. The results show that shared-exponent
                 MFFT is $ 1.23 \times $ faster than that of
                 double-precision MFFT on average, and double-precision
                 MFFT achieves performance $ 3.53 \times $ and $ 9.48
                 \times $ on average higher than open source library
                 2Decomp\&FFT (CPU-based version) and heFFTe (AMD
                 GPU-based version), respectively. The parallel
                 efficiency of double-precision MFFT increased from
                 53.2\% to 78.1\% compared with 2Decomp\&FFT, and
                 shared-exponent MFFT further increases the parallel
                 efficiency to 83.8\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Azhar:2023:ARR,
  author =       "Muhammad Waqar Azhar and Madhavan Manivannan and Per
                 Stenstr{\"o}m",
  title =        "{Approx-RM}: Reducing Energy on Heterogeneous
                 Multicore Processors under Accuracy and Timing
                 Constraints",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "44:1--44:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605214",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605214",
  abstract =     "Reducing energy consumption while providing
                 performance and quality guarantees is crucial for
                 computing systems ranging from battery-powered embedded
                 systems to data centers. This article considers
                 approximate iterative applications executing on
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Huang:2023:STE,
  author =       "Dong Huang and Dan Feng and Qiankun Liu and Bo Ding
                 and Wei Zhao and Xueliang Wei and Wei Tong",
  title =        "{SplitZNS}: Towards an Efficient {LSM}-Tree on Zoned
                 Namespace {SSDs}",
  journal =      j-TACO,
  volume =       "20",
  number =       "3",
  pages =        "45:1--45:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3608476",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 10 07:14:56 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3608476",
  abstract =     "The Zoned Namespace (ZNS) Solid State Drive (SSD) is a
                 nascent form of storage device that offers novel
                 prospects for the Log Structured Merge Tree (LSM-tree).
                 ZNS exposes erase blocks in SSD as append-only zones,
                 enabling the LSM-tree to gain awareness \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Du:2023:ICM,
  author =       "Jiangsu Du and Jiazhi Jiang and Jiang Zheng and
                 Hongbin Zhang and Dan Huang and Yutong Lu",
  title =        "Improving Computation and Memory Efficiency for
                 Real-world {Transformer} Inference on {GPUs}",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "46:1--46:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617689",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617689",
  abstract =     "Transformer models have emerged as a leading approach
                 in the field of natural language processing (NLP) and
                 are increasingly being deployed in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jin:2023:CTC,
  author =       "Hai Jin and Bo Lei and Haikun Liu and Xiaofei Liao and
                 Zhuohui Duan and Chencheng Ye and Yu Zhang",
  title =        "A Compilation Tool for Computation Offloading in
                 {ReRAM}-based {CIM} Architectures",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "47:1--47:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617686",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617686",
  abstract =     "Computing-in-Memory (CIM) architectures using
                 Non-volatile Memories (NVMs) have emerged as a
                 promising way to address the ``memory wall'' problem in
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Menard:2023:HPD,
  author =       "Christian Menard and Marten Lohstroh and Soroush
                 Bateni and Matthew Chorlian and Arthur Deng and Peter
                 Donovan and Cl{\'e}ment Fournier and Shaokai Lin and
                 Felix Suchert and Tassilo Tanneberger and Hokeun Kim
                 and Jeronimo Castrillon and Edward A. Lee",
  title =        "High-performance Deterministic Concurrency Using
                 {Lingua Franca}",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "48:1--48:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617687",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617687",
  abstract =     "Actor frameworks and similar reactive programming
                 techniques are widely used for building concurrent
                 systems. They promise to be efficient and scale well to
                 a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wu:2023:SDM,
  author =       "Donglei Wu and Weihao Yang and Xiangyu Zou and Wen Xia
                 and Shiyi Li and Zhenbo Hu and Weizhe Zhang and Binxing
                 Fang",
  title =        "{Smart-DNN+}: a Memory-efficient Neural Networks
                 Compression Framework for the Model Inference",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "49:1--49:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617688",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617688",
  abstract =     "Deep Neural Networks (DNNs) have achieved remarkable
                 success in various real-world applications. However,
                 running a Deep Neural Network (DNN) typically
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tariq:2023:MCL,
  author =       "Syed Salauddin Mohammad Tariq and Lance Menard and
                 Pengfei Su and Probir Roy",
  title =        "{MicroProf}: Code-level Attribution of Unnecessary
                 Data Transfer in Microservice Applications",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "50:1--50:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3622787",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3622787",
  abstract =     "The microservice architecture style has gained
                 popularity due to its ability to fault isolation, ease
                 of scaling applications, and developer's agility.
                 However, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Li:2023:GGM,
  author =       "Shiyi Li and Qiang Cao and Shenggang Wan and Wen Xia
                 and Changsheng Xie",
  title =        "{gPPM}: a Generalized Matrix Operation and Parallel
                 Algorithm to Accelerate the Encoding\slash Decoding
                 Process of Erasure Codes",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625005",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625005",
  abstract =     "Erasure codes are widely deployed in modern storage
                 systems, leading to frequent usage of their
                 encoding/decoding operations. The encoding/decoding
                 process for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Anastasiadis:2023:PPA,
  author =       "Petros Anastasiadis and Nikela Papadopoulou and
                 Georgios Goumas and Nectarios Koziris and Dennis Hoppe
                 and Li Zhong",
  title =        "{PARALiA}: a Performance Aware Runtime for Auto-tuning
                 Linear Algebra on Heterogeneous Systems",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3624569",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3624569",
  abstract =     "Dense linear algebra operations appear very frequently
                 in high-performance computing (HPC) applications,
                 rendering their performance crucial to achieve
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Yu:2023:RER,
  author =       "Hui Yu and Yu Zhang and Jin Zhao and Yujian Liao and
                 Zhiying Huang and Donghao He and Lin Gu and Hai Jin and
                 Xiaofei Liao and Haikun Liu and Bingsheng He and
                 Jianhui Yue",
  title =        "{RACE}: an Efficient Redundancy-aware Accelerator for
                 Dynamic Graph Neural Network",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617685",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617685",
  abstract =     "Dynamic Graph Neural Network (DGNN) has recently
                 attracted a significant amount of research attention
                 from various domains, because most real-world graphs
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ferrari:2023:ADC,
  author =       "Victor Ferrari and Rafael Sousa and Marcio Pereira and
                 Jo{\~a}o P. L. {De Carvalho} and Jos{\'e} Nelson Amaral
                 and Jos{\'e} Moreira and Guido Araujo",
  title =        "Advancing Direct Convolution Using Convolution Slicing
                 Optimization and {ISA} Extensions",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625004",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625004",
  abstract =     "Convolution is one of the most computationally
                 intensive operations that must be performed for machine
                 learning model inference. A traditional \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{He:2023:DLS,
  author =       "Bowen He and Xiao Zheng and Yuan Chen and Weinan Li
                 and Yajin Zhou and Xin Long and Pengcheng Zhang and
                 Xiaowei Lu and Linquan Jiang and Qiang Liu and Dennis
                 Cai and Xiantao Zhang",
  title =        "{DxPU}: Large-scale Disaggregated {GPU} Pools in the
                 Datacenter",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "55:1--55:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617995",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617995",
  abstract =     "The rapid adoption of AI and convenience offered by
                 cloud services have resulted in the growing demands for
                 GPUs in the cloud. Generally, GPUs are physically
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Zhang:2023:CMC,
  author =       "Shiqing Zhang and Mahmood Naderan-Tahan and Magnus
                 Jahre and Lieven Eeckhout",
  title =        "Characterizing Multi-Chip {GPU} Data Sharing",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "56:1--56:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3629521",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3629521",
  abstract =     "Multi-chip Graphics Processing Unit (GPU) systems are
                 critical to scale performance beyond a single GPU chip
                 for a wide variety of important emerging \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Domke:2023:LPQ,
  author =       "Jens Domke and Emil Vatai and Balazs Gerofi and Yuetsu
                 Kodama and Mohamed Wahib and Artur Podobas and Sparsh
                 Mittal and Miquel Peric{\`a}s and Lingqi Zhang and Peng
                 Chen and Aleksandr Drozd and Satoshi Matsuoka",
  title =        "At the Locus of Performance: Quantifying the Effects
                 of Copious {$3$D}-Stacked Cache on {HPC} Workloads",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "57:1--57:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3629520",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3629520",
  abstract =     "Over the last three decades, innovations in the memory
                 subsystem were primarily targeted at overcoming the
                 data movement bottleneck. In this paper, we focus
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Badri:2023:MPE,
  author =       "Satya Jaswanth Badri and Mukesh Saini and Neeraj
                 Goel",
  title =        "{Mapi-Pro}: an Energy Efficient Memory Mapping
                 Technique for Intermittent Computing",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "58:1--58:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3629524",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3629524",
  abstract =     "Battery-less technology evolved to replace battery
                 usage in space, deep mines, and other environments to
                 reduce cost and pollution. Non-volatile memory
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Yu:2023:MFE,
  author =       "Miao Yu and Tingting Xiang and Venkata Pavan Kumar
                 Miriyala and Trevor E. Carlson",
  title =        "{Multiply-and-Fire}: an Event-Driven Sparse Neural
                 Network Accelerator",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "59:1--59:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630255",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630255",
  abstract =     "Deep neural network inference has become a vital
                 workload for many systems from edge-based computing to
                 data centers. To reduce the performance and power
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Choudhury:2023:FAI,
  author =       "Ziaul Choudhury and Anish Gulati and Suresh Purini",
  title =        "{FlowPix}: Accelerating Image Processing Pipelines on
                 an {FPGA} Overlay using a Domain Specific Compiler",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "60:1--60:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3629523",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3629523",
  abstract =     "The exponential performance growth guaranteed by
                 Moore's law has started to taper in recent years. At
                 the same time, emerging applications like image
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Susskind:2023:UNA,
  author =       "Zachary Susskind and Aman Arora and Igor D. S. Miranda
                 and Alan T. L. Bacellar and Luis A. Q. Villon and
                 Rafael F. Katopodis and Leandro S. de Ara{\'u}jo and
                 Diego L. C. Dutra and Priscila M. V. Lima and Felipe M.
                 G. Fran{\c{c}}a and Mauricio {Breternitz Jr.} and Lizy
                 K. John",
  title =        "{ULEEN}: a Novel Architecture for Ultra-low-energy
                 Edge Neural Networks",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "61:1--61:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3629522",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3629522",
  abstract =     "``Extreme edge'' devices, such as smart sensors, are a
                 uniquely challenging environment for the deployment of
                 machine learning. The tiny energy budgets \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wei:2023:FOT,
  author =       "Jia Wei and Xingjun Zhang and Longxiang Wang and Zheng
                 Wei",
  title =        "{Fastensor}: Optimise the {Tensor} {I/O} Path from
                 {SSD} to {GPU} for Deep Learning Training",
  journal =      j-TACO,
  volume =       "20",
  number =       "4",
  pages =        "62:1--62:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630108",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Dec 21 10:29:36 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630108",
  abstract =     "In recent years, benefiting from the increase in model
                 size and complexity, deep learning has achieved
                 tremendous success in computer vision (CV) and natural
                 language processing (NLP). \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Luo:2024:CDB,
  author =       "Longfei Luo and Dingcui Yu and Yina Lv and Liang Shi",
  title =        "Critical Data Backup with Hybrid Flash-Based Consumer
                 Devices",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3631529",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631529",
  abstract =     "Hybrid flash-based storage constructed with
                 high-density and low-cost flash memory has become
                 increasingly popular in consumer devices in the last
                 decade due to its low cost. However, its poor
                 reliability is one of the major concerns. To protect
                 critical \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Chen:2024:DOO,
  author =       "Peng Chen and Hui Chen and Weichen Liu and Linbo Long
                 and Wanli Chang and Nan Guan",
  title =        "{DAG-Order}: an Order-Based Dynamic {DAG} Scheduling
                 for Real-Time Networks-on-Chip",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3631527",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631527",
  abstract =     "With the high-performance requirement of
                 safety-critical real-time tasks, the platforms of
                 many-core processors with high parallelism are widely
                 utilized, where network-on-chip (NoC) is generally
                 employed for inter-core communication due to its
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Jiang:2024:JRG,
  author =       "Zhang Jiang and Ying Chen and Xiaoli Gong and Jin
                 Zhang and Wenwen Wang and Pen-Chung Yew",
  title =        "{JiuJITsu}: Removing Gadgets with Safe Register
                 Allocation for {JIT} Code Generation",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3631526",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631526",
  abstract =     "Code-reuse attacks have the capability to craft
                 malicious instructions from small code fragments,
                 commonly referred to as ``gadgets.'' These gadgets are
                 generated by JIT (Just-In-Time) engines as integral
                 components of native instructions, with the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Tayeb:2024:AAV,
  author =       "Hayfa Tayeb and Ludovic Paillat and B{\'e}renger
                 Bramas",
  title =        "{Autovesk}: Automatic Vectorized Code Generation from
                 Unstructured Static Kernels Using Graph
                 Transformations",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3631709",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631709",
  abstract =     "Leveraging the SIMD capability of modern CPU
                 architectures is mandatory to take full advantage of
                 their increased performance. To exploit this
                 capability, binary executables must be vectorized,
                 either manually by developers or automatically by a
                 tool. For \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2024:FCM,
  author =       "Xueying Wang and Guangli Li and Zhen Jia and Xiaobing
                 Feng and Yida Wang",
  title =        "Fast Convolution Meets Low Precision: Exploring
                 Efficient Quantized {Winograd} Convolution on Modern
                 {CPUs}",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632956",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632956",
  abstract =     "Low-precision computation has emerged as one of the
                 most effective techniques for accelerating
                 convolutional neural networks and has garnered
                 widespread support on modern hardware. Despite its
                 effectiveness in accelerating convolutional neural
                 networks, low-precision computation has not been
                 commonly applied to fast convolutions, such as the
                 Winograd algorithm, due to numerical issues. In this
                 article, we propose an effective quantized Winograd
                 convolution, named LoWino, which employs an in-side
                 quantization method in the Winograd domain to reduce
                 the precision loss caused by transformations.
                 Meanwhile, we present an efficient implementation that
                 integrates well-designed optimization techniques,
                 allowing us to fully exploit the capabilities of
                 low-precision computation on modern CPUs. We evaluate
                 LoWino on two Intel Xeon Scalable Processor platforms
                 with representative convolutional layers and neural
                 network models. The experimental results demonstrate
                 that our approach can achieve an average of $ 1.84
                 \times $ and $ 1.91 \times $ operator speedups over
                 state-of-the-art implementations in the vendor library
                 while preserving accuracy loss at a reasonable level.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Fan:2024:QPQ,
  author =       "Hao Fan and Yiliang Ye and Shadi Ibrahim and Zhuo
                 Huang and Xingru Li and Weibin Xue and Song Wu and Chen
                 Yu and Xuanhua Shi and Hai Jin",
  title =        "{QoS-pro}: a {QoS}-enhanced Transaction Processing
                 Framework for Shared {SSDs}",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632955",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632955",
  abstract =     "Solid State Drives (SSDs) are widely used in
                 data-intensive scenarios due to their high performance
                 and decreasing cost. However, in shared environments,
                 concurrent workloads can interfere with each other,
                 leading to a violation of Quality of Service (QoS).
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Zhao:2024:SUE,
  author =       "Yunping Zhao and Sheng Ma and Heng Liu and Libo Huang
                 and Yi Dai",
  title =        "{SAC}: an Ultra-Efficient Spin-based Architecture for
                 Compressed {DNNs}",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632957",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632957",
  abstract =     "Deep Neural Networks (DNNs) have achieved great
                 progress in academia and industry. But they have become
                 computational and memory intensive with the increase of
                 network depth. Previous designs seek breakthroughs in
                 software and hardware levels to mitigate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Liu:2024:ECP,
  author =       "Tong-Yu Liu and Jianmei Guo and Bo Huang",
  title =        "Efficient Cross-platform Multiplexing of Hardware
                 Performance Counters via Adaptive Grouping",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3629525",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3629525",
  abstract =     "Collecting sufficient microarchitecture performance
                 data is essential for performance evaluation and
                 workload characterization. There are many events to be
                 monitored in a modern processor while only a few
                 hardware performance monitoring counters (PMCs)
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Liu:2024:QHQ,
  author =       "Lei Liu and Xinglei Dou",
  title =        "{QuCloud+}: a Holistic Qubit Mapping Scheme for
                 Single\slash Multi-programming on {$2$D\slash $3$D
                 NISQ} Quantum Computers",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "9:1--9:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3631525",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631525",
  abstract =     "Qubit mapping for NISQ superconducting quantum
                 computers is essential to fidelity and resource
                 utilization. The existing qubit mapping schemes meet
                 challenges, e.g., crosstalk, SWAP overheads, diverse
                 device topologies, etc., leading to qubit resource
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wu:2024:AAM,
  author =       "Lingxi Wu and Minxuan Zhou and Weihong Xu and Ashish
                 Venkat and Tajana Rosing and Kevin Skadron",
  title =        "{Abakus}: Accelerating $k$-mer Counting with Storage
                 Technology",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "10:1--10:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632952",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632952",
  abstract =     "This work seeks to leverage
                 Processing-with-storage-technology (PWST) to accelerate
                 a key bioinformatics kernel called $k$-mer counting,
                 which involves processing large files of sequence data
                 on the disk to build a histogram of fixed-size genome
                 sequence \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Kang:2024:IAG,
  author =       "Seokwon Kang and Jongbin Kim and Gyeongyong Lee and
                 Jeongmyung Lee and Jiwon Seo and Hyungsoo Jung and Yong
                 Ho Song and Yongjun Park",
  title =        "{ISP Agent}: a Generalized In-storage-processing
                 Workload Offloading Framework by Providing Multiple
                 Optimization Opportunities",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "11:1--11:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632951",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632951",
  abstract =     "As solid-state drives (SSDs) with sufficient computing
                 power have recently become the dominant devices in
                 modern computer systems, in-storage processing (ISP),
                 which processes data within the storage without
                 transferring it to the host memory, is being \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Mishra:2024:CHP,
  author =       "Prasoon Mishra and V. Krishna Nandivada",
  title =        "{COWS} for High Performance: Cost Aware Work Stealing
                 for Irregular Parallel Loop",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "12:1--12:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3633331",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3633331",
  abstract =     "Parallel libraries such as OpenMP distribute the
                 iterations of parallel-for-loops among the threads,
                 using a programmer-specified scheduling policy. While
                 the existing scheduling policies perform reasonably
                 well in the context of balanced workloads, in
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Park:2024:HHS,
  author =       "Joongun Park and Seunghyo Kang and Sanghyeon Lee and
                 Taehoon Kim and Jongse Park and Youngjin Kwon and
                 Jaehyuk Huh",
  title =        "Hardware-hardened Sandbox Enclaves for Trusted
                 Serverless Computing",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "13:1--13:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632954",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632954",
  abstract =     "In cloud-based serverless computing, an application
                 consists of multiple functions provided by mutually
                 distrusting parties. For secure serverless computing,
                 the hardware-based trusted execution environment (TEE)
                 can provide strong isolation among \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Allen:2024:FGQ,
  author =       "Tyler Allen and Bennett Cooper and Rong Ge",
  title =        "Fine-grain Quantitative Analysis of Demand Paging in
                 Unified Virtual Memory",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632953",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632953",
  abstract =     "The abstraction of a shared memory space over separate
                 CPU and GPU memory domains has eased the burden of
                 portability for many HPC codebases. However, users pay
                 for ease of use provided by system-managed memory with
                 a moderate-to-high performance \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wang:2024:RRR,
  author =       "Zhonghua Wang and Yixing Guo and Kai Lu and Jiguang
                 Wan and Daohui Wang and Ting Yao and Huatao Wu",
  title =        "{Rcmp}: Reconstructing {RDMA-Based} Memory
                 Disaggregation via {CXL}",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3634916",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3634916",
  abstract =     "Memory disaggregation is a promising architecture for
                 modern datacenters that separates compute and memory
                 resources into independent pools connected by
                 ultra-fast networks, which can improve memory
                 utilization, reduce cost, and enable elastic scaling of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Long:2024:WZW,
  author =       "Linbo Long and Shuiyong He and Jingcheng Shen and
                 Renping Liu and Zhenhua Tan and Congming Gao and Duo
                 Liu and Kan Zhong and Yi Jiang",
  title =        "{WA-Zone}: Wear-Aware Zone Management Optimization for
                 {LSM}-Tree on {ZNS SSDs}",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3637488",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3637488",
  abstract =     "ZNS SSDs divide the storage space into
                 sequential-write zones, reducing costs of DRAM
                 utilization, garbage collection, and over-provisioning.
                 The sequential-write feature of zones is well-suited
                 for LSM-based databases, where random writes are
                 organized \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Fan:2024:IUD,
  author =       "Zhihua Fan and Wenming Li and Zhen Wang and Yu Yang
                 and Xiaochun Ye and Dongrui Fan and Ninghui Sun and
                 Xuejun An",
  title =        "Improving Utilization of Dataflow Unit for Multi-Batch
                 Processing",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3637906",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3637906",
  abstract =     "Dataflow architectures can achieve much better
                 performance and higher efficiency than general-purpose
                 core, approaching the performance of a specialized
                 design while retaining programmability. However,
                 advanced application scenarios place higher demands
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Zhang:2024:EVI,
  author =       "Dunbo Zhang and Qingjie Lang and Ruoxi Wang and Li
                 Shen",
  title =        "Extension {VM}: Interleaved Data Layout in Vector
                 Memory",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3631528",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631528",
  abstract =     "While vector architecture is widely employed in
                 processors for neural networks, signal processing, and
                 high-performance computing; however, its performance is
                 limited by inefficient column-major memory access. The
                 column-major access limitation originates \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Firtina:2024:AAP,
  author =       "Can Firtina and Kamlesh Pillai and Gurpreet S. Kalsi
                 and Bharathwaj Suresh and Senol Cali, Damla and Jeremie
                 S. Kim and Taha Shahroodi and Meryem Banu Cavlak and
                 Jo{\"e}l Lindegger and Mohammed Alser and G{\'o}mez
                 Luna, Juan and Sreenivas Subramoney and Onur Mutlu",
  title =        "{ApHMM}: Accelerating Profile Hidden {Markov} Models
                 for Fast and Energy-efficient Genome Analysis",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632950",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632950",
  abstract =     "Profile hidden Markov models (pHMMs) are widely
                 employed in various bioinformatics applications to
                 identify similarities between biological sequences,
                 such as DNA or protein sequences. In pHMMs, sequences
                 are represented as graph structures, where states
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Ahmad:2024:EDL,
  author =       "Khalid Ahmad and Cris Cecka and Michael Garland and
                 Mary Hall",
  title =        "Exploring Data Layout for Sparse Tensor Times Dense
                 Matrix on {GPUs}",
  journal =      j-TACO,
  volume =       "21",
  number =       "1",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3633462",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Feb 23 16:28:09 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3633462",
  abstract =     "An important sparse tensor computation is
                 sparse-tensor-dense-matrix multiplication (SpTM), which
                 is used in tensor decomposition and applications. SpTM
                 is a multi-dimensional analog to
                 sparse-matrix-dense-matrix multiplication (SpMM). In
                 this article, we \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Archit. Code Optim.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}