Sophie

Sophie

distrib > Mageia > 5 > x86_64 > media > nonfree-updates > by-pkgid > fd8445e7e4d58b8cfe6e0150bd441ee1 > files > 1321

nvidia-cuda-toolkit-devel-6.5.14-6.1.mga5.nonfree.x86_64.rpm

<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="Floating Point and IEEE 754 Compliance for NVIDIA GPUs"></meta>
      <meta name="abstract" content="White paper covering the most common issues related to NVIDIA GPUs."></meta>
      <meta name="description" content="White paper covering the most common issues related to NVIDIA GPUs."></meta>
      <meta name="DC.Coverage" content="White Papers"></meta>
      <meta name="DC.subject" content="CUDA Floating Point, CUDA Floating Point formats, CUDA Floating Point FMA, CUDA Floating Point accuracy, CUDA Floating Point rounding mode, CUDA Floating Point x86 differences, CUDA Floating Point compiler flags, CUDA Floating Point core counts, CUDA Floating Point x87, CUDA Floating Point recommendations"></meta>
      <meta name="keywords" content="CUDA Floating Point, CUDA Floating Point formats, CUDA Floating Point FMA, CUDA Floating Point accuracy, CUDA Floating Point rounding mode, CUDA Floating Point x86 differences, CUDA Floating Point compiler flags, CUDA Floating Point core counts, CUDA Floating Point x87, CUDA Floating Point recommendations"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="abstract"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>Floating Point and IEEE 754 :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="../common/scripts/tynt/tynt.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/floating-point/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit
                  v6.5</a></div>
            <div class="category"><a href="index.html" title="Floating Point and IEEE 754">Floating Point and IEEE 754</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="#introduction">1.&nbsp;Introduction</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#floating-point">2.&nbsp;Floating Point</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#formats">2.1.&nbsp;Formats</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#operations-and-accuracy">2.2.&nbsp;Operations and Accuracy</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#fused-multiply-add-fma">2.3.&nbsp;The Fused Multiply-Add (FMA)</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#dot-product-accuracy-example">3.&nbsp;Dot Product: An Accuracy Example</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#example-algorithms">3.1.&nbsp;Example Algorithms</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#comparison">3.2.&nbsp;Comparison</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#cuda-and-floating-point">4.&nbsp;CUDA and Floating Point</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#compute-capability-1-2-and-below">4.1.&nbsp;Compute Capability 1.2 and Below</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#compute-capability-1-3">4.2.&nbsp;Compute Capability 1.3</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#compute-capability-2-0-and-above">4.3.&nbsp;Compute Capability 2.0 and Above</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#rounding-modes">4.4.&nbsp;Rounding Modes</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#controlling-fused-multiply-add">4.5.&nbsp;Controlling Fused Multiply-add</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#compiler-flags">4.6.&nbsp;Compiler Flags</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#differences-from-x86">4.7.&nbsp;Differences from x86</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#considerations-for-heterogeneous-world">5.&nbsp;Considerations for a Heterogeneous World</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#mathematical-function-accuracy">5.1.&nbsp;Mathematical Function Accuracy</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#x87-sse">5.2.&nbsp;x87 and SSE</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#core-counts">5.3.&nbsp;Core Counts</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#verifying-gpu-results">5.4.&nbsp;Verifying GPU Results</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#concrete-recommendations">6.&nbsp;Concrete Recommendations</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#acknowledgements">A.&nbsp;Acknowledgements</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#references">B.&nbsp;References</a></div>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="eqn-warning">This document includes math equations
                  (highlighted in red) which are best viewed with <a target="_blank" href="https://www.mozilla.org/firefox">Firefox</a> version 4.0
                  or higher, or another <a target="_blank" href="http://www.w3.org/Math/Software/mathml_software_cat_browsers.html">MathML-aware
                     browser</a>. There is also a <a href="../../pdf/Floating_Point_on_NVIDIA_GPU.pdf">PDF version of this document</a>.
                  
               </div>
               <div id="release-info">Floating Point and IEEE 754
                  (<a href="../../pdf/Floating_Point_on_NVIDIA_GPU.pdf">PDF</a>)
                  -
                  
                  v6.5
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated August 1, 2014
                  -
                  <a href="mailto:cudatools@nvidia.com?subject=CUDA Toolkit Documentation Feedback: Floating Point and IEEE 754">Send Feedback</a>
                  -
                  <span class="st_facebook"></span><span class="st_twitter"></span><span class="st_linkedin"></span><span class="st_reddit"></span><span class="st_slashdot"></span><span class="st_tumblr"></span><span class="st_sharethis"></span></div>
            </div>
            <article id="contents">
               <div class="topic nested0" id="abstract"><a name="abstract" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#abstract" name="abstract" shape="rect"><span class="ph">Floating Point and IEEE 754 Compliance for NVIDIA GPUs</span></a></h2>
                  <div class="body conbody">
                     <p class="p">A number of issues related to floating point accuracy and compliance are
                        a frequent source of confusion on both CPUs and GPUs. The purpose of this
                        white paper is to discuss the most common issues related to NVIDIA GPUs
                        and to supplement the documentation in the CUDA C Programming Guide.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" id="introduction"><a name="introduction" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#introduction" name="introduction" shape="rect">1.&nbsp;Introduction</a></h2>
                  <div class="body conbody">
                     <p class="p">Since the widespread adoption in 1985 of the IEEE Standard for
                        <dfn class="term">Binary Floating-Point Arithmetic</dfn> (IEEE 754-1985 <a class="xref" href="index.html#references__1" shape="rect">[1]</a>), virtually all
                        mainstream computing systems have implemented the standard, including
                        NVIDIA with the CUDA architecture. IEEE 754 standardizes how arithmetic
                        results should be <em class="ph i">approximated</em> in floating point.  Whenever
                        working with inexact results, programming decisions can affect accuracy.
                        It is important to consider many aspects of floating point behavior in
                        order to achieve the highest performance with the precision required for
                        any specific application. This is especially true in a heterogeneous
                        computing environment where operations will be performed on different
                        types of hardware.
                     </p>
                     <p class="p">Understanding some of the intricacies of floating point and the
                        specifics of how NVIDIA hardware handles floating point is obviously
                        important to CUDA programmers striving to implement correct numerical
                        algorithms. In addition, users of libraries such as <dfn class="term">cuBLAS</dfn>
                        and <dfn class="term">cuFFT</dfn> will also find it informative to learn how NVIDIA
                        handles floating point under the hood.
                     </p>
                     <p class="p">We review some of the basic properties of floating point calculations in
                        <a class="xref" href="index.html#floating-point" shape="rect">Chapter&nbsp;2</a>. We also discuss the
                        fused multiply-add operator, which was added to the IEEE 754 standard in
                        2008 <a class="xref" href="index.html#references__2" shape="rect">[2]</a> and is built into the
                        hardware of NVIDIA GPUs. In <a class="xref" href="index.html#dot-product-accuracy-example" shape="rect">Chapter&nbsp;3</a> we work
                        through an example of computing the dot product of two short vectors to
                        illustrate how different choices of implementation affect the accuracy of
                        the final result. In <a class="xref" href="index.html#cuda-and-floating-point" shape="rect">Chapter&nbsp;4</a> we describe
                        NVIDIA hardware versions and NVCC compiler options that affect floating
                        point calculations. In <a class="xref" href="index.html#considerations-for-heterogeneous-world" shape="rect">Chapter&nbsp;5</a>
                        we consider some issues regarding the comparison of CPU and GPU results.
                        Finally, in <a class="xref" href="index.html#concrete-recommendations" shape="rect">Chapter&nbsp;6</a>
                        we conclude with concrete recommendations to programmers that deal with
                        numeric issues relating to floating point on the GPU.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" id="floating-point"><a name="floating-point" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#floating-point" name="floating-point" shape="rect">2.&nbsp;Floating Point</a></h2>
                  <div class="topic concept nested1" id="formats"><a name="formats" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#formats" name="formats" shape="rect">2.1.&nbsp;Formats</a></h3>
                     <div class="body conbody">
                        <div class="section">
                           <p class="p">Floating point encodings and functionality are defined in the IEEE 754
                              Standard <a class="xref" href="index.html#references__2" shape="rect">[2]</a> last revised in 2008.
                              Goldberg <a class="xref" href="index.html#references__5" shape="rect">[5]</a> gives a good
                              introduction to floating point and many of the issues that arise.
                              
                           </p>
                           <p class="p">The standard mandates binary floating point data be encoded in three
                              fields: a one bit sign field, followed by exponent bits encoding the
                              exponent offset by a numeric bias specific to each format, and bits
                              encoding the significand (or fraction).
                              
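                           </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/sign-exponent-fraction.png" alt="Layout of a binary floating point number: a one-bit sign field, followed by the exponent field and the fraction field"></img></div><br clear="none"></br><p class="p">In order to ensure consistent computations across platforms and to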
                              exchange floating point data, IEEE 754 defines basic and interchange
                              formats. The 32 and 64 bit basic binary floating point formats
                              correspond to the C data types <samp class="ph codeph">float</samp> and
                              <samp class="ph codeph">double</samp>. Their corresponding representations have the
                              following bit lengths:
                              
                           </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/float-double.png" alt="Bit lengths of the sign, exponent, and fraction fields for the float and double formats"></img></div><br clear="none"></br><p class="p">For numerical data representing finite values, the sign is either
                              negative or positive, the exponent field encodes the exponent in base
                              2, and the fraction field encodes the significand without the most
                              significant non-zero bit. For example, the value -192 equals
                              (-1)<sup class="ph sup">1</sup> × 2<sup class="ph sup">7</sup> × 1.5,
                              and can be represented as having a negative sign, an
                              exponent of 7, and a fractional part .5. The exponents are biased by
                              127 and 1023, respectively, to allow exponents to extend from negative
                              to positive. Hence the exponent 7 is represented by bit strings with
                              values 134 for float and 1030 for double. The leading integer bit (the
                              "1." in 1.5) is implicit and is not stored in the fraction field.
                              
                           </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/float-1-double-1.png" alt="Bit encodings of the value -192 in the float and double formats"></img></div><br clear="none"></br><p class="p">Also, encodings to represent infinity and not-a-number (NaN) data are
                              reserved. The IEEE 754 Standard <a class="xref" href="index.html#references__2" shape="rect">[2]</a> describes floating point encodings
                              in full.
                              
                           </p>
                           <p class="p">Given that the fraction field uses a limited number of bits, not all
                              real numbers can be represented exactly. For example, the mathematical
                              value of the fraction 2/3 represented in binary is 0.10101010... which
                              has an infinite number of bits after the binary point. The value 2/3
                              must be rounded first in order to be represented as a floating point
                              number with limited precision. The rules for rounding and the rounding
                              modes are specified in IEEE 754. The most frequently used is the
                              round-to-nearest-or-even mode (abbreviated as round-to-nearest). The
                              value 2/3 rounded in this mode is represented in binary as:
                              
                           </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/float-0-double-0.png" alt="Bit encodings of 2/3 rounded to nearest in the float and double formats"></img></div><br clear="none"></br><p class="p">The sign is positive and the stored exponent value represents an
                              exponent of -1.
                              
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="operations-and-accuracy"><a name="operations-and-accuracy" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#operations-and-accuracy" name="operations-and-accuracy" shape="rect">2.2.&nbsp;Operations and Accuracy</a></h3>
                     <div class="body conbody">
                        <div class="section">
                           <p class="p">The IEEE 754 standard requires support for a handful of operations.
                              These include the arithmetic operations add, subtract, multiply,
                              divide, square root, fused-multiply-add, remainder, conversion
                              operations, scaling, sign operations, and comparisons. The results of
                              these operations are guaranteed to be the same for all implementations
                              of the standard, for a given format and rounding mode.
                           </p>
                           <p class="p">The rules and properties of mathematical arithmetic do not hold
                              directly for floating point arithmetic because of floating point's
                              limited precision. For example, the table below shows single precision
                              values <em class="ph i">A</em>, <em class="ph i">B</em>, and <em class="ph i">C</em>, and the mathematically exact
                              value of their sum computed using different
                              associativity.
                           </p>
                           <p class="p d4p_eqn_block">
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mtable columnalign="right left left" columnspacing="0.2em">
                                    <mtr>
                                       <mtd>
                                          <mi>A</mi>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>1</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.00000000000000000000001</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mi>B</mi>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>0</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.00000000000000000000001</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mi>C</mi>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.00000000000000000000001</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mrow>
                                             <mo>(</mo>
                                             <mi>A</mi>
                                             <mo>+</mo>
                                             <mi>B</mi>
                                             <mo>)</mo>
                                             <mo>+</mo>
                                             <mi>C</mi>
                                          </mrow>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.01100000000000000000001011</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mrow>
                                             <mi>A</mi>
                                             <mo>+</mo>
                                             <mo>(</mo>
                                             <mi>B</mi>
                                             <mo>+</mo>
                                             <mi>C</mi>
                                             <mo>)</mo>
                                          </mrow>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.01100000000000000000001011</mn>
                                       </mtd>
                                    </mtr>
                                 </mtable>
                              </math>
                           </p>
                           <p class="p">Mathematically, (<em class="ph i">A</em> + <em class="ph i">B</em>) + <em class="ph i">C</em> does equal <em class="ph i">A</em> +
                              (<em class="ph i">B</em> + <em class="ph i">C</em>).
                           </p>
                           <p class="p">Let rn(<em class="ph i">x</em>) denote one rounding step on <em class="ph i">x</em>. Performing
                              these same computations in single precision floating point arithmetic
                              in round-to-nearest mode according to IEEE 754, we obtain:
                           </p>
                           <p class="p d4p_eqn_block">
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mtable columnalign="right left left" columnspacing="0.2em">
                                    <mtr>
                                       <mtd>
                                          <mi>A</mi>
                                          <mo>+</mo>
                                          <mi>B</mi>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>1</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.1000000000000000000000110000...</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mtext>rn</mtext>
                                          <mo>(</mo>
                                          <mi>A</mi>
                                          <mo>+</mo>
                                          <mi>B</mi>
                                          <mo>)</mo>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>1</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.10000000000000000000010</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mi>B</mi>
                                          <mo>+</mo>
                                          <mi>C</mi>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.0010000000000000000000100100...</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mtext>rn</mtext>
                                          <mo>(</mo>
                                          <mi>B</mi>
                                          <mo>+</mo>
                                          <mi>C</mi>
                                          <mo>)</mo>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.00100000000000000000001</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mi>A</mi>
                                          <mo>+</mo>
                                          <mi>B</mi>
                                          <mo>+</mo>
                                          <mi>C</mi>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.0110000000000000000000101100...</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mtext>rn</mtext>
                                          <mo>(</mo>
                                          <mtext>rn</mtext>
                                          <mo>(</mo>
                                          <mi>A</mi>
                                          <mo>+</mo>
                                          <mi>B</mi>
                                          <mo>)</mo>
                                          <mo>+</mo>
                                          <mi>C</mi>
                                          <mo>)</mo>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.01100000000000000000010</mn>
                                       </mtd>
                                    </mtr>
                                    <mtr>
                                       <mtd>
                                          <mtext>rn</mtext>
                                          <mo>(</mo>
                                          <mi>A</mi>
                                          <mo>+</mo>
                                          <mtext>rn</mtext>
                                          <mo>(</mo>
                                          <mi>B</mi>
                                          <mo>+</mo>
                                          <mi>C</mi>
                                          <mo>)</mo>
                                          <mo>)</mo>
                                       </mtd>
                                       <mtd>
                                          <mo>=</mo>
                                       </mtd>
                                       <mtd>
                                          <msup>
                                             <mn>2</mn>
                                             <mn>3</mn>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>1.01100000000000000000001</mn>
                                       </mtd>
                                    </mtr>
                                 </mtable>
                              </math>
                           </p>
                           <p class="p">For reference, the exact, mathematical results are computed as well in
                              the table above. Not only are the results computed according to IEEE
                              754 different from the exact mathematical results, but also the results
                              corresponding to the sum rn(rn(A + B) + C) and the sum rn(A + rn(B +
                              C)) are different from each other. In this case, rn(A + rn(B + C)) is
                              closer to the correct mathematical result than rn(rn(A + B) + C).
                           </p>
                           <p class="p">This example highlights that seemingly identical computations can
                              produce different results even if all basic operations are computed in
                              compliance with IEEE 754.
                           </p>
                           <p class="p">Here, the order in which operations are executed affects the accuracy
                              of the result.  The results are independent of the host system. These
                              same results would be obtained using any microprocessor, CPU or GPU,
                              which supports single precision floating point.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="fused-multiply-add-fma"><a name="fused-multiply-add-fma" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#fused-multiply-add-fma" name="fused-multiply-add-fma" shape="rect">2.3.&nbsp;The Fused Multiply-Add (FMA)</a></h3>
                     <div class="body conbody">
                        <p class="p">In 2008 the IEEE 754 standard was revised to include the fused
                           multiply-add operation (<dfn class="term">FMA</dfn>). The FMA operation computes
                           
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mi>X</mi>
                                 <mo>×</mo>
                                 <mi>Y</mi>
                                 <mo>+</mo>
                                 <mi>Z</mi>
                                 <mo>)</mo>
                              </mrow>
                           </math> with only one rounding step.
                           Without the FMA operation the result would have to be computed as
                           
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mi>X</mi>
                                 <mo>×</mo>
                                 <mi>Y</mi>
                                 <mo>)</mo>
                                 <mo>+</mo>
                                 <mi>Z</mi>
                                 <mo>)</mo>
                              </mrow>
                           </math> with two rounding steps, one
                           for multiply and one for add.  Because the FMA uses only a single
                           rounding step the result is computed more accurately.
                        </p>
                        <p class="p">Let's consider an example to illustrate how the FMA operation works
                           using decimal arithmetic first for clarity. Let's compute 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <msup>
                                    <mi>x</mi>
                                    <mn>2</mn>
                                 </msup>
                                 <mo>−</mo>
                                 <mn>1</mn>
                              </mrow>
                           </math> with four digits of precision after the
                           decimal point, or a total of five digits of precision including the
                           leading digit before the decimal point.
                        </p>
                        <p class="p">For 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mi>x</mi>
                                 <mo>=</mo>
                                 <mn>1.0008</mn>
                              </mrow>
                           </math>, the correct
                           mathematical result is 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <msup>
                                    <mi>x</mi>
                                    <mn>2</mn>
                                 </msup>
                                 <mo>−</mo>
                                 <mn>1</mn>
                                 <mo>=</mo>
                                 <mn>1.60064</mn>
                                 <mo>×</mo>
                                 <msup>
                                    <mrow>
                                       <mn>10</mn>
                                    </mrow>
                                    <mrow>
                                       <mo>−</mo>
                                       <mn>4</mn>
                                    </mrow>
                                 </msup>
                              </mrow>
                           </math>. The
                           closest number using only four digits after the decimal point is
                           
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mn>1.6006</mn>
                                 <mo>×</mo>
                                 <msup>
                                    <mn>10</mn>
                                    <mrow>
                                       <mo>−</mo>
                                       <mn>4</mn>
                                    </mrow>
                                 </msup>
                              </mrow>
                           </math>. In this case 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <msup>
                                    <mi>x</mi>
                                    <mn>2</mn>
                                 </msup>
                                 <mo>−</mo>
                                 <mn>1</mn>
                                 <mo>)</mo>
                                 <mo>=</mo>
                                 <mn>1.6006</mn>
                                 <mo>×</mo>
                                 <msup>
                                    <mn>10</mn>
                                    <mrow>
                                       <mo>−</mo>
                                       <mn>4</mn>
                                    </mrow>
                                 </msup>
                              </mrow>
                           </math> which corresponds to the fused
                           multiply-add operation 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mi>x</mi>
                                 <mo>×</mo>
                                 <mi>x</mi>
                                 <mo>+</mo>
                                 <mo>(</mo>
                                 <mo>−</mo>
                                 <mn>1</mn>
                                 <mo>)</mo>
                                 <mo>)</mo>
                              </mrow>
                           </math>. The
                           alternative is to compute separate multiply and add steps. For the
                           multiply, 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <msup>
                                    <mi>x</mi>
                                    <mn>2</mn>
                                 </msup>
                                 <mo>=</mo>
                                 <mn>1.00160064</mn>
                              </mrow>
                           </math>, so 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <msup>
                                    <mi>x</mi>
                                    <mn>2</mn>
                                 </msup>
                                 <mo>)</mo>
                                 <mo>=</mo>
                                 <mn>1.0016</mn>
                              </mrow>
                           </math>. The final result is 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <msup>
                                    <mi>x</mi>
                                    <mn>2</mn>
                                 </msup>
                                 <mo>)</mo>
                                 <mo>−</mo>
                                 <mn>1</mn>
                                 <mo>)</mo>
                                 <mo>=</mo>
                                 <mn>1.6000</mn>
                                 <mo>×</mo>
                                 <msup>
                                    <mn>10</mn>
                                    <mrow>
                                       <mo>−</mo>
                                       <mn>4</mn>
                                    </mrow>
                                 </msup>
                              </mrow>
                           </math>.
                        </p>
                        <p class="p">Rounding the multiply and add separately yields a result that is off
                           by 0.00064&nbsp;×&nbsp;10<sup>−4</sup>. The corresponding FMA computation is wrong by only
                           0.00004&nbsp;×&nbsp;10<sup>−4</sup>, and its result is closest to the correct mathematical answer.
                           The results are summarized below:
                        </p>
                        <p class="p d4p_eqn_block">
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mtable columnalign="right left left left" columnspacing="0.2em">
                                 <mtr>
                                    <mtd>
                                       <mi>x</mi>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mn>1.0008</mn>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <msup>
                                          <mi>x</mi>
                                          <mn>2</mn>
                                       </msup>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mn>1.00160064</mn>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <msup>
                                          <mi>x</mi>
                                          <mn>2</mn>
                                       </msup>
                                       <mo>−</mo>
                                       <mn>1</mn>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mn>1.60064</mn>
                                       <mo>×</mo>
                                       <msup>
                                          <mn>10</mn>
                                          <mrow>
                                             <mo>−</mo>
                                             <mn>4</mn>
                                          </mrow>
                                       </msup>
                                       <mtext>&nbsp;&nbsp;</mtext>
                                    </mtd>
                                    <mtd>
                                       <mtext>true value</mtext>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <msup>
                                          <mi>x</mi>
                                          <mn>2</mn>
                                       </msup>
                                       <mo>−</mo>
                                       <mn>1</mn>
                                       <mo>)</mo>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mn>1.6006</mn>
                                       <mo>×</mo>
                                       <msup>
                                          <mn>10</mn>
                                          <mrow>
                                             <mo>−</mo>
                                             <mn>4</mn>
                                          </mrow>
                                       </msup>
                                    </mtd>
                                    <mtd>
                                       <mtext>fused multiply-add</mtext>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <msup>
                                          <mi>x</mi>
                                          <mn>2</mn>
                                       </msup>
                                       <mo>)</mo>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mn>1.0016</mn>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <msup>
                                          <mi>x</mi>
                                          <mn>2</mn>
                                       </msup>
                                       <mo>)</mo>
                                       <mo>−</mo>
                                       <mn>1</mn>
                                       <mo>)</mo>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mn>1.6000</mn>
                                       <mo>×</mo>
                                       <msup>
                                          <mn>10</mn>
                                          <mrow>
                                             <mo>−</mo>
                                             <mn>4</mn>
                                          </mrow>
                                       </msup>
                                    </mtd>
                                    <mtd>
                                       <mtext>multiply, then add</mtext>
                                    </mtd>
                                 </mtr>
                              </mtable>
                           </math>
                        </p>
                        <p class="p">Below is another example, using binary single precision values:</p>
                        <p class="p d4p_eqn_block">
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mtable columnalign="right left left left left" columnspacing="0.2em">
                                 <mtr>
                                    <mtd>
                                       <mi>A</mi>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd></mtd>
                                    <mtd>
                                       <msup>
                                          <mn>2</mn>
                                          <mn>0</mn>
                                       </msup>
                                    </mtd>
                                    <mtd>
                                       <mo>×</mo>
                                       <mn>1.00000000000000000000001</mn>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <mi>B</mi>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd>
                                       <mo>−</mo>
                                    </mtd>
                                    <mtd>
                                       <msup>
                                          <mn>2</mn>
                                          <mn>0</mn>
                                       </msup>
                                    </mtd>
                                    <mtd>
                                       <mo>×</mo>
                                       <mn>1.00000000000000000000010</mn>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <mi>A</mi>
                                       <mo>×</mo>
                                       <mi>A</mi>
                                       <mo>+</mo>
                                       <mi>B</mi>
                                       <mo>)</mo>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd></mtd>
                                    <mtd>
                                       <msup>
                                          <mn>2</mn>
                                          <mrow>
                                             <mo>−</mo>
                                             <mn>46</mn>
                                          </mrow>
                                       </msup>
                                    </mtd>
                                    <mtd>
                                       <mo>×</mo>
                                       <mn>1.00000000000000000000000</mn>
                                    </mtd>
                                 </mtr>
                                 <mtr>
                                    <mtd>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <mtext>rn</mtext>
                                       <mo>(</mo>
                                       <mi>A</mi>
                                       <mo>×</mo>
                                       <mi>A</mi>
                                       <mo>)</mo>
                                       <mo>+</mo>
                                       <mi>B</mi>
                                       <mo>)</mo>
                                    </mtd>
                                    <mtd>
                                       <mo>=</mo>
                                    </mtd>
                                    <mtd></mtd>
                                    <mtd>
                                       <mn>0</mn>
                                    </mtd>
                                 </mtr>
                              </mtable>
                           </math>
                        </p>
                        <p class="p">In this particular case, computing 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mi>A</mi>
                                 <mo>×</mo>
                                 <mi>A</mi>
                                 <mo>)</mo>
                                 <mo>+</mo>
                                 <mi>B</mi>
                                 <mo>)</mo>
                              </mrow>
                           </math> as an IEEE 754 multiply followed by an
                           IEEE 754 add loses all bits of precision, and the computed result is 0.
                           The alternative of computing the FMA 
                           
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mtext>rn</mtext>
                                 <mo>(</mo>
                                 <mi>A</mi>
                                 <mo>×</mo>
                                 <mi>A</mi>
                                 <mo>+</mo>
                                 <mi>B</mi>
                                 <mo>)</mo>
                              </mrow>
                           </math> provides a result equal to the
                           mathematical value. In general, the fused-multiply-add operation
                           generates more accurate results than computing one multiply followed by
                           one add. The choice of whether or not to use the fused operation depends
                           on whether the platform provides the operation and also on how the code
                           is compiled.
                        </p>
                        <p class="p"><a class="xref" href="index.html#fused-multiply-add-fma__multiply-and-add-code-fragment-and-output-for-x86-and-nvidia-fermi-gpu" shape="rect">Figure 1</a> shows CUDA C code and output corresponding to inputs
                           <em class="ph i">A</em> and <em class="ph i">B</em> and operations from the example above. The code is
                           executed on two different hardware platforms: an x86-class CPU using
                           <dfn class="term">SSE</dfn> in single precision, and an NVIDIA GPU with compute
                           capability 2.0. At the time this paper is written (Spring 2011) there are
                           no commercially available x86 CPUs which offer hardware FMA. Because of
                           this, the computed result in single precision in SSE would be 0. NVIDIA
                           GPUs with compute capability 2.0 do offer hardware FMAs, so the result of
                           executing this code will be the more accurate one by default. However,
                           both results are correct according to the IEEE 754 standard. The code
                           fragment was compiled without any special intrinsics or compiler options
                           for either platform.
                        </p>
                        <p class="p">The fused multiply-add helps avoid loss of precision
                           during subtractive cancellation. Subtractive cancellation occurs during
                           the addition of quantities of similar magnitude with opposite signs. In
                           this case many of the leading bits cancel, leaving fewer meaningful bits
                           of precision in the result. The fused multiply-add computes a
                           double-width product during the multiplication. Thus even if subtractive
                           cancellation occurs during the addition there are still enough valid bits
                           remaining in the product to get a precise result with no loss of
                           precision.
                        </p>
                        <div class="fig fignone" id="fused-multiply-add-fma__multiply-and-add-code-fragment-and-output-for-x86-and-nvidia-fermi-gpu"><a name="fused-multiply-add-fma__multiply-and-add-code-fragment-and-output-for-x86-and-nvidia-fermi-gpu" shape="rect">
                              <!-- --></a><span class="figcap">Figure 1. Multiply and Add Code Fragment and Output for x86 and NVIDIA
                              Fermi GPU</span><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">union</span>  {
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>  f;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">unsigned</span>  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span>  i;
} a,  b;
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>  r;

a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f  * a.f  + b.f;

printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"a %.8g\n"</span>, a.f); 
printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"b %.8g\n"</span>, b.f); 
printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r %.8g\n"</span>,   r);</pre><p class="p">x86-64 output:</p><pre class="pre screen" xml:space="preserve">a:  1.0000001 
b: -1.0000002 
<strong class="ph b">r:  0</strong></pre><p class="p">NVIDIA Fermi output:</p><pre class="pre screen" xml:space="preserve">a:  1.0000001 
b: -1.0000002
<strong class="ph b">r:  1.4210855e-14</strong></pre></div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="dot-product-accuracy-example"><a name="dot-product-accuracy-example" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#dot-product-accuracy-example" name="dot-product-accuracy-example" shape="rect">3.&nbsp;Dot Product: An Accuracy Example</a></h2>
                  <div class="body conbody">
                     <p class="p">Consider the problem of finding the dot product of two short vectors 
                        
                        <math xmlns="http://www.w3.org/1998/Math/MathML">
                           <semantics definitionURL="" encoding="">
                              <mover accent="true">
                                 <mi>a</mi>
                                 <mo>→</mo>
                              </mover>
                           </semantics>
                        </math>
                        and 
                        
                        <math xmlns="http://www.w3.org/1998/Math/MathML">
                           <semantics definitionURL="" encoding="">
                              <mover accent="true">
                                 <mi>b</mi>
                                 <mo>→</mo>
                              </mover>
                           </semantics>
                        </math>, both with four elements.
                     </p>
                     <ul class="sl simple">
                        <li class="sli">
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mrow>
                                 <mover accent="true">
                                    <mi>a</mi>
                                    <mo>⇀</mo>
                                 </mover>
                                 <mo>=</mo>
                                 <mrow>
                                    <mo>[</mo>
                                    <mrow>
                                       <mtable>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>a</mi>
                                                      <mn>1</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>a</mi>
                                                      <mn>2</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>a</mi>
                                                      <mn>3</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>a</mi>
                                                      <mn>4</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                       </mtable>
                                    </mrow>
                                    <mo>]</mo>
                                 </mrow>
                                 <mtext> </mtext>
                                 <mtext> </mtext>
                                 <mover accent="true">
                                    <mi>b</mi>
                                    <mo>⇀</mo>
                                 </mover>
                                 <mo>=</mo>
                                 <mrow>
                                    <mo>[</mo>
                                    <mrow>
                                       <mtable>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>b</mi>
                                                      <mn>1</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>b</mi>
                                                      <mn>2</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>b</mi>
                                                      <mn>3</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                          <mtr>
                                             <mtd>
                                                <mrow>
                                                   <msub>
                                                      <mi>b</mi>
                                                      <mn>4</mn>
                                                   </msub>
                                                </mrow>
                                             </mtd>
                                          </mtr>
                                       </mtable>
                                    </mrow>
                                    <mo>]</mo>
                                 </mrow>
                                 <mtext> </mtext>
                                 <mover accent="true">
                                    <mi>a</mi>
                                    <mo>⇀</mo>
                                 </mover>
                                 <mo>⋅</mo>
                                 <mover accent="true">
                                    <mi>b</mi>
                                    <mo>⇀</mo>
                                 </mover>
                                 <mo>=</mo>
                                 <msub>
                                    <mi>a</mi>
                                    <mn>1</mn>
                                 </msub>
                                 <msub>
                                    <mi>b</mi>
                                    <mn>1</mn>
                                 </msub>
                                 <mo>+</mo>
                                 <msub>
                                    <mi>a</mi>
                                    <mn>2</mn>
                                 </msub>
                                 <msub>
                                    <mi>b</mi>
                                    <mn>2</mn>
                                 </msub>
                                 <mo>+</mo>
                                 <msub>
                                    <mi>a</mi>
                                    <mn>3</mn>
                                 </msub>
                                 <msub>
                                    <mi>b</mi>
                                    <mn>3</mn>
                                 </msub>
                                 <mo>+</mo>
                                 <msub>
                                    <mi>a</mi>
                                    <mn>4</mn>
                                 </msub>
                                 <msub>
                                    <mi>b</mi>
                                    <mn>4</mn>
                                 </msub>
                              </mrow>
                           </math>
                        </li>
                     </ul>
                     <p class="p">This operation is easy to write mathematically, but its implementation
                        in software involves several choices. All of the strategies we will
                        discuss use purely IEEE 754 compliant operations.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="example-algorithms"><a name="example-algorithms" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#example-algorithms" name="example-algorithms" shape="rect">3.1.&nbsp;Example Algorithms</a></h3>
                     <div class="body conbody">
                        <p class="p">We present three algorithms which differ in how the multiplications,
                           additions, and possibly fused multiply-adds are organized. These
                           algorithms are presented in <a class="xref" href="index.html#example-algorithms__serial-method-to-compute-vectors-dot-product" title="The serial method uses a simple loop with separate multiplies and adds to compute the dot product of the vectors. The final result can be represented as ((((a1 x b1) + (a2 x b2)) + (a3 x b3)) + (a4 x b4))." shape="rect">Figure 2</a>, <a class="xref" href="index.html#example-algorithms__fma-method-to-compute-vectors-dot-product" title="The FMA method uses a simple loop with fused multiply-adds to compute the dot product of the vectors. The final result can be represented as a4 x b4 + (a3 x b3 + (a2 x b2 + (a1 x b1 + 0)))." shape="rect">Figure 3</a>, and <a class="xref" href="index.html#comparison__parallel-method-to-reduce-individual-elements-products-into-final-sum" title="The parallel method uses a tree to reduce all the products of individual elements into a final sum. The final result can be represented as ((a1 x b1) + (a2 x b2)) + ((a3 x b3) + (a4 x b4))." shape="rect">Figure 4</a>. Each of the three algorithms is represented
                           graphically. Individual operations are shown as circles with arrows
                           pointing from arguments to operations.
                        </p>
                        <p class="p">The simplest way to compute the dot product is using a short loop as
                           shown in <a class="xref" href="index.html#example-algorithms__serial-method-to-compute-vectors-dot-product" title="The serial method uses a simple loop with separate multiplies and adds to compute the dot product of the vectors. The final result can be represented as ((((a1 x b1) + (a2 x b2)) + (a3 x b3)) + (a4 x b4))." shape="rect">Figure 2</a>. The multiplications and additions are done
                           separately.
                        </p>
                        <div class="fig fignone" id="example-algorithms__serial-method-to-compute-vectors-dot-product"><a name="example-algorithms__serial-method-to-compute-vectors-dot-product" shape="rect">
                              <!-- --></a><span class="figcap">Figure 2. Serial Method to Compute Vectors Dot Product</span>. <span class="desc figdesc">The serial method uses a simple loop with separate multiplies and
                              adds to compute the dot product of the vectors. The final result can
                              be represented as ((((a<sub class="ph sub">1</sub> x b<sub class="ph sub">1</sub>) + (a<sub class="ph sub">2</sub> x
                              b<sub class="ph sub">2</sub>)) + (a<sub class="ph sub">3</sub> x b<sub class="ph sub">3</sub>)) + (a<sub class="ph sub">4</sub> x
                              b<sub class="ph sub">4</sub>)).</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/serial-method.png" alt="A figure of serial method to compute the vector dot product using a simple loop with separate multiplies and adds."></img></div><br clear="none"></br></div>
                        <div class="fig fignone" id="example-algorithms__fma-method-to-compute-vectors-dot-product"><a name="example-algorithms__fma-method-to-compute-vectors-dot-product" shape="rect">
                              <!-- --></a><span class="figcap">Figure 3. FMA Method to Compute Vector Dot Product</span>. <span class="desc figdesc">The FMA method uses a simple loop with fused multiply-adds to
                              compute the dot product of the vectors. The final result can be
                              represented as a<sub class="ph sub">4</sub> x b<sub class="ph sub">4</sub> + (a<sub class="ph sub">3</sub> x
                              b<sub class="ph sub">3</sub> + (a<sub class="ph sub">2</sub> x b<sub class="ph sub">2</sub> + (a<sub class="ph sub">1</sub> x
                              b<sub class="ph sub">1</sub> + 0))).</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/fma-method.png" alt="A figure of the FMA method to compute the vector dot product using a simple loop with fused multiply-adds."></img></div><br clear="none"></br></div>
                        <p class="p">A simple improvement to the algorithm is to use the fused multiply-add
                           to do the multiply and addition in one step to improve accuracy. <a class="xref" href="index.html#example-algorithms__fma-method-to-compute-vectors-dot-product" title="The FMA method uses a simple loop with fused multiply-adds to compute the dot product of the vectors. The final result can be represented as a4 x b4 + (a3 x b3 + (a2 x b2 + (a1 x b1 + 0)))." shape="rect">Figure 3</a> shows this version. 
                        </p>
                        <p class="p">Yet another way to compute the dot product is to use a
                           divide-and-conquer strategy in which we first find the dot products of
                           the first half and the second half of the vectors, then combine these
                           results using addition. This is a recursive strategy; the base case is
                           the dot product of vectors of length 1 which is a single multiply. <a class="xref" href="index.html#comparison__parallel-method-to-reduce-individual-elements-products-into-final-sum" title="The parallel method uses a tree to reduce all the products of individual elements into a final sum. The final result can be represented as ((a1 x b1) + (a2 x b2)) + ((a3 x b3) + (a4 x b4))." shape="rect">Figure 4</a> graphically illustrates this approach. We call this
                           algorithm the parallel algorithm because the two sub-problems can be
                           computed in parallel as they have no dependencies. The algorithm does
                           not require a parallel implementation, however; it can still be
                           implemented with a single thread.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="comparison"><a name="comparison" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#comparison" name="comparison" shape="rect">3.2.&nbsp;Comparison</a></h3>
                     <div class="body conbody">
                        <p class="p">All three algorithms for computing a dot product use IEEE 754 arithmetic
                           and can be implemented on any system that supports the IEEE standard. In
                           fact, an implementation of the serial algorithm on multiple systems will
                           give exactly the same result. So will implementations of the FMA or
                           parallel algorithms. However, results computed by an implementation of
                           the serial algorithm may differ from those computed by an implementation
                           of the other two algorithms.
                        </p>
                        <div class="fig fignone" id="comparison__parallel-method-to-reduce-individual-elements-products-into-final-sum"><a name="comparison__parallel-method-to-reduce-individual-elements-products-into-final-sum" shape="rect">
                              <!-- --></a><span class="figcap">Figure 4. The Parallel Method to Reduce Individual Elements Products into a
                              Final Sum</span>. <span class="desc figdesc">The parallel method uses a tree to reduce all the products of
                              individual elements into a final sum. The final result can be
                              represented as ((a<sub class="ph sub">1</sub> x b<sub class="ph sub">1</sub>) + (a<sub class="ph sub">2</sub> x
                              b<sub class="ph sub">2</sub>)) + ((a<sub class="ph sub">3</sub> x b<sub class="ph sub">3</sub>) + (a<sub class="ph sub">4</sub> x
                              b<sub class="ph sub">4</sub>)).</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/parallel-method.png" alt="A figure of the Parallel Method using a tree to reduce the products of individual elements into a final sum"></img></div><br clear="none"></br></div>
                        <div class="fig fignone" id="comparison__algorithms-results-vs-correct-mathematical-dot-product"><a name="comparison__algorithms-results-vs-correct-mathematical-dot-product" shape="rect">
                              <!-- --></a><span class="figcap">Figure 5. Algorithms Results vs. the Correct Mathematical Dot
                              Product</span>. <span class="desc figdesc">The three algorithms yield results slightly different from the
                              correct mathematical dot product.</span><table cellpadding="4" cellspacing="0" summary="" border="1" class="simpletable">
                              <tr class="sthead">
                                 <th valign="bottom" align="left" id="d54e2678" class="stentry" rowspan="1" colspan="1">method</th>
                                 <th valign="bottom" align="left" id="d54e2681" class="stentry" rowspan="1" colspan="1">result</th>
                                 <th valign="bottom" align="left" id="d54e2684" class="stentry" rowspan="1" colspan="1">float value</th>
                              </tr>
                              <tr class="strow">
                                 <td valign="top" headers="d54e2678" class="stentry" rowspan="1" colspan="1">exact</td>
                                 <td valign="top" headers="d54e2681" class="stentry" rowspan="1" colspan="1">.0559587528435...</td>
                                 <td valign="top" headers="d54e2684" class="stentry" rowspan="1" colspan="1">0x3D65350158...</td>
                              </tr>
                              <tr class="strow">
                                 <td valign="top" headers="d54e2678" class="stentry" rowspan="1" colspan="1">serial</td>
                                 <td valign="top" headers="d54e2681" class="stentry" rowspan="1" colspan="1">.0559588074</td>
                                 <td valign="top" headers="d54e2684" class="stentry" rowspan="1" colspan="1">0x3D653510</td>
                              </tr>
                              <tr class="strow">
                                 <td valign="top" headers="d54e2678" class="stentry" rowspan="1" colspan="1">FMA</td>
                                 <td valign="top" headers="d54e2681" class="stentry" rowspan="1" colspan="1">.0559587515</td>
                                 <td valign="top" headers="d54e2684" class="stentry" rowspan="1" colspan="1">0x3D653501</td>
                              </tr>
                              <tr class="strow">
                                 <td valign="top" headers="d54e2678" class="stentry" rowspan="1" colspan="1">parallel</td>
                                 <td valign="top" headers="d54e2681" class="stentry" rowspan="1" colspan="1">.0559587478</td>
                                 <td valign="top" headers="d54e2684" class="stentry" rowspan="1" colspan="1">0x3D653500</td>
                              </tr>
                           </table>
                        </div>
                        <p class="p">For example, consider the vectors:</p>
                        <ul class="sl simple">
                           <li class="sli">
                              a = [1.907607, -.7862027, 1.148311, .9604002]
                              
                           </li>
                           <li class="sli">
                              b = [-.9355000, -.6915108, 1.724470, -.7097529]
                              
                           </li>
                        </ul>
                        <p class="p">whose elements are randomly chosen values between -1 and 2. The accuracy
                           of each algorithm corresponding to these inputs is shown in <a class="xref" href="index.html#comparison__algorithms-results-vs-correct-mathematical-dot-product" title="The three algorithms yield results slightly different from the correct mathematical dot product." shape="rect">Figure 5</a>.
                        </p>
                        <p class="p">The main points to notice from the table are that each algorithm yields
                           a different result, and they are all slightly different from the correct
                           mathematical dot product. In this example the FMA version is the most
                           accurate, and the parallel algorithm is more accurate than the serial
                           algorithm. In our experience these results are typical; fused
                           multiply-add significantly increases the accuracy of results, and
                           parallel tree reductions for summation are usually much more accurate
                           than serial summation.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="cuda-and-floating-point"><a name="cuda-and-floating-point" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#cuda-and-floating-point" name="cuda-and-floating-point" shape="rect">4.&nbsp;CUDA and Floating Point</a></h2>
                  <div class="body conbody">
                     <p class="p">NVIDIA has extended the capabilities of GPUs with each successive
                        hardware generation.  Current generations of the NVIDIA architecture such
                        as <dfn class="term">Tesla C2xxx</dfn>, <dfn class="term">GTX 4xx</dfn>, and <dfn class="term">GTX
                           5xx</dfn>, support both single and double precision with <dfn class="term">IEEE
                           754</dfn> precision and include hardware support for fused
                        multiply-add in both single and double precision. Older NVIDIA
                        architectures support some of these features but not others. In CUDA, the
                        features supported by the GPU are encoded in the <dfn class="term">compute
                           capability</dfn> number. The runtime library supports a function
                        call to determine the compute capability of a GPU at runtime; the
                        <cite class="cite">CUDA C Programming Guide</cite> also includes a table of compute
                        capabilities for many different devices <a class="xref" href="index.html#references__7" shape="rect">[7]</a>.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="compute-capability-1-2-and-below"><a name="compute-capability-1-2-and-below" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#compute-capability-1-2-and-below" name="compute-capability-1-2-and-below" shape="rect">4.1.&nbsp;Compute Capability 1.2 and Below</a></h3>
                     <div class="body conbody">
                        <p class="p">Devices with compute capability <em class="ph i">1.2 and below</em> support single
                           precision only. In addition, not all operations in single precision on
                           these GPUs are <dfn class="term">IEEE 754</dfn> accurate. Denormal numbers (small
                           numbers close to zero) are flushed to zero. Operations such as square
                           root and division may not always result in the floating point value
                           closest to the correct mathematical value.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="compute-capability-1-3"><a name="compute-capability-1-3" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#compute-capability-1-3" name="compute-capability-1-3" shape="rect">4.2.&nbsp;Compute Capability 1.3</a></h3>
                     <div class="body conbody">
                        <p class="p">Devices with compute capability <em class="ph i">1.3</em> support both single and
                           double precision floating point computation. Double precision operations
                           are always <dfn class="term">IEEE 754</dfn> accurate. Single precision in devices of
                           compute capability 1.3 is unchanged from previous compute
                           capabilities.
                        </p>
                        <p class="p">In addition, the double precision hardware offers fused multiply-add. As
                           described in <a class="xref" href="index.html#fused-multiply-add-fma" shape="rect">Section&nbsp;2.3</a>, the fused
                           multiply-add operation is faster and more accurate than separate
                           multiplies and additions. There is no single precision fused multiply-add
                           operation in compute capability 1.3.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="compute-capability-2-0-and-above"><a name="compute-capability-2-0-and-above" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#compute-capability-2-0-and-above" name="compute-capability-2-0-and-above" shape="rect">4.3.&nbsp;Compute Capability 2.0 and Above</a></h3>
                     <div class="body conbody">
                        <p class="p">Devices with compute capability <em class="ph i">2.0 and above</em> support both single
                           and double precision <dfn class="term">IEEE 754</dfn> including fused multiply-add
                           in both single and double precision. Operations such as square root and
                           division will result in the floating point value closest to the correct
                           mathematical result in both single and double precision, by default.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="rounding-modes"><a name="rounding-modes" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#rounding-modes" name="rounding-modes" shape="rect">4.4.&nbsp;Rounding Modes</a></h3>
                     <div class="body conbody">
                        <p class="p">The <dfn class="term">IEEE 754</dfn> standard defines four rounding modes:
                           round-to-nearest, round towards positive, round towards negative, and
                           round towards zero. CUDA supports all four modes. By default, operations
                           use round-to-nearest. Compiler intrinsics like the ones listed in the
                           tables below can be used to select other rounding modes for individual
                           operations.
                        </p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <div class="tablenoborder">
                           <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                              <thead class="thead" align="left">
                                 <tr class="row">
                                    <th class="entry" align="right" valign="top" width="16.666666666666664%" id="d54e2894" rowspan="1" colspan="1">mode</th>
                                    <th class="entry" valign="top" width="83.33333333333334%" id="d54e2897" rowspan="1" colspan="1">interpretation</th>
                                 </tr>
                              </thead>
                              <tbody class="tbody">
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="16.666666666666664%" headers="d54e2894" rowspan="1" colspan="1">rn</td>
                                    <td class="entry" valign="top" width="83.33333333333334%" headers="d54e2897" rowspan="1" colspan="1">round to nearest, ties to even</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="16.666666666666664%" headers="d54e2894" rowspan="1" colspan="1">rz</td>
                                    <td class="entry" valign="top" width="83.33333333333334%" headers="d54e2897" rowspan="1" colspan="1">round towards zero</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="16.666666666666664%" headers="d54e2894" rowspan="1" colspan="1">ru</td>
                                    <td class="entry" valign="top" width="83.33333333333334%" headers="d54e2897" rowspan="1" colspan="1">round towards 
                                       
                                       <math xmlns="http://www.w3.org/1998/Math/MathML">
                                          <mo>+</mo>
                                          <mtext mathvariant="normal" mathsize="big">∞</mtext>
                                       </math>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="16.666666666666664%" headers="d54e2894" rowspan="1" colspan="1">rd</td>
                                    <td class="entry" valign="top" width="83.33333333333334%" headers="d54e2897" rowspan="1" colspan="1">round towards 
                                       
                                       <math xmlns="http://www.w3.org/1998/Math/MathML">
                                          <mo>−</mo>
                                          <mtext mathvariant="normal" mathsize="big">∞</mtext>
                                       </math>
                                    </td>
                                 </tr>
                              </tbody>
                           </table>
                        </div>
                        <p class="p"></p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <div class="tablenoborder">
                           <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                              <tbody class="tbody">
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">x + y</samp></p>
                                       <p class="p"><samp class="ph codeph">__fadd_[rn | rz | ru | rd] (x, y)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">addition</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">x * y</samp></p>
                                       <p class="p"><samp class="ph codeph">__fmul_[rn | rz | ru | rd] (x, y)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">multiplication</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">fmaf (x, y, z)</samp></p>
                                       <p class="p"><samp class="ph codeph">__fmaf_[rn | rz | ru | rd] (x, y, z)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">FMA</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">1.0f / x</samp></p>
                                       <p class="p"><samp class="ph codeph">__frcp_[rn | rz | ru | rd] (x)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">reciprocal</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">x / y</samp></p>
                                       <p class="p"><samp class="ph codeph">__fdiv_[rn | rz | ru | rd] (x, y)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">division</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">sqrtf(x)</samp></p>
                                       <p class="p"><samp class="ph codeph">__fsqrt_[rn | rz | ru | rd] (x)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">square root</p>
                                    </td>
                                 </tr>
                              </tbody>
                           </table>
                        </div>
                        <p class="p"></p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <p class="p"></p>
                        <div class="tablenoborder">
                           <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                              <tbody class="tbody">
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">x + y</samp></p>
                                       <p class="p"><samp class="ph codeph">__dadd_[rn | rz | ru | rd] (x, y)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">addition</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">x * y</samp></p>
                                       <p class="p"><samp class="ph codeph">__dmul_[rn | rz | ru | rd] (x, y)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">multiplication</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">fma (x, y, z)</samp></p>
                                       <p class="p"><samp class="ph codeph">__fma_[rn | rz | ru | rd] (x, y, z)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">FMA</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">1.0 / x</samp></p>
                                       <p class="p"><samp class="ph codeph">__drcp_[rn | rz | ru | rd] (x)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">reciprocal</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">x / y</samp></p>
                                       <p class="p"><samp class="ph codeph">__ddiv_[rn | rz | ru | rd] (x, y)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">division</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" align="right" valign="top" width="83.33333333333334%" rowspan="1" colspan="1">
                                       <p class="p"><samp class="ph codeph">sqrt(x)</samp></p>
                                       <p class="p"><samp class="ph codeph">__dsqrt_[rn | rz | ru | rd] (x)</samp></p>
                                    </td>
                                    <td class="entry" valign="top" width="16.666666666666664%" rowspan="1" colspan="1">
                                       <p class="p">square root</p>
                                    </td>
                                 </tr>
                              </tbody>
                           </table>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="controlling-fused-multiply-add"><a name="controlling-fused-multiply-add" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#controlling-fused-multiply-add" name="controlling-fused-multiply-add" shape="rect">4.5.&nbsp;Controlling Fused Multiply-add</a></h3>
                     <div class="body conbody">
                        <p class="p">In general, the fused multiply-add operation is faster and more accurate
                           than performing separate multiply and add operations. However, on
                           occasion you may wish to <em class="ph i">disable</em> the merging of multiplies and
                           adds into fused multiply-add instructions. To inhibit this optimization
                           one can write the multiplies and additions using intrinsics with explicit
                           rounding mode as shown in the previous tables. Operations written
                           directly as intrinsics are guaranteed to remain independent and will not
                           be merged into fused multiply-add instructions. With CUDA Fortran it is
                           possible to disable FMA merging via a compiler flag.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="compiler-flags"><a name="compiler-flags" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#compiler-flags" name="compiler-flags" shape="rect">4.6.&nbsp;Compiler Flags</a></h3>
                     <div class="body conbody">
                        <p class="p">Compiler flags relevant to <dfn class="term">IEEE 754</dfn> operations are
                           <samp class="ph codeph">-ftz={true|false}</samp>,
                           <samp class="ph codeph">-prec-div={true|false}</samp>, and
                           <samp class="ph codeph">-prec-sqrt={true|false}</samp>. These flags
                           control single precision operations on devices of compute capability of
                           2.0 or later.
                        </p>
                        <div class="tablenoborder">
                           <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                              <thead class="thead" align="left">
                                 <tr class="row">
                                    <th class="entry" valign="top" width="50%" id="d54e3275" rowspan="1" colspan="1">mode</th>
                                    <th class="entry" valign="top" width="50%" id="d54e3278" rowspan="1" colspan="1">flags</th>
                                 </tr>
                              </thead>
                              <tbody class="tbody">
                                 <tr class="row">
                                    <td class="entry" valign="top" width="50%" headers="d54e3275" rowspan="1" colspan="1">
                                       <p class="p">IEEE 754 mode (default)</p>
                                    </td>
                                    <td class="entry" valign="top" width="50%" headers="d54e3278" rowspan="1" colspan="1">
                                       <p class="p">-ftz=false</p>
                                       <p class="p">-prec-div=true</p>
                                       <p class="p">-prec-sqrt=true</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="50%" headers="d54e3275" rowspan="1" colspan="1">
                                       <p class="p">fast mode</p>
                                    </td>
                                    <td class="entry" valign="top" width="50%" headers="d54e3278" rowspan="1" colspan="1">
                                       <p class="p">-ftz=true</p>
                                       <p class="p">-prec-div=false</p>
                                       <p class="p">-prec-sqrt=false</p>
                                    </td>
                                 </tr>
                              </tbody>
                           </table>
                        </div>
                        <p class="p">The default <dfn class="term">IEEE 754 mode</dfn> means that single precision
                           operations are correctly rounded and support denormals, as per the IEEE
                           754 standard. In the <dfn class="term">fast mode</dfn> denormal numbers are flushed
                           to zero, and the operations division and square root are not computed to
                           the nearest floating point value. The flags have no effect on double
                           precision or on devices of compute capability below 2.0.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="differences-from-x86"><a name="differences-from-x86" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#differences-from-x86" name="differences-from-x86" shape="rect">4.7.&nbsp;Differences from x86</a></h3>
                     <div class="body conbody">
                        <p class="p">NVIDIA GPUs differ from the x86 architecture in that rounding modes are
                           encoded within each floating point instruction instead of dynamically
                           using a floating point control word. Trap handlers for floating point
                           exceptions are not supported. On the GPU there is no status flag to
                           indicate when calculations have overflowed, underflowed, or have involved
                           inexact arithmetic. Like <dfn class="term">SSE</dfn>, the precision of each GPU
                           operation is encoded in the instruction (for x87 the precision is
                           controlled dynamically by the floating point control word).
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="considerations-for-heterogeneous-world"><a name="considerations-for-heterogeneous-world" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#considerations-for-heterogeneous-world" name="considerations-for-heterogeneous-world" shape="rect">5.&nbsp;Considerations for a Heterogeneous World</a></h2>
                  <div class="topic concept nested1" id="mathematical-function-accuracy"><a name="mathematical-function-accuracy" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#mathematical-function-accuracy" name="mathematical-function-accuracy" shape="rect">5.1.&nbsp;Mathematical Function Accuracy</a></h3>
                     <div class="body conbody">
                        <p class="p">So far we have only considered simple math operations such as addition,
                           multiplication, division, and square root. These operations are simple
                           enough that computing the best floating point result (e.g., the closest in
                           round-to-nearest) is reasonable. For other mathematical operations
                           computing the best floating point result is harder.
                        </p>
                        <p class="p">The problem is called the <dfn class="term">table maker's dilemma</dfn>. To
                           guarantee the correctly rounded result, it is not generally enough to
                           compute the function to a fixed high accuracy. There might still be rare
                           cases where the error in the high accuracy result affects the rounding
                           step at the lower accuracy.
                        </p>
                        <p class="p">It is possible to solve the dilemma for particular functions by doing
                           mathematical analysis and formal proofs <a class="xref" href="index.html#references__4" shape="rect">[4]</a>, but most math libraries
                           choose instead to give up the guarantee of correct rounding. Instead they
                           provide implementations of math functions and document bounds on the
                           relative error of the functions over the input range. For example, the
                           double precision <samp class="ph codeph">sin</samp> function in CUDA is guaranteed to
                           be accurate to within 2 units in the last place (ulp) of the correctly
                           rounded result. In other words, the difference between the computed
                           result and the mathematical result is at most ±2 with respect to the
                           least significant bit position of the fraction part of the floating point
                           result.
                        </p>
                        <p class="p">For most inputs the <samp class="ph codeph">sin</samp> function produces the correctly
                           rounded result. Producing different results, even on the same system, is
                           not uncommon when using a mix of precisions, libraries and hardware. Take for example the
                           C code sequence shown in <a class="xref" href="index.html#mathematical-function-accuracy__cosine-computation-using-glibc-math-library-when-compiled-with-m32-and-m64" title="The computation of cosine using the glibc Math Library yields different results when compiled with -m32 and -m64." shape="rect">Figure 6</a>. We compiled the code sequence on a 64-bit x86 platform
                           using gcc version 4.4.3 (Ubuntu 4.4.3-4ubuntu5).
                        </p>
                        <p class="p">This shows that the result of computing cos(5992555.0) using a common
                           library differs depending on whether the code is compiled in 32-bit mode
                           or 64-bit mode.
                        </p>
                        <p class="p">The consequence is that different math libraries cannot be expected to
                           compute exactly the same result for a given input. This applies to GPU
                           programming as well. Functions compiled for the GPU will use the NVIDIA
                           CUDA math library implementation while functions compiled for the CPU
                           will use the host compiler math library implementation (e.g.,
                           <dfn class="term">glibc</dfn> on Linux). Because these implementations are
                           independent and neither is guaranteed to be correctly rounded, the
                           results will often differ slightly.
                        </p>
                        <div class="fig fignone" id="mathematical-function-accuracy__cosine-computation-using-glibc-math-library-when-compiled-with-m32-and-m64"><a name="mathematical-function-accuracy__cosine-computation-using-glibc-math-library-when-compiled-with-m32-and-m64" shape="rect">
                              <!-- --></a><span class="figcap">Figure 6. Cosine Computations using the <samp class="ph codeph">glibc</samp> Math
                              Library</span>. <span class="desc figdesc">The computation of cosine using the <samp class="ph codeph">glibc</samp> Math
                              Library yields different results when compiled with
                              <samp class="ph codeph">-m32</samp> and <samp class="ph codeph">-m64</samp>.</span><pre class="pre screen" xml:space="preserve">volatile float x = 5992555.0;
printf("cos(%f): %.10g\n", x, cos(x));

gcc test.c -lm -m64
<strong class="ph b">cos(5992555.000000): 3.320904615e-07</strong>
          
gcc test.c -lm -m32
<strong class="ph b">cos(5992555.000000): 3.320904692e-07</strong></pre></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="x87-sse"><a name="x87-sse" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#x87-sse" name="x87-sse" shape="rect">5.2.&nbsp;x87 and SSE</a></h3>
                     <div class="body conbody">
                        <p class="p">One of the unfortunate realities of C compilers is that they are often
                           poor at preserving IEEE 754 semantics of floating point operations <a class="xref" href="index.html#references__6" shape="rect">[6]</a>. This can be
                           particularly confusing on platforms that support x87 and SSE operations.
                           Just like CUDA operations, SSE operations are performed on single or
                           double precision values, while x87 operations often use an additional
                           internal 80-bit precision format. Sometimes the results of a computation
                           using x87 can depend on whether an intermediate result was allocated to a
                           register or stored to memory. Values stored to memory are rounded to the
                           declared precision (e.g., single precision for <samp class="ph codeph">float</samp> and
                           double precision for <samp class="ph codeph">double</samp>). Values kept in registers
                           can remain in extended precision.  Also, x87 instructions will often be
                           used by default for 32-bit compiles but SSE instructions will be used by
                           default for 64-bit compiles.
                        </p>
                        <p class="p">Because of these issues, guaranteeing a specific precision level on the
                           CPU can sometimes be tricky. When comparing CPU results to results
                           computed on the GPU, it is generally best to compare using SSE
                           instructions. SSE instructions follow IEEE 754 for single and
                           double precision.
                        </p>
                        <p class="p">On 32-bit x86 targets without SSE it can be helpful to declare
                           variables using <samp class="ph codeph">volatile</samp> and force floating point values
                           to be stored to memory (<samp class="ph codeph">/Op</samp> in Visual Studio and
                           <samp class="ph codeph">-ffloat-store</samp> in <samp class="ph codeph">gcc</samp>). This moves
                           results from extended precision registers into memory, where the
                           precision is precisely single or double precision. Alternately, the x87
                           control word can be updated to set the precision to 24 or 53 bits using
                           the assembly instruction <samp class="ph codeph">fldcw</samp> or a compiler option such
                           as <samp class="ph codeph">-mpc32</samp> or <samp class="ph codeph">-mpc64</samp> in
                           <samp class="ph codeph">gcc</samp>.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="core-counts"><a name="core-counts" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#core-counts" name="core-counts" shape="rect">5.3.&nbsp;Core Counts</a></h3>
                     <div class="body conbody">
                        <p class="p">As we have shown in <a class="xref" href="index.html#dot-product-accuracy-example" shape="rect">Chapter&nbsp;3</a>,
                           the final values computed using <dfn class="term">IEEE 754</dfn> arithmetic can
                           depend on implementation choices such as whether to use fused
                           multiply-add or whether additions are organized in series or parallel.
                           These differences affect computation on the CPU and on the GPU.
                        </p>
                        <p class="p">One way such differences can arise is from differences between the
                           number of concurrent threads involved in a computation. On the GPU, a
                           common design pattern is to have all threads in a block coordinate to do
                           a parallel reduction on data within the block, followed by a serial
                           reduction of the results from each block.  Changing the number of threads
                           per block reorganizes the reduction; if the reduction is addition, then
                           the change rearranges parentheses in the long string of additions.
                        </p>
                        <p class="p">Even if the same general strategy such as parallel reduction is used on
                           the CPU and GPU, it is common to have widely different numbers of threads
                           on the GPU compared to the CPU.  For example, the GPU implementation
                           might launch blocks with 128 threads per block, while the CPU
                           implementation might use 4 threads in total.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="verifying-gpu-results"><a name="verifying-gpu-results" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#verifying-gpu-results" name="verifying-gpu-results" shape="rect">5.4.&nbsp;Verifying GPU Results</a></h3>
                     <div class="body conbody">
                        <p class="p">The same inputs will give the same results for individual <dfn class="term">IEEE
                              754</dfn> operations to a given precision on the CPU and GPU. As we
                           have explained, there are many reasons why the same sequence of
                           operations may not be performed on the CPU and GPU. The GPU has fused
                           multiply-add while the CPU does not. Parallelizing algorithms may
                           rearrange operations, yielding different numeric results. The CPU may be
                           computing results in a precision higher than expected. Finally, many
                           common mathematical functions are not required by the IEEE 754 standard
                           to be correctly rounded so should not be expected to yield identical
                           results between implementations.
                        </p>
                        <p class="p">When porting numeric code from the CPU to the GPU, it of course makes
                           sense to use the x86 CPU results as a reference. But differences between
                           the CPU result and GPU result must be interpreted carefully. Differences
                           are not automatically evidence that the result computed by the GPU is
                           wrong or that there is a problem on the GPU.
                        </p>
                        <p class="p">Computing results in a high precision and then comparing to results
                           computed in a lower precision can be helpful to see if the lower
                           precision is adequate for a particular application. However, rounding
                           high precision results to a lower precision is not equivalent to
                           performing the entire computation in lower precision. This can sometimes
                           be a problem when using x87 and comparing results against the GPU. The
                           results of the CPU may be computed to an unexpectedly high extended
                           precision for some or all of the operations. The GPU result will be
                           computed using single or double precision only.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="concrete-recommendations"><a name="concrete-recommendations" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#concrete-recommendations" name="concrete-recommendations" shape="rect">6.&nbsp;Concrete Recommendations</a></h2>
                  <div class="body conbody">
                     <p class="p">The key points we have covered are the following:</p>
                     <dl class="dl">
                        <dt class="dt dlterm">Use the fused multiply-add operator.</dt>
                        <dd class="dd">The fused multiply-add operator on the GPU has high performance
                           and increases the accuracy of computations. No special flags
                           or function calls are needed to gain this benefit in CUDA
                           programs. Understand that a hardware fused multiply-add
                           operation is not yet available on the CPU, which can cause
                           differences in numerical results.
                        </dd>
                        <dt class="dt dlterm">Compare results carefully.</dt>
                        <dd class="dd">Even in the strict world of <dfn class="term">IEEE 754</dfn> operations,
                           minor details such as organization of parentheses or thread
                           counts can affect the final result. Take this into account
                           when doing comparisons between implementations.
                        </dd>
                        <dt class="dt dlterm">Know the capabilities of your GPU.</dt>
                        <dd class="dd">The numerical capabilities are encoded in the compute capability
                           number of your GPU. Devices of compute capability 2.0 and
                           later are capable of single and double precision arithmetic
                           following the IEEE 754 standard, and have hardware units for
                           performing fused multiply-add in both single and double
                           precision.
                        </dd>
                        <dt class="dt dlterm">Take advantage of the CUDA math library functions.</dt>
                        <dd class="dd">These functions are documented in Appendix C of the <cite class="cite">CUDA C
                              Programming Guide </cite><a class="xref" href="index.html#references__7" shape="rect">[7]</a>. The math library
                           includes all the math functions listed in the C99 standard <a class="xref" href="index.html#references__3" shape="rect">[3]</a> plus some
                           additional useful functions.  These functions have been tuned for a
                           reasonable compromise between performance and accuracy.
                        </dd>
                        <dd class="dd">We constantly strive to improve the quality of our math library
                           functionality. Please let us know about any functions that you
                           require that we do not provide, or if the accuracy or performance of
                           any of our functions does not meet your needs. Leave comments in the
                           <cite class="cite">NVIDIA CUDA forum</cite><a name="fnsrc_1" href="#fntarg_1" shape="rect"><sup>1</sup></a>
                           or join the <cite class="cite">Registered Developer Program</cite><a name="fnsrc_2" href="#fntarg_2" shape="rect"><sup>2</sup></a>
                           and file a bug with your feedback.
                        </dd>
                     </dl>
                  </div>
               </div>
               <div class="topic reference nested0" id="acknowledgements"><a name="acknowledgements" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#acknowledgements" name="acknowledgements" shape="rect">A.&nbsp;Acknowledgements</a></h2>
                  <div class="body refbody">
                     <div class="section">
                        <p class="p">This paper was authored by Nathan Whitehead and Alex Fit-Florea for
                           NVIDIA Corporation.
                        </p>
                        <p class="p">Thanks to Ujval Kapasi, Kurt Wall, Paul Sidenblad, Massimiliano
                           Fatica, Everett Phillips, Norbert Juffa, and Will Ramey for their
                           helpful comments and suggestions.
                        </p>
                        <p class="p">Permission to make digital or hard copies of all or part of this work
                           for any use is granted without fee provided that copies bear this
                           notice and the full citation on the first page.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic reference nested0" id="references"><a name="references" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#references" name="references" shape="rect">B.&nbsp;References</a></h2>
                  <div class="body refbody">
                     <div class="section" id="references__1"><a name="references__1" shape="rect">
                           <!-- --></a><p class="p">[1] <cite class="cite">ANSI/IEEE 754-1985. American National Standard - IEEE
                              Standard for Binary Floating-Point Arithmetic. American National
                              Standards Institute, Inc., New York, 1985.</cite></p>
                     </div>
                     <div class="section" id="references__2"><a name="references__2" shape="rect">
                           <!-- --></a><p class="p">[2] <cite class="cite">IEEE 754-2008. IEEE 754-2008 Standard for Floating-Point
                              Arithmetic. August 2008.</cite></p>
                     </div>
                     <div class="section" id="references__3"><a name="references__3" shape="rect">
                           <!-- --></a><p class="p">[3] <cite class="cite">ISO/IEC 9899:1999(E). Programming languages - C. American
                              National Standards Institute, Inc., New York, 1999.</cite></p>
                     </div>
                     <div class="section" id="references__4"><a name="references__4" shape="rect">
                           <!-- --></a><p class="p">[4] <cite class="cite">Catherine Daramy-Loirat, David Defour, Florent de Dinechin,
                              Matthieu Gallet, Nicolas Gast, and Jean-Michel Muller. CR-LIBM: A
                              library of correctly rounded elementary functions in
                              double-precision, February 2005.</cite></p>
                     </div>
                     <div class="section" id="references__5"><a name="references__5" shape="rect">
                           <!-- --></a><p class="p">[5] <cite class="cite">David Goldberg. What every computer scientist should know
                              about floating-point arithmetic. ACM Computing Surveys, March
                              1991.</cite> Edited reprint available at: <a class="xref" href="http://download.oracle.com/docs/cd/E19957-01/806-3568/ncg_goldberg.html" target="_blank" shape="rect">http://download.oracle.com/docs/cd/E19957-01/806-3568/ncg_goldberg.html</a>.
                        </p>
                     </div>
                     <div class="section" id="references__6"><a name="references__6" shape="rect">
                           <!-- --></a><p class="p">[6] <cite class="cite">David Monniaux. The pitfalls of verifying floating-point
                              computations. ACM Transactions on Programming Languages and Systems,
                              May 2008.</cite></p>
                     </div>
                     <div class="section" id="references__7"><a name="references__7" shape="rect">
                           <!-- --></a><p class="p">[7] <cite class="cite">NVIDIA. CUDA C Programming Guide Version 4.0,
                              2011.</cite></p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="notices-header"><a name="notices-header" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#notices-header" name="notices-header" shape="rect">Notices</a></h2>
                  <div class="topic reference nested1" id="notice"><a name="notice" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#notice" name="notice" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Notice</h3>
                           <p class="p">ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS, LISTS, AND OTHER DOCUMENTS (TOGETHER AND
                              SEPARATELY, "MATERIALS") ARE BEING PROVIDED "AS IS." NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE
                              WITH RESPECT TO THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS
                              FOR A PARTICULAR PURPOSE. 
                           </p>
                           <p class="p">Information furnished is believed to be accurate and reliable. However, NVIDIA Corporation assumes no responsibility for the
                              consequences of use of such information or for any infringement of patents or other rights of third parties that may result
                              from its use. No license is granted by implication or otherwise under any patent rights of NVIDIA Corporation. Specifications
                              mentioned in this publication are subject to change without notice. This publication supersedes and replaces all other information
                              previously supplied. NVIDIA Corporation products are not authorized as critical components in life support devices or systems
                              without express written approval of NVIDIA Corporation.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="trademarks"><a name="trademarks" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#trademarks" name="trademarks" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Trademarks</h3>
                           <p class="p">NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation
                              in the U.S. and other countries.  Other company and product names may be trademarks of
                              the respective companies with which they are associated.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="copyright-past-to-present"><a name="copyright-past-to-present" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#copyright-past-to-present" name="copyright-past-to-present" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Copyright</h3>
                           <p class="p">© <span class="ph">2011</span>-<span class="ph">2014</span> NVIDIA
                              Corporation. All rights reserved.
                           </p>
                           <p class="p">This product includes software developed by the Syncro Soft SRL (http://www.sync.ro/).</p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="fn"><a name="fntarg_1" href="#fnsrc_1" shape="rect"><sup>1</sup></a><a class="xref" href="http://forums.nvidia.com/index.php?showforum=62" target="_blank" shape="rect">http://forums.nvidia.com/index.php?showforum=62</a></div>
               <div class="fn"><a name="fntarg_2" href="#fnsrc_2" shape="rect"><sup>2</sup></a><a class="xref" href="http://developer.nvidia.com/join-nvidia-registered-developer-program" target="_blank" shape="rect">http://developer.nvidia.com/</a><a class="xref" href="http://developer.nvidia.com/join-nvidia-registered-developer-program" target="_blank" shape="rect">join-nvidia-registered-developer-program</a></div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script></body>
</html>