Sophie: nvidia-cuda-toolkit-devel-10.1.168-1.2.mga7.nonfree x86

nvidia-cuda-toolkit-devel-10.1.168-1.2.mga7.nonfree.x86_64.rpm

<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="CUDA C Best Practices Guide"></meta>
      <meta name="abstract" content="The programming guide to using the CUDA Toolkit to obtain the best performance from NVIDIA GPUs."></meta>
      <meta name="description" content="The programming guide to using the CUDA Toolkit to obtain the best performance from NVIDIA GPUs."></meta>
      <meta name="DC.Coverage" content="Programming Guides"></meta>
      <meta name="DC.subject" content="CUDA Best Practices, CUDA APOD, CUDA optimization"></meta>
      <meta name="keywords" content="CUDA Best Practices, CUDA APOD, CUDA optimization"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="abstract"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>Best Practices Guide :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="//assets.adobedtm.com/b92787824f2e0e9b68dc2e993f9bd995339fe417/satelliteLib-7ba51e58dc61bcb0e9311aadd02a0108ab24cc6c.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit 
                  
                  
                  v10.1.168</a></div>
            <div class="category"><a href="index.html" title="Best Practices Guide">Best Practices Guide</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="#preface">Preface</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#what-is-this-document">What Is This Document?</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#who-should-read-this-guide">Who Should Read This Guide?</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#assess-parallelize-optimize-deploy">Assess, Parallelize, Optimize, Deploy</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#assess">Assess</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#parallelize">Parallelize</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#optimize">Optimize</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#deploy">Deploy</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#recommendations-and-best-practices-preface">Recommendations and Best Practices</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#assessing-your-application">1.&nbsp;Assessing Your Application</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#heterogeneous-computing">2.&nbsp;Heterogeneous Computing</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#differences-between-host-and-device">2.1.&nbsp;Differences between Host and Device</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#what-runs-on-cuda-enabled-device">2.2.&nbsp;What Runs on a CUDA-Enabled Device?</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#application-profiling">3.&nbsp;Application Profiling</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#profile">3.1.&nbsp;Profile</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#creating-profile">3.1.1.&nbsp;Creating the Profile</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#identifying-hotspots">3.1.2.&nbsp;Identifying Hotspots</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#understanding-scaling">3.1.3.&nbsp;Understanding Scaling</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#strong-scaling-and-amdahls-law">3.1.3.1.&nbsp;Strong Scaling and Amdahl's Law</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#weak-scaling-and-gustafsons-law">3.1.3.2.&nbsp;Weak Scaling and Gustafson's Law</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#applying-strong-and-weak-scaling">3.1.3.3.&nbsp;Applying Strong and Weak Scaling</a></div>
                                 </li>
                              </ul>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#parallelizing-your-application">4.&nbsp;Parallelizing Your Application</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#getting-started">5.&nbsp;Getting Started</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#parallel-libraries">5.1.&nbsp;Parallel Libraries</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#parallelizing-compilers">5.2.&nbsp;Parallelizing Compilers</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#coding-to-expose-parallelism">5.3.&nbsp;Coding to Expose Parallelism</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#getting-right-answer">6.&nbsp;Getting the Right Answer</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#verification">6.1.&nbsp;Verification</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#reference-comparison">6.1.1.&nbsp;Reference Comparison</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#unit-testing">6.1.2.&nbsp;Unit Testing</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#debugging">6.2.&nbsp;Debugging</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#numerical-accuracy-and-precision">6.3.&nbsp;Numerical Accuracy and Precision</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#single-vs-double-precision">6.3.1.&nbsp;Single vs. Double Precision</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#floating-point-math-is-not-associative">6.3.2.&nbsp;Floating Point Math Is not Associative</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#promotions-to-doubles-and-truncations-to-floats">6.3.3.&nbsp;Promotions to Doubles and Truncations to Floats</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#ieee-754-compliance">6.3.4.&nbsp;IEEE 754 Compliance</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#x86-80-bit-computations">6.3.5.&nbsp;x86 80-bit Computations</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#optimizing-cuda-applications">7.&nbsp;Optimizing CUDA Applications</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#performance-metrics">8.&nbsp;Performance Metrics</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#timing">8.1.&nbsp;Timing</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#using-cpu-timers">8.1.1.&nbsp;Using CPU Timers</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#using-cuda-gpu-timers">8.1.2.&nbsp;Using CUDA GPU Timers</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#bandwidth">8.2.&nbsp;Bandwidth</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#theoretical-bandwidth-calculation">8.2.1.&nbsp;Theoretical Bandwidth Calculation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#effective-bandwidth-calculation">8.2.2.&nbsp;Effective Bandwidth Calculation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#throughput-reported-by-visual-profiler">8.2.3.&nbsp;Throughput Reported by Visual Profiler</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#memory-optimizations">9.&nbsp;Memory Optimizations</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#data-transfer-between-host-and-device">9.1.&nbsp;Data Transfer Between Host and Device</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#pinned-memory">9.1.1.&nbsp;Pinned Memory</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#asynchronous-transfers-and-overlapping-transfers-with-computation">9.1.2.&nbsp;Asynchronous and Overlapping Transfers with Computation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#zero-copy">9.1.3.&nbsp;Zero Copy</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#unified-virtual-addressing">9.1.4.&nbsp;Unified Virtual Addressing</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#device-memory-spaces">9.2.&nbsp;Device Memory Spaces</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#coalesced-access-to-global-memory">9.2.1.&nbsp;Coalesced Access to Global Memory</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#simple-access-pattern">9.2.1.1.&nbsp;A Simple Access Pattern</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#sequential-but-misaligned-access-pattern">9.2.1.2.&nbsp;A Sequential but Misaligned Access Pattern</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#effects-of-misaligned-accesses">9.2.1.3.&nbsp;Effects of Misaligned Accesses</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#strided-accesses">9.2.1.4.&nbsp;Strided Accesses</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#shared-memory">9.2.2.&nbsp;Shared Memory</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#shared-memory-and-memory-banks">9.2.2.1.&nbsp;Shared Memory and Memory Banks</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#shared-memory-in-matrix-multiplication-c-ab">9.2.2.2.&nbsp;Shared Memory in Matrix Multiplication (C=AB)</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#shared-memory-in-matrix-multiplication-c-aa">9.2.2.3.&nbsp;Shared Memory in Matrix Multiplication (C=AAT)</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#local-memory">9.2.3.&nbsp;Local Memory</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#texture-memory">9.2.4.&nbsp;Texture Memory</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#additional-texture-capabilities">9.2.4.1.&nbsp;Additional Texture Capabilities</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#constant-memory">9.2.5.&nbsp;Constant Memory</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#registers">9.2.6.&nbsp;Registers</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#register-pressure">9.2.6.1.&nbsp;Register Pressure</a></div>
                                 </li>
                              </ul>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#allocation">9.3.&nbsp;Allocation</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#numa-best-practices">9.4.&nbsp;NUMA Best Practices</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#execution-configuration-optimizations">10.&nbsp;Execution Configuration Optimizations</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#occupancy">10.1.&nbsp;Occupancy</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#calculating-occupancy">10.1.1.&nbsp;Calculating Occupancy</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#concurrent-kernel-execution">10.2.&nbsp;Concurrent Kernel Execution</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#multiple-contexts">10.3.&nbsp;Multiple contexts</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#hiding-register-dependencies">10.4.&nbsp;Hiding Register Dependencies</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#thread-and-block-heuristics">10.5.&nbsp;Thread and Block Heuristics</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#effects-of-shared-memory">10.6.&nbsp;Effects of Shared Memory</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#instruction-optimization">11.&nbsp;Instruction Optimization</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#arithmetic-instructions">11.1.&nbsp;Arithmetic Instructions</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#division-and-modulo-operations">11.1.1.&nbsp;Division Modulo Operations</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#reciprocal-square-root">11.1.2.&nbsp;Reciprocal Square Root</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#other-arithmetic-instructions">11.1.3.&nbsp;Other Arithmetic Instructions</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#exponentiation-small-fractions">11.1.4.&nbsp;Exponentiation With Small Fractional Arguments</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#math-libraries">11.1.5.&nbsp;Math Libraries</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#precision-related-compiler-flags">11.1.6.&nbsp;Precision-related Compiler Flags</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#memory-instructions">11.2.&nbsp;Memory Instructions</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#control-flow">12.&nbsp;Control Flow</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#branching-and-divergence">12.1.&nbsp;Branching and Divergence</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#branch-predication">12.2.&nbsp;Branch Predication</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#loop-counters-signed-vs-unsigned">12.3.&nbsp;Loop Counters Signed vs. Unsigned</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#synchronizing-divergent-threads">12.4.&nbsp;Synchronizing Divergent Threads in a Loop</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#deploying-cuda-applications">13.&nbsp;Deploying CUDA Applications</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#understanding-programming-environment">14.&nbsp;Understanding the Programming Environment</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#cuda-compute-capability">14.1.&nbsp;CUDA Compute Capability</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#additional-hardware-data">14.2.&nbsp;Additional Hardware Data</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#which-compute-capability-to-target">14.3.&nbsp;Which Compute Capability Target</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cuda-runtime">14.4.&nbsp;CUDA Runtime</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#cuda-compatibility-and-upgrades">15.&nbsp;CUDA Compatibility and Upgrades</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#cuda-runtime-and-driver-api-version">15.1.&nbsp;CUDA Runtime and Driver API Version</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#standard-upgrade-path">15.2.&nbsp;Standard Upgrade Path</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#flexible-upgrade-path">15.3.&nbsp;Flexible Upgrade Path</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cuda-compatibility-package">15.4.&nbsp;CUDA Compatibility Platform Package</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#extended-nvidia-smi">15.5.&nbsp;Extended nvidia-smi</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#preparing-for-deployment">16.&nbsp;Preparing for Deployment</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#testing-for-cuda-availability">16.1.&nbsp;Testing for CUDA Availability</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#error-handling">16.2.&nbsp;Error Handling</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#building-for-maximum-compatibility">16.3.&nbsp;Building for Maximum Compatibility</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#distributing-cuda-runtime-and-libraries">16.4.&nbsp;Distributing the CUDA Runtime and Libraries</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#redistribution">16.4.1.&nbsp;CUDA Toolkit Library Redistribution</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#redistribution--which-files">16.4.1.1.&nbsp;Which Files to Redistribute</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#redistribution--where-to-install">16.4.1.2.&nbsp;Where to Install Redistributed CUDA Libraries</a></div>
                                 </li>
                              </ul>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#deployment-infrastructure-tools">17.&nbsp;Deployment Infrastructure Tools</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#nvidia-smi">17.1.&nbsp;Nvidia-SMI</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#queryable-state">17.1.1.&nbsp;Queryable state</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#modifiable-state">17.1.2.&nbsp;Modifiable state</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#nvml">17.2.&nbsp;NVML</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cluster-management-tools">17.3.&nbsp;Cluster Management Tools</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#compiler-jit-cache-management">17.4.&nbsp;Compiler JIT Cache Management Tools</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cuda-visible-devices">17.5.&nbsp;CUDA_VISIBLE_DEVICES</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#recommendations-and-best-practices-appendix">A.&nbsp;Recommendations and Best Practices</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#overall-performance-optimization-strategies">A.1.&nbsp;Overall Performance Optimization Strategies</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#nvcc-compiler-switches">B.&nbsp;nvcc Compiler Switches</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#nvcc">B.1.&nbsp;nvcc</a></div>
                     </li>
                  </ul>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="eqn-warning">This document includes math equations
                  (highlighted in red) which are best viewed with <a target="_blank" href="https://www.mozilla.org/firefox">Firefox</a> version 4.0
                  or higher, or another <a target="_blank" href="http://www.w3.org/Math/Software/mathml_software_cat_browsers.html">MathML-aware
                     browser</a>. There is also a <a href="../../pdf/CUDA_C_Best_Practices_Guide.pdf">PDF version of this document</a>.
                  
               </div>
               <div id="release-info">Best Practices Guide
                  (<a href="../../pdf/CUDA_C_Best_Practices_Guide.pdf">PDF</a>)
                  -
                   
                  
                  
                  v10.1.168
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated April 24, 2019
                  -
                  <a href="mailto:CUDAIssues@nvidia.com?subject=CUDA Toolkit Documentation Feedback: Best Practices Guide">Send Feedback</a></div>
            </div>
            <article id="contents">
               <div class="topic nested0" id="abstract"><a name="abstract" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#abstract" name="abstract" shape="rect">CUDA C Best Practices Guide</a></h2>
                  <div class="body conbody">
                     <p class="shortdesc">The programming guide to using the CUDA Toolkit to obtain the
                        best performance from NVIDIA GPUs.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="preface"><a name="preface" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#preface" name="preface" shape="rect">Preface</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="what-is-this-document"><a name="what-is-this-document" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#what-is-this-document" name="what-is-this-document" shape="rect">What Is This Document?</a></h3>
                     <div class="body conbody">
                        <p class="p">This Best Practices Guide is a manual to help developers obtain the
                           best performance from NVIDIA<sup>®</sup> CUDA<sup>®</sup> GPUs.
                           It presents established parallelization and optimization
                           techniques and explains coding metaphors and idioms that can greatly
                           simplify programming for CUDA-capable GPU architectures.
                        </p>
                        <p class="p">While the contents can be used as a reference manual, you should be
                           aware that some topics are revisited in different contexts as various
                           programming and configuration topics are explored. As a result, it is
                           recommended that first-time readers proceed through the guide
                           sequentially. This approach will greatly improve your understanding of
                           effective programming practices and enable you to better use the guide
                           for reference later.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="who-should-read-this-guide"><a name="who-should-read-this-guide" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#who-should-read-this-guide" name="who-should-read-this-guide" shape="rect">Who Should Read This Guide?</a></h3>
                     <div class="body conbody">
                        <p class="p">The discussions in this guide all use the C programming language, so
                           you should be comfortable reading C code.
                        </p>
                        <p class="p">This guide refers to and relies on several other documents that you
                           should have at your disposal for reference, all of which are available
                           at no cost from the CUDA website <a class="xref" href="http://developer.nvidia.com/cuda-downloads" target="_blank" shape="rect">http://developer.nvidia.com/cuda-downloads</a>. The following documents are especially important
                           resources:
                        </p>
                        <ul class="ul">
                           <li class="li"><cite class="cite">CUDA Installation Guide</cite></li>
                           <li class="li"><cite class="cite">CUDA C Programming Guide</cite></li>
                           <li class="li"><cite class="cite">CUDA Toolkit Reference Manual</cite></li>
                        </ul>
                        <p class="p">In particular, the optimization section of this guide assumes that
                           you have already successfully downloaded and installed the CUDA Toolkit
                           (if not, please refer to the relevant <cite class="cite">CUDA Installation
                              Guide</cite> for your platform) and that you have a basic familiarity
                           with the CUDA C programming language and environment (if not, please
                           refer to the <cite class="cite">CUDA C Programming Guide</cite>).
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="assess-parallelize-optimize-deploy"><a name="assess-parallelize-optimize-deploy" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#assess-parallelize-optimize-deploy" name="assess-parallelize-optimize-deploy" shape="rect">Assess, Parallelize, Optimize, Deploy</a></h3>
                     <div class="body conbody">
                        <p class="p">This guide introduces the <dfn class="term">Assess, Parallelize, Optimize,
                              Deploy</dfn><dfn class="term">(APOD)</dfn> design cycle for applications with
                           the goal of helping application developers to rapidly identify the
                           portions of their code that would most readily benefit from GPU
                           acceleration, rapidly realize that benefit, and begin leveraging the
                           resulting speedups in production as early as possible.
                        </p>
                        <p class="p">APOD is a cyclical process: initial speedups can be achieved,
                           tested, and deployed with only minimal initial investment of time, at
                           which point the cycle can begin again by identifying further
                           optimization opportunities, seeing additional speedups, and then
                           deploying the even faster versions of the application into
                           production.
                        </p>
                        <div class="fig fignone"><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/apod-cycle.png" alt="Assess, Parallelize, Optimize, Deploy."></img></div><br clear="none"></br></div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="assess"><a name="assess" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#assess" name="assess" shape="rect">Assess</a></h3>
                        <div class="body conbody">
                           <p class="p">For an existing project, the first step is to assess the application
                              to locate the parts of the code that are responsible for the bulk of
                              the execution time. Armed with this knowledge, the developer can
                              evaluate these bottlenecks for parallelization and start to investigate
                              GPU acceleration.
                           </p>
                           <p class="p">By understanding the end-user's requirements and constraints and by
                              applying Amdahl's and Gustafson's laws, the developer can determine the
                              upper bound of performance improvement from acceleration of the
                              identified portions of the application.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="parallelize"><a name="parallelize" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#parallelize" name="parallelize" shape="rect">Parallelize</a></h3>
                        <div class="body conbody">
                           <p class="p">Having identified the hotspots and having done the basic exercises
                              to set goals and expectations, the developer needs to parallelize the
                              code. Depending on the original code, this can be as simple as calling
                              into an existing GPU-optimized library such as <samp class="ph codeph">cuBLAS</samp>,
                              <samp class="ph codeph">cuFFT</samp>, or <samp class="ph codeph">Thrust</samp>, or it could be as
                              simple as adding a few preprocessor directives as hints to a
                              parallelizing compiler.
                           </p>
                           <p class="p">On the other hand, some applications' designs will require some
                              amount of refactoring to expose their inherent parallelism. As even
                              future CPU architectures will require exposing this parallelism in
                              order to improve or simply maintain the performance of sequential
                              applications, the CUDA family of parallel programming languages (CUDA
                              C/C++, CUDA Fortran, etc.) aims to make the expression of this
                              parallelism as simple as possible, while simultaneously enabling
                              operation on CUDA-capable GPUs designed for maximum parallel
                              throughput.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="optimize"><a name="optimize" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#optimize" name="optimize" shape="rect">Optimize</a></h3>
                        <div class="body conbody">
                           <p class="p">After each round of application parallelization is complete, the
                              developer can move to optimizing the implementation to improve
                              performance. Since there are many possible optimizations that can be
                              considered, having a good understanding of the needs of the application
                              can help to make the process as smooth as possible. However, as with
                              APOD as a whole, program optimization is an iterative process (identify
                              an opportunity for optimization, apply and test the optimization,
                              verify the speedup achieved, and repeat), meaning that it is not
                              necessary for a programmer to spend large amounts of time memorizing
                              the bulk of all possible optimization strategies prior to seeing good
                              speedups. Instead, strategies can be applied incrementally as they are
                              learned.
                           </p>
                           <p class="p">Optimizations can be applied at various levels, from overlapping
                              data transfers with computation all the way down to fine-tuning
                              floating-point operation sequences. The available profiling tools are
                              invaluable for guiding this process, as they can help suggest a
                              next-best course of action for the developer's optimization efforts and
                              provide references into the relevant portions of the optimization
                              section of this guide.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="deploy"><a name="deploy" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#deploy" name="deploy" shape="rect">Deploy</a></h3>
                        <div class="body conbody">
                           <p class="p">Having completed the GPU acceleration of one or more components of
                              the application it is possible to compare the outcome with the original
                              expectation. Recall that the initial <dfn class="term">assess</dfn> step allowed
                              the developer to determine an upper bound for the potential speedup
                              attainable by accelerating given hotspots.
                           </p>
                           <p class="p">Before tackling other hotspots to improve the total speedup, the
                              developer should consider taking the partially parallelized
                              implementation and carry it through to production. This is important
                              for a number of reasons; for example, it allows the user to profit
                              from their investment as early as possible (the speedup may be partial
                              but is still valuable), and it minimizes risk for the developer and
                              the user by providing an evolutionary rather than revolutionary set of
                              changes to the application.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="recommendations-and-best-practices-preface"><a name="recommendations-and-best-practices-preface" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#recommendations-and-best-practices-preface" name="recommendations-and-best-practices-preface" shape="rect">Recommendations and Best Practices</a></h3>
                     <div class="body conbody">
                        <p class="p">Throughout this guide, specific recommendations are made regarding
                           the design and implementation of CUDA C code. These recommendations are
                           categorized by priority, which is a blend of the effect of the
                           recommendation and its scope. Actions that present substantial
                           improvements for most CUDA applications have the highest priority,
                           while small optimizations that affect only very specific situations are
                           given a lower priority.
                        </p>
                        <p class="p">Before implementing lower priority recommendations, it is good
                           practice to make sure all higher priority recommendations that are
                           relevant have already been applied. This approach will tend to provide
                           the best results for the time invested and will avoid the trap of
                           premature optimization.
                        </p>
                        <p class="p">The criteria of benefit and scope for establishing priority will
                           vary depending on the nature of the program. In this guide, they
                           represent a typical case. Your code might reflect different priority
                           factors. Regardless of this possibility, it is good practice to verify
                           that no higher-priority recommendations have been overlooked before
                           undertaking lower-priority items.
                        </p>
                        <div class="note note"><span class="notetitle">Note:</span> Code samples throughout the guide omit error checking for
                           conciseness. Production code should, however, systematically check the
                           error code returned by each API call and check for failures in kernel
                           launches by calling <samp class="ph codeph">cudaGetLastError()</samp>.
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="assessing-your-application"><a name="assessing-your-application" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#assessing-your-application" name="assessing-your-application" shape="rect">1.&nbsp;Assessing Your Application</a></h2>
                  <div class="body conbody">
                     <p class="p">From supercomputers to mobile phones, modern processors increasingly
                        rely on parallelism to provide performance. The core computational
                        unit, which includes control, arithmetic, registers and typically some
                        cache, is replicated some number of times and connected to memory via a
                        network. As a result, all modern processors require parallel code in
                        order to achieve good utilization of their computational power.
                     </p>
                     <p class="p">While  processors are evolving to expose more fine-grained
                        parallelism to the programmer, many existing applications have evolved
                        either as serial codes or as coarse-grained parallel codes (for
                        example, where the data is decomposed into regions processed in
                        parallel, with sub-regions shared using MPI). In order to profit from
                        any modern processor architecture, GPUs included, the first steps are
                        to assess the application to identify the hotspots, determine whether
                        they can be parallelized, and understand the relevant workloads both
                        now and in the future.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="heterogeneous-computing"><a name="heterogeneous-computing" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#heterogeneous-computing" name="heterogeneous-computing" shape="rect">2.&nbsp;Heterogeneous Computing</a></h2>
                  <div class="body conbody">
                     <p class="p">CUDA programming involves running code on two different platforms
                        concurrently: a <dfn class="term">host</dfn> system with one or more CPUs and
                        one or more CUDA-enabled NVIDIA GPU <dfn class="term">devices</dfn>.
                     </p>
                     <p class="p">While NVIDIA GPUs are frequently associated with graphics, they are
                        also powerful arithmetic engines capable of running thousands of
                        lightweight threads in parallel. This capability makes them well suited
                        to computations that can leverage parallel execution.
                     </p>
                     <p class="p">However, the device is based on a distinctly different design from
                        the host system, and it's important to understand those differences and
                        how they determine the performance of CUDA applications in order to use
                        CUDA effectively.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="differences-between-host-and-device"><a name="differences-between-host-and-device" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#differences-between-host-and-device" name="differences-between-host-and-device" shape="rect">2.1.&nbsp;Differences between Host and Device</a></h3>
                     <div class="body conbody">
                        <p class="p">The primary differences are in threading model and in separate
                           physical memories:
                        </p>
                        <dl class="dl">
                           <dt class="dt dlterm">Threading resources</dt>
                           <dd class="dd">Execution pipelines on host systems can support a limited
                              number of concurrent threads. Servers that have four hex-core
                              processors today can run only 24 threads concurrently (or 48 if
                              the CPUs support Hyper-Threading.) By comparison, the
                              <em class="ph i">smallest</em> executable unit of parallelism on a CUDA device
                              comprises 32 threads (termed a <dfn class="term">warp</dfn> of threads).
                              Modern NVIDIA GPUs can support up to 1536 active threads
                              concurrently per multiprocessor (see <cite class="cite">Features and
                                 Specifications</cite> of the <cite class="cite">CUDA C Programming
                                 Guide</cite>) On GPUs with 16 multiprocessors, this leads to
                              more than 24,000 concurrently active threads.
                           </dd>
                           <dt class="dt dlterm">Threads</dt>
                           <dd class="dd">Threads on a CPU are generally heavyweight entities. The
                              operating system must swap threads on and off CPU execution
                              channels to provide multithreading capability. Context switches
                              (when two threads are swapped) are therefore slow and
                              expensive. By comparison, threads on GPUs are extremely
                              lightweight. In a typical system, thousands of threads are
                              queued up for work (in warps of 32 threads each). If the GPU
                              must wait on one warp of threads, it simply begins executing
                              work on another. Because separate registers are allocated to
                              all active threads, no swapping of registers or other state
                              need occur when switching among GPU threads. Resources stay
                              allocated to each thread until it completes its execution. In
                              short, CPU cores are designed to <dfn class="term">minimize latency</dfn>
                              for one or two threads at a time each, whereas GPUs are
                              designed to handle a large number of concurrent, lightweight
                              threads in order to <dfn class="term">maximize throughput</dfn>.
                           </dd>
                           <dt class="dt dlterm">RAM</dt>
                           <dd class="dd">The host system and the device each have their own distinct
                              attached physical memories. As the host and device memories are
                              separated by the PCI Express (PCIe) bus, items in the host
                              memory must occasionally be communicated across the bus to the
                              device memory or vice versa as described in <a class="xref" href="index.html#what-runs-on-cuda-enabled-device" shape="rect">What Runs on a CUDA-Enabled Device?</a></dd>
                        </dl>
                        <p class="p">These are the primary hardware differences between CPU hosts and GPU
                           devices with respect to parallel programming. Other differences are
                           discussed as they arise elsewhere in this document. Applications
                           composed with these differences in mind can treat the host and device
                           together as a cohesive heterogeneous system wherein each processing
                           unit is leveraged to do the kind of work it does best: sequential work
                           on the host and parallel work on the device.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="what-runs-on-cuda-enabled-device"><a name="what-runs-on-cuda-enabled-device" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#what-runs-on-cuda-enabled-device" name="what-runs-on-cuda-enabled-device" shape="rect">2.2.&nbsp;What Runs on a CUDA-Enabled Device?</a></h3>
                     <div class="body conbody">
                        <p class="p">The following issues should be considered when determining what
                           parts of an application to run on the device:
                        </p>
                        <ul class="ul">
                           <li class="li">The device is ideally suited for computations that can be run
                              on numerous data elements simultaneously in parallel. This
                              typically involves arithmetic on large data sets (such as
                              matrices) where the same operation can be performed across
                              thousands, if not millions, of elements at the same time. This
                              is a requirement for good performance on CUDA: the software
                              must use a large number (generally thousands or tens of
                              thousands) of concurrent threads. The support for running
                              numerous threads in parallel derives from CUDA's use of a
                              lightweight threading model described above.
                           </li>
                           <li class="li">For best performance, there should be some coherence in memory
                              access by adjacent threads running on the device. Certain
                              memory access patterns enable the hardware to coalesce groups
                              of reads or writes of multiple data items into one operation.
                              Data that cannot be laid out so as to enable
                              <dfn class="term">coalescing</dfn>, or that doesn't have enough locality
                              to use the L1 or texture caches effectively, will tend to see
                              lesser speedups when used in computations on CUDA.
                           </li>
                           <li class="li">To use CUDA, data values must be transferred from the host to
                              the device along the PCI Express (PCIe) bus. These transfers
                              are costly in terms of performance and should be minimized.
                              (See <a class="xref" href="index.html#data-transfer-between-host-and-device" shape="rect">Data Transfer Between Host and Device</a>.)
                              This cost has several ramifications:
                              
                              <ul class="ul">
                                 <li class="li">
                                    <p class="p">The complexity of operations should justify the cost
                                       of moving data to and from the device. Code that
                                       transfers data for brief use by a small number of
                                       threads will see little or no performance benefit. The
                                       ideal scenario is one in which many threads perform a
                                       substantial amount of work.
                                    </p>
                                    <p class="p">For example, transferring two matrices to the device
                                       to perform a matrix addition and then transferring the
                                       results back to the host will not realize much
                                       performance benefit. The issue here is the number of
                                       operations performed per data element transferred.  For
                                       the preceding procedure, assuming matrices of size NxN,
                                       there are N<sup class="ph sup">2</sup> operations (additions) and
                                       3N<sup class="ph sup">2</sup> elements transferred, so the ratio of
                                       operations to elements transferred is 1:3 or O(1).
                                       Performance benefits can be more readily achieved when
                                       this ratio is higher. For example, a matrix
                                       multiplication of the same matrices requires
                                       N<sup class="ph sup">3</sup> operations (multiply-add), so the ratio
                                       of operations to elements transferred is O(N), in which
                                       case the larger the matrix the greater the performance
                                       benefit. The types of operations are an additional
                                       factor, as additions have different complexity profiles
                                       than, for example, trigonometric functions. It is
                                       important to include the overhead of transferring data
                                       to and from the device in determining whether
                                       operations should be performed on the host or on the
                                       device.
                                    </p>
                                 </li>
                                 <li class="li">Data should be kept on the device as long as possible.
                                    Because transfers should be minimized, programs that
                                    run multiple kernels on the same data should favor
                                    leaving the data on the device between kernel calls,
                                    rather than transferring intermediate results to the
                                    host and then sending them back to the device for
                                    subsequent calculations. So, in the previous example,
                                    had the two matrices to be added already been on the
                                    device as a result of some previous calculation, or if
                                    the results of the addition would be used in some
                                    subsequent calculation, the matrix addition should be
                                    performed locally on the device. This approach should
                                    be used even if one of the steps in a sequence of
                                    calculations could be performed faster on the host.
                                    Even a relatively slow kernel may be advantageous if it
                                    avoids one or more PCIe transfers. <a class="xref" href="index.html#data-transfer-between-host-and-device" shape="rect">Data Transfer Between Host and Device</a>
                                    provides further details, including the measurements of
                                    bandwidth between the host and the device versus within
                                    the device proper.
                                    
                                 </li>
                              </ul>
                           </li>
                        </ul>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="application-profiling"><a name="application-profiling" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#application-profiling" name="application-profiling" shape="rect">3.&nbsp;Application Profiling</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="profile"><a name="profile" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#profile" name="profile" shape="rect">3.1.&nbsp;Profile</a></h3>
                     <div class="body conbody">
                        <p class="p">Many codes accomplish a significant portion of the work with a
                           relatively small amount of code. Using a profiler, the developer can
                           identify such hotspots and start to compile a list of candidates for
                           parallelization.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="creating-profile"><a name="creating-profile" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#creating-profile" name="creating-profile" shape="rect">3.1.1.&nbsp;Creating the Profile</a></h3>
                        <div class="body conbody">
                           <p class="p">There are many possible approaches to profiling the code, but in all
                              cases the objective is the same: to identify the function or functions
                              in which the application is spending most of its execution time.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> To maximize developer productivity, profile
                              the application to determine hotspots and bottlenecks.
                           </div>
                           <p class="p">The most important consideration with any profiling activity is to
                              ensure that the workload is realistic - i.e., that information gained
                              from the test and decisions based upon that information are relevant to
                              real data. Using unrealistic workloads can lead to sub-optimal results
                              and wasted effort both by causing developers to optimize for
                              unrealistic problem sizes and by causing developers to concentrate on
                              the wrong functions.
                           </p>
                           <p class="p">There are a number of tools that can be used to generate the
                              profile. The following example is based on <samp class="ph codeph">gprof</samp>,
                              which is an open-source profiler for Linux platforms from the GNU
                              Binutils collection.
                           </p><pre class="pre screen" xml:space="preserve">$ gcc -O2 -g -pg myprog.c
$ gprof ./a.out &gt; profile.txt
Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 33.34      0.02     0.02     7208     0.00     0.00  genTimeStep
 16.67      0.03     0.01      240     0.04     0.12  calcStats
 16.67      0.04     0.01        8     1.25     1.25  calcSummaryData
 16.67      0.05     0.01        7     1.43     1.43  write
 16.67      0.06     0.01                             mcount
  0.00      0.06     0.00      236     0.00     0.00  tzset
  0.00      0.06     0.00      192     0.00     0.00  tolower
  0.00      0.06     0.00       47     0.00     0.00  strlen
  0.00      0.06     0.00       45     0.00     0.00  strchr
  0.00      0.06     0.00        1     0.00    50.00  main
  0.00      0.06     0.00        1     0.00     0.00  memcpy
  0.00      0.06     0.00        1     0.00    10.11  print
  0.00      0.06     0.00        1     0.00     0.00  profil
  0.00      0.06     0.00        1     0.00    50.00  report</pre></div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="identifying-hotspots"><a name="identifying-hotspots" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#identifying-hotspots" name="identifying-hotspots" shape="rect">3.1.2.&nbsp;Identifying Hotspots</a></h3>
                        <div class="body conbody">
                           <p class="p">In the example above, we can clearly see that the function
                              <samp class="ph codeph">genTimeStep()</samp> takes one-third of the total running
                              time of the application. This should be our first candidate function
                              for parallelization. <a class="xref" href="index.html#understanding-scaling" shape="rect">Understanding Scaling</a> discusses
                              the potential benefit we might expect from such parallelization.
                           </p>
                           <p class="p">It is worth noting that several of the other functions in the above
                              example also take up a significant portion of the overall running time,
                              such as <samp class="ph codeph">calcStats()</samp> and
                              <samp class="ph codeph">calcSummaryData()</samp>. Parallelizing these functions as
                              well should increase our speedup potential. However, since APOD is a
                              cyclical process, we might opt to parallelize these functions in a
                              subsequent APOD pass, thereby limiting the scope of our work in any
                              given pass to a smaller set of incremental changes.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="understanding-scaling"><a name="understanding-scaling" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#understanding-scaling" name="understanding-scaling" shape="rect">3.1.3.&nbsp;Understanding Scaling</a></h3>
                        <div class="body conbody">
                           <p class="p">The amount of performance benefit an application will realize by
                              running on CUDA depends entirely on the extent to which it can be
                              parallelized. Code that cannot be sufficiently parallelized should run
                              on the host, unless doing so would result in excessive transfers
                              between the host and the device.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> To get the maximum benefit from CUDA, focus
                              first on finding ways to parallelize sequential code.
                           </div>
                           <p class="p">By understanding how applications can scale it is possible to set
                              expectations and plan an incremental parallelization strategy. <a class="xref" href="index.html#strong-scaling-and-amdahls-law" shape="rect">Strong Scaling and Amdahl's Law</a> describes strong scaling,
                              which allows us to set an upper bound for the speedup with a fixed
                              problem size. <a class="xref" href="index.html#weak-scaling-and-gustafsons-law" shape="rect">Weak Scaling and Gustafson's Law</a>
                              describes weak scaling, where the speedup is attained by growing the
                              problem size. In many applications, a combination of strong and weak
                              scaling is desirable.
                           </p>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="strong-scaling-and-amdahls-law"><a name="strong-scaling-and-amdahls-law" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#strong-scaling-and-amdahls-law" name="strong-scaling-and-amdahls-law" shape="rect">3.1.3.1.&nbsp;Strong Scaling and Amdahl's Law</a></h3>
                           <div class="body conbody">
                              <p class="p">Strong scaling is a measure of how, for a fixed overall problem
                                 size, the time to solution decreases as more processors are added to a
                                 system.  An application that exhibits linear strong scaling has a
                                 speedup equal to the number of processors used.
                              </p>
                              <p class="p">Strong scaling is usually equated with Amdahl's Law, which specifies
                                 the maximum speedup that can be expected by parallelizing portions of a
                                 serial program. Essentially, it states that the maximum speedup
                                 <em class="ph i">S</em> of a program is:
                              </p>
                              <p class="p d4p_eqn_block">
                                 <math xmlns="http://www.w3.org/1998/Math/MathML">
                                    <mrow>
                                       <mi>S</mi>
                                       <mo>=</mo>
                                       <mfrac>
                                          <mrow>
                                             <mn>1</mn>
                                          </mrow>
                                          <mrow>
                                             <mo>(</mo>
                                             <mn>1</mn>
                                             <mo>−</mo>
                                             <mi>P</mi>
                                             <mo>)</mo>
                                             <mo>+</mo>
                                             <mfrac>
                                                <mrow>
                                                   <mi>P</mi>
                                                </mrow>
                                                <mrow>
                                                   <mi>N</mi>
                                                </mrow>
                                             </mfrac>
                                          </mrow>
                                       </mfrac>
                                    </mrow>
                                 </math>
                              </p>
                              <p class="p">Here <em class="ph i">P</em> is the fraction of the total serial execution time
                                 taken by the portion of code that can be parallelized and <em class="ph i">N</em> is
                                 the number of processors over which the parallel portion of the code
                                 runs.
                              </p>
                              <p class="p">The larger <em class="ph i">N</em> is(that is, the greater the number of
                                 processors), the smaller the <em class="ph i">P/N</em> fraction. It can be simpler to
                                 view <em class="ph i">N</em> as a very large number, which essentially transforms the
                                 equation into
                                 
                                 <math xmlns="http://www.w3.org/1998/Math/MathML">
                                    <mrow>
                                       <mi>S</mi>
                                       <mo>=</mo>
                                       <mn>1</mn>
                                       <mo>/</mo>
                                       <mrow>
                                          <mo>(</mo>
                                          <mn>1</mn>
                                          <mo>−</mo>
                                          <mi>P</mi>
                                          <mo>)</mo>
                                       </mrow>
                                    </mrow>
                                 </math>. Now, if 3/4 of the running time of a sequential
                                 program is parallelized, the maximum speedup over serial code is 1 / (1
                                 - 3/4) = 4.
                              </p>
                              <p class="p">In reality, most applications do not exhibit perfectly linear strong
                                 scaling, even if they do exhibit some degree of strong scaling. For
                                 most purposes, the key point is that the larger the parallelizable
                                 portion <em class="ph i">P</em> is, the greater the potential speedup.  Conversely, if
                                 <em class="ph i">P</em> is a small number (meaning that the application is not
                                 substantially parallelizable), increasing the number of processors
                                 <em class="ph i">N</em> does little to improve performance. Therefore, to get the
                                 largest speedup for a fixed problem size, it is worthwhile to spend
                                 effort on increasing <em class="ph i">P</em>, maximizing the amount of code that can
                                 be parallelized.
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="weak-scaling-and-gustafsons-law"><a name="weak-scaling-and-gustafsons-law" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#weak-scaling-and-gustafsons-law" name="weak-scaling-and-gustafsons-law" shape="rect">3.1.3.2.&nbsp;Weak Scaling and Gustafson's Law</a></h3>
                           <div class="body conbody">
                              <p class="p">Weak scaling is a measure of how the time to solution changes as
                                 more processors are added to a system with a fixed problem size
                                 <dfn class="term">per processor</dfn>; i.e., where the overall problem size
                                 increases as the number of processors is increased.
                              </p>
                              <p class="p">Weak scaling is often equated with Gustafson's Law, which states
                                 that in practice, the problem size scales with the number of
                                 processors. Because of this, the maximum speedup <em class="ph i">S</em> of a program
                                 is:
                              </p>
                              <p class="p d4p_eqn_block">
                                 <math xmlns="http://www.w3.org/1998/Math/MathML">
                                    <mrow>
                                       <mi>S</mi>
                                       <mo>=</mo>
                                       <mi>N</mi>
                                       <mo>+</mo>
                                       <mrow>
                                          <mo>(</mo>
                                          <mn>1</mn>
                                          <mo>−</mo>
                                          <mi>P</mi>
                                          <mo>)</mo>
                                       </mrow>
                                       <mrow>
                                          <mo>(</mo>
                                          <mn>1</mn>
                                          <mo>−</mo>
                                          <mi>N</mi>
                                          <mo>)</mo>
                                       </mrow>
                                    </mrow>
                                 </math>
                              </p>
                              <p class="p">Here <em class="ph i">P</em> is the fraction of the total serial execution time
                                 taken by the portion of code that can be parallelized and <em class="ph i">N</em> is
                                 the number of processors over which the parallel portion of the code
                                 runs.
                              </p>
                              <p class="p">Another way of looking at Gustafson's Law is that it is not the
                                 problem size that remains constant as we scale up the system but rather
                                 the execution time. Note that Gustafson's Law assumes that the ratio of
                                 serial to parallel execution remains constant, reflecting additional
                                 cost in setting up and handling the larger problem.
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="applying-strong-and-weak-scaling"><a name="applying-strong-and-weak-scaling" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#applying-strong-and-weak-scaling" name="applying-strong-and-weak-scaling" shape="rect">3.1.3.3.&nbsp;Applying Strong and Weak Scaling</a></h3>
                           <div class="body conbody">
                              <p class="p">Understanding which type of scaling is most applicable to an
                                 application is an important part of estimating speedup. For some
                                 applications the problem size will remain constant and hence only
                                 strong scaling is applicable. An example would be modeling how two
                                 molecules interact with each other, where the molecule sizes are
                                 fixed.
                              </p>
                              <p class="p">For other applications, the problem size will grow to fill the
                                 available processors. Examples include modeling fluids or structures as
                                 meshes or grids and some Monte Carlo simulations, where increasing the
                                 problem size provides increased accuracy.
                              </p>
                              <p class="p">Having understood the application profile, the developer should
                                 understand how the problem size would change if the computational
                                 performance changes and then apply either Amdahl's or Gustafson's Law
                                 to determine an upper bound for the speedup.
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="parallelizing-your-application"><a name="parallelizing-your-application" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#parallelizing-your-application" name="parallelizing-your-application" shape="rect">4.&nbsp;Parallelizing Your Application</a></h2>
                  <div class="body conbody">
                     <p class="p">Having identified the hotspots and having done the basic exercises
                        to set goals and expectations, the developer needs to parallelize the
                        code. Depending on the original code, this can be as simple as calling
                        into an existing GPU-optimized library such as <samp class="ph codeph">cuBLAS</samp>,
                        <samp class="ph codeph">cuFFT</samp>, or <samp class="ph codeph">Thrust</samp>, or it could be as
                        simple as adding a few preprocessor directives as hints to a
                        parallelizing compiler.
                     </p>
                     <p class="p">On the other hand, some applications' designs will require some
                        amount of refactoring to expose their inherent parallelism. As even
                        future CPU architectures will require exposing this parallelism in
                        order to improve or simply maintain the performance of sequential
                        applications, the CUDA family of parallel programming languages (CUDA
                        C/C++, CUDA Fortran, etc.) aims to make the expression of this
                        parallelism as simple as possible, while simultaneously enabling
                        operation on CUDA-capable GPUs designed for maximum parallel
                        throughput.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="getting-started"><a name="getting-started" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#getting-started" name="getting-started" shape="rect">5.&nbsp;Getting Started</a></h2>
                  <div class="body conbody">
                     <p class="p">There are several key strategies for parallelizing sequential code.
                        While the details of how to apply these strategies to a particular
                        application is a complex and problem-specific topic, the general themes
                        listed here apply regardless of whether we are parallelizing code to
                        run on for multicore CPUs or for use on CUDA GPUs.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="parallel-libraries"><a name="parallel-libraries" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#parallel-libraries" name="parallel-libraries" shape="rect">5.1.&nbsp;Parallel Libraries</a></h3>
                     <div class="body conbody">
                        <p class="p">The most straightforward approach to parallelizing an application is
                           to leverage existing libraries that take advantage of parallel
                           architectures on our behalf. The CUDA Toolkit includes a number of such
                           libraries that have been fine-tuned for NVIDIA CUDA GPUs, such as
                           <samp class="ph codeph">cuBLAS</samp>, <samp class="ph codeph">cuFFT</samp>, and so on.
                        </p>
                        <p class="p">The key here is that libraries are most useful when they match well
                           with the needs of the application. Applications already using other
                           BLAS libraries can often quite easily switch to
                           <samp class="ph codeph">cuBLAS</samp>, for example, whereas applications that do
                           little to no linear algebra will have little use for
                           <samp class="ph codeph">cuBLAS</samp>. The same goes for other CUDA Toolkit
                           libraries: <samp class="ph codeph">cuFFT</samp> has an interface similar to that of
                           <samp class="ph codeph">FFTW</samp>, etc.
                        </p>
                        <p class="p">Also of note is the Thrust library, which is a parallel C++ template
                           library similar to the C++ Standard Template Library. Thrust provides a
                           rich collection of data parallel primitives such as scan, sort, and
                           reduce, which can be composed together to implement complex algorithms
                           with concise, readable source code. By describing your computation in
                           terms of these high-level abstractions you provide Thrust with the
                           freedom to select the most efficient implementation automatically. As a
                           result, Thrust can be utilized in rapid prototyping of CUDA
                           applications, where programmer productivity matters most, as well as in
                           production, where robustness and absolute performance are crucial.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="parallelizing-compilers"><a name="parallelizing-compilers" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#parallelizing-compilers" name="parallelizing-compilers" shape="rect">5.2.&nbsp;Parallelizing Compilers</a></h3>
                     <div class="body conbody">
                        <p class="p">Another common approach to parallelization of sequential codes is to
                           make use of parallelizing compilers. Often this means the use of
                           directives-based approaches, where the programmer uses a pragma or
                           other similar notation to provide hints to the compiler about where
                           parallelism can be found without needing to modify or adapt the
                           underlying code itself. By exposing parallelism to the compiler,
                           directives allow the compiler to do the detailed work of mapping the
                           computation onto the parallel architecture.
                        </p>
                        <p class="p">The OpenACC standard provides a set of compiler directives to
                           specify loops and regions of code in standard C, C++ and Fortran that
                           should be offloaded from a host CPU to an attached accelerator such as
                           a CUDA GPU. The details of managing the accelerator device are handled
                           implicitly by an OpenACC-enabled compiler and runtime.
                        </p>
                        <p class="p">See <a class="xref" href="http://www.openacc.org/" target="_blank" shape="rect">http://www.openacc.org/</a> for details.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="coding-to-expose-parallelism"><a name="coding-to-expose-parallelism" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#coding-to-expose-parallelism" name="coding-to-expose-parallelism" shape="rect">5.3.&nbsp;Coding to Expose Parallelism</a></h3>
                     <div class="body conbody">
                        <p class="p">For applications that need additional functionality or performance
                           beyond what existing parallel libraries or parallelizing compilers can
                           provide, parallel programming languages such as CUDA C/C++ that
                           integrate seamlessly with existing sequential code are essential.
                        </p>
                        <p class="p">Once we have located a hotspot in our application's profile
                           assessment and determined that custom code is the best approach, we can
                           use CUDA C/C++ to expose the parallelism in that portion of our code as
                           a CUDA kernel. We can then launch this kernel onto the GPU and retrieve
                           the results without requiring major rewrites to the rest of our
                           application.
                        </p>
                        <p class="p">This approach is most straightforward when the majority of the total
                           running time of our application is spent in a few relatively isolated
                           portions of the code. More difficult to parallelize are applications
                           with a very flat profile - i.e., applications where the time spent is
                           spread out relatively evenly across a wide portion of the code base.
                           For the latter variety of application, some degree of code refactoring
                           to expose the inherent parallelism in the application might be
                           necessary, but keep in mind that this refactoring work will tend to
                           benefit all future architectures, CPU and GPU alike, so it is well
                           worth the effort should it become necessary.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="getting-right-answer"><a name="getting-right-answer" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#getting-right-answer" name="getting-right-answer" shape="rect">6.&nbsp;Getting the Right Answer</a></h2>
                  <div class="body conbody">
                     <p class="p">Obtaining the right answer is clearly the principal goal of all
                        computation. On parallel systems, it is possible to run into
                        difficulties not typically found in traditional serial-oriented
                        programming. These include threading issues, unexpected values due to
                        the way floating-point values are computed, and challenges arising from
                        differences in the way CPU and GPU processors operate. This chapter
                        examines issues that can affect the correctness of returned data and
                        points to appropriate solutions.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="verification"><a name="verification" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#verification" name="verification" shape="rect">6.1.&nbsp;Verification</a></h3>
                     <div class="topic concept nested2" xml:lang="en-US" id="reference-comparison"><a name="reference-comparison" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#reference-comparison" name="reference-comparison" shape="rect">6.1.1.&nbsp;Reference Comparison</a></h3>
                        <div class="body conbody">
                           <p class="p">A key aspect of correctness verification for modifications to any
                              existing program is to establish some mechanism whereby previous
                              known-good reference outputs from representative inputs can be compared
                              to new results. After each change is made, ensure that the results
                              match using whatever criteria apply to the particular algorithm. Some
                              will expect bitwise identical results, which is not always possible,
                              especially where floating-point arithmetic is concerned; see <a class="xref" href="index.html#numerical-accuracy-and-precision" shape="rect">Numerical Accuracy and Precision</a> regarding numerical
                              accuracy.  For other algorithms, implementations may be considered
                              correct if they match the reference within some small epsilon.
                           </p>
                           <p class="p">Note that the process used for validating numerical results can
                              easily be extended to validate performance results as well. We want to
                              ensure that each change we make is correct <em class="ph i">and</em> that it improves
                              performance (and by how much). Checking these things frequently as an
                              integral part of our cyclical APOD process will help ensure that we
                              achieve the desired results as rapidly as possible.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="unit-testing"><a name="unit-testing" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#unit-testing" name="unit-testing" shape="rect">6.1.2.&nbsp;Unit Testing</a></h3>
                        <div class="body conbody">
                           <p class="p">A useful counterpart to the reference comparisons described above is
                              to structure the code itself in such a way that is readily verifiable
                              at the unit level. For example, we can write our CUDA kernels as a
                              collection of many short <samp class="ph codeph">__device__</samp> functions rather
                              than one large monolithic <samp class="ph codeph">__global__</samp> function; each
                              device function can be tested independently before hooking them all
                              together.
                           </p>
                           <p class="p">For example, many kernels have complex addressing logic for
                              accessing memory in addition to their actual computation. If we
                              validate our addressing logic separately prior to introducing the bulk
                              of the computation, then this will simplify any later debugging
                              efforts. (Note that the CUDA compiler considers any device code that
                              does not contribute to a write to global memory as dead code subject to
                              elimination, so we must at least write <em class="ph i">something</em> out to global
                              memory as a result of our addressing logic in order to successfully
                              apply this strategy.)
                           </p>
                           <p class="p">Going a step further, if most functions are defined as
                              <samp class="ph codeph">__host__</samp><samp class="ph codeph">__device__</samp> rather than just
                              <samp class="ph codeph">__device__</samp> functions, then these functions can be
                              tested on both the CPU and the GPU, thereby increasing our confidence
                              that the function is correct and that there will not be any unexpected
                              differences in the results. If there <em class="ph i">are</em> differences, then those
                              differences will be seen early and can be understood in the context of
                              a simple function.
                           </p>
                           <p class="p">As a useful side effect, this strategy will allow us a means to
                              reduce code duplication should we wish to include both CPU and GPU
                              execution paths in our application: if the bulk of the work of our CUDA
                              kernels is done in <samp class="ph codeph">__host__</samp><samp class="ph codeph">__device__</samp> functions, we can easily call those
                              functions from both the host code <em class="ph i">and</em> the device code without
                              duplication.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="debugging"><a name="debugging" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#debugging" name="debugging" shape="rect">6.2.&nbsp;Debugging</a></h3>
                     <div class="body conbody">
                        <p class="p">CUDA-GDB is a port of the GNU Debugger that runs on Linux and Mac;
                           see: <a class="xref" href="http://developer.nvidia.com/cuda-gdb" target="_blank" shape="rect">http://developer.nvidia.com/cuda-gdb</a>.
                        </p>
                        <p class="p">The NVIDIA Parallel Nsight debugging and profiling tool for
                           Microsoft Windows Vista and Windows 7 is available as a free plugin for
                           Microsoft Visual Studio; see: <a class="xref" href="http://developer.nvidia.com/nvidia-parallel-nsight" target="_blank" shape="rect">http://developer.nvidia.com/nvidia-parallel-nsight</a>.
                        </p>
                        <p class="p">Several third-party debuggers now support CUDA debugging as well;
                           see: <a class="xref" href="http://developer.nvidia.com/debugging-solutions" target="_blank" shape="rect">http://developer.nvidia.com/debugging-solutions</a> for more details.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="numerical-accuracy-and-precision"><a name="numerical-accuracy-and-precision" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#numerical-accuracy-and-precision" name="numerical-accuracy-and-precision" shape="rect">6.3.&nbsp;Numerical Accuracy and Precision</a></h3>
                     <div class="body conbody">
                        <p class="p">Incorrect or unexpected results arise principally from issues of
                           floating-point accuracy due to the way floating-point values are
                           computed and stored. The following sections explain the principal items
                           of interest. Other peculiarities of floating-point arithmetic are
                           presented in <cite class="cite">Features and Technical Specifications</cite> of the
                           <cite class="cite">CUDA C Programming Guide</cite> as well as in a whitepaper and
                           accompanying webinar on floating-point precision and performance
                           available from <a class="xref" href="http://developer.nvidia.com/content/precision-performance-floating-point-and-ieee-754-compliance-nvidia-gpus" target="_blank" shape="rect">http://developer.nvidia.com/content/precision-performance-floating-point-and-ieee-754-compliance-nvidia-gpus</a>.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="single-vs-double-precision"><a name="single-vs-double-precision" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#single-vs-double-precision" name="single-vs-double-precision" shape="rect">6.3.1.&nbsp;Single vs. Double Precision</a></h3>
                        <div class="body conbody">
                           <p class="p">Devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                 capability</a> 1.3 and higher provide native support
                              for double-precision floating-point values (that is, values 64 bits
                              wide). Results obtained using double-precision arithmetic will
                              frequently differ from the same operation performed via
                              single-precision arithmetic due to the greater precision of the former
                              and due to rounding issues. Therefore, it is important to be sure to
                              compare values of like precision and to express the results within a
                              certain tolerance rather than expecting them to be exact.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="floating-point-math-is-not-associative"><a name="floating-point-math-is-not-associative" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#floating-point-math-is-not-associative" name="floating-point-math-is-not-associative" shape="rect">6.3.2.&nbsp;Floating Point Math Is not Associative</a></h3>
                        <div class="body conbody">
                           <p class="p">Each floating-point arithmetic operation involves a certain amount
                              of rounding. Consequently, the order in which arithmetic operations are
                              performed is important. If A, B, and C are floating-point values,
                              (A+B)+C is not guaranteed to equal A+(B+C) as it is in symbolic math.
                              When you parallelize computations, you potentially change the order of
                              operations and therefore the parallel results might not match
                              sequential results. This limitation is not specific to CUDA, but an
                              inherent part of parallel computation on floating-point values.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="promotions-to-doubles-and-truncations-to-floats"><a name="promotions-to-doubles-and-truncations-to-floats" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#promotions-to-doubles-and-truncations-to-floats" name="promotions-to-doubles-and-truncations-to-floats" shape="rect">6.3.3.&nbsp;Promotions to Doubles and Truncations to Floats</a></h3>
                        <div class="body conbody">
                           <p class="p">When comparing the results of computations of float variables between
                              the host and device, make sure that promotions to double precision on
                              the host do not account for different numerical results. For example,
                              if the code segment
                           </p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> a;
...
a = a*1.02;</pre><p class="p">were performed on a device of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                 capability</a> 1.2 or less,
                              or on a device with compute capability 1.3 but compiled without enabling
                              double precision (as mentioned above), then the multiplication would be
                              performed in single precision. However, if the code were performed on
                              the host, the literal <samp class="ph codeph">1.02</samp> would be interpreted as a
                              double-precision quantity and <samp class="ph codeph">a</samp> would be promoted to a
                              double, the multiplication would be performed in double precision, and
                              the result would be truncated to a float - thereby yielding a slightly
                              different result. If, however, the literal <samp class="ph codeph">1.02</samp> were
                              replaced with <samp class="ph codeph">1.02f</samp>, the result would be the same in
                              all cases because no promotion to doubles would occur. To ensure that
                              computations use single-precision arithmetic, always use float
                              literals.
                           </p>
                           <p class="p">In addition to accuracy, the conversion between doubles and floats
                              (and vice versa) has a detrimental effect on performance, as discussed
                              in <a class="xref" href="index.html#instruction-optimization" shape="rect">Instruction Optimization</a>.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="ieee-754-compliance"><a name="ieee-754-compliance" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#ieee-754-compliance" name="ieee-754-compliance" shape="rect">6.3.4.&nbsp;IEEE 754 Compliance</a></h3>
                        <div class="body conbody">
                           <p class="p">All CUDA compute devices follow the IEEE 754 standard for binary
                              floating-point representation, with some small exceptions. These
                              exceptions, which are detailed in <cite class="cite">Features and Technical
                                 Specifications</cite> of the <cite class="cite">CUDA C Programming Guide</cite>, can
                              lead to results that differ from IEEE
                              754 values computed on the
                              host system.
                           </p>
                           <p class="p">One of the key differences is the fused multiply-add (FMA)
                              instruction, which combines multiply-add operations into a single
                              instruction execution. Its result will often differ slightly from
                              results obtained by doing the two operations separately.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="x86-80-bit-computations"><a name="x86-80-bit-computations" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#x86-80-bit-computations" name="x86-80-bit-computations" shape="rect">6.3.5.&nbsp;x86 80-bit Computations</a></h3>
                        <div class="body conbody">
                           <p class="p">x86 processors can use an 80-bit <dfn class="term">double extended
                                 precision</dfn> math when performing floating-point calculations. The
                              results of these calculations can frequently differ from pure 64-bit
                              operations performed on the CUDA device. To get a closer match between
                              values, set the x86 host processor to use regular double or single
                              precision (64 bits and 32 bits, respectively). This is done with the
                              <samp class="ph codeph">FLDCW</samp> x86 assembly instruction or the equivalent
                              operating system API.
                           </p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="optimizing-cuda-applications"><a name="optimizing-cuda-applications" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#optimizing-cuda-applications" name="optimizing-cuda-applications" shape="rect">7.&nbsp;Optimizing CUDA Applications</a></h2>
                  <div class="body conbody">
                     <p class="p">After each round of application parallelization is complete, the
                        developer can move to optimizing the implementation to improve
                        performance. Since there are many possible optimizations that can be
                        considered, having a good understanding of the needs of the application
                        can help to make the process as smooth as possible. However, as with
                        APOD as a whole, program optimization is an iterative process (identify
                        an opportunity for optimization, apply and test the optimization,
                        verify the speedup achieved, and repeat), meaning that it is not
                        necessary for a programmer to spend large amounts of time memorizing
                        the bulk of all possible optimization strategies prior to seeing good
                        speedups. Instead, strategies can be applied incrementally as they are
                        learned.
                     </p>
                     <p class="p">Optimizations can be applied at various levels, from overlapping
                        data transfers with computation all the way down to fine-tuning
                        floating-point operation sequences. The available profiling tools are
                        invaluable for guiding this process, as they can help suggest a
                        next-best course of action for the developer's optimization efforts and
                        provide references into the relevant portions of the optimization
                        section of this guide.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="performance-metrics"><a name="performance-metrics" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#performance-metrics" name="performance-metrics" shape="rect">8.&nbsp;Performance Metrics</a></h2>
                  <div class="body conbody">
                     <p class="p">When attempting to optimize CUDA code, it pays to know how to
                        measure performance accurately and to understand the role that
                        bandwidth plays in performance measurement. This chapter discusses how
                        to correctly measure performance using CPU timers and CUDA events. It
                        then explores how bandwidth affects performance metrics and how to
                        mitigate some of the challenges it poses.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="timing"><a name="timing" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#timing" name="timing" shape="rect">8.1.&nbsp;Timing</a></h3>
                     <div class="body conbody">
                        <p class="p">CUDA calls and kernel executions can be timed using either CPU or
                           GPU timers. This section examines the functionality, advantages, and
                           pitfalls of both approaches.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="using-cpu-timers"><a name="using-cpu-timers" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#using-cpu-timers" name="using-cpu-timers" shape="rect">8.1.1.&nbsp;Using CPU Timers</a></h3>
                        <div class="body conbody">
                           <p class="p">Any CPU timer can be used to measure the elapsed time of a CUDA call
                              or kernel execution. The details of various CPU timing approaches are
                              outside the scope of this document, but developers should always be
                              aware of the resolution their timing calls provide.
                           </p>
                           <p class="p">When using CPU timers, it is critical to remember that many CUDA API
                              functions are asynchronous; that is, they return control back to the
                              calling CPU thread prior to completing their work.  All kernel launches
                              are asynchronous, as are memory-copy functions with the
                              <samp class="ph codeph">Async</samp> suffix on their names. Therefore, to accurately
                              measure the elapsed time for a particular call or sequence of CUDA
                              calls, it is necessary to synchronize the CPU thread with the GPU by
                              calling <samp class="ph codeph">cudaDeviceSynchronize()</samp> immediately before
                              starting and stopping the CPU timer.
                              <samp class="ph codeph">cudaDeviceSynchronize()</samp>blocks the calling CPU thread
                              until all CUDA calls previously issued by the thread are completed.
                           </p>
                           <p class="p">Although it is also possible to synchronize the CPU thread with a
                              particular stream or event on the GPU, these synchronization functions
                              are not suitable for timing code in streams other than the default
                              stream. <samp class="ph codeph">cudaStreamSynchronize()</samp> blocks the CPU thread
                              until all CUDA calls previously issued into the given stream have
                              completed.  <samp class="ph codeph">cudaEventSynchronize()</samp> blocks until a
                              given event in a particular stream has been recorded by the GPU.
                              Because the driver may interleave execution of CUDA calls from other
                              non-default streams, calls in other streams may be included in the
                              timing.
                           </p>
                           <p class="p">Because the default stream, stream 0, exhibits serializing behavior
                              for work on the device (an operation in the default stream can begin
                              only after all preceding calls in any stream have completed; and no
                              subsequent operation in any stream can begin until it finishes), these
                              functions can be used reliably for timing in the default stream.
                           </p>
                           <p class="p">Be aware that CPU-to-GPU synchronization points such as those
                              mentioned in this section imply a stall in the GPU's processing
                              pipeline and should thus be used sparingly to minimize their
                              performance impact.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="using-cuda-gpu-timers"><a name="using-cuda-gpu-timers" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#using-cuda-gpu-timers" name="using-cuda-gpu-timers" shape="rect">8.1.2.&nbsp;Using CUDA GPU Timers</a></h3>
                        <div class="body conbody">
                           <div class="section">
                              <p class="p">The CUDA event API provides calls that create and destroy events,
                                 record events (via timestamp), and convert timestamp differences into a
                                 floating-point value in milliseconds. <a class="xref" href="index.html#using-cuda-gpu-timers__how-to-time-code-using-cuda-events" shape="rect">How to time code using CUDA events</a>
                                 illustrates their use.
                              </p>
                           </div>
                           <div class="example" id="using-cuda-gpu-timers__how-to-time-code-using-cuda-events"><a name="using-cuda-gpu-timers__how-to-time-code-using-cuda-events" shape="rect">
                                 <!-- --></a><h4 class="title sectiontitle">How to time code using CUDA events</h4><pre xml:space="preserve">cudaEvent_t start, stop;
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> time;

cudaEventCreate(&amp;start);
cudaEventCreate(&amp;stop);

cudaEventRecord( start, 0 );
kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>grid,threads<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span> ( d_odata, d_idata, size_x, size_y, 
                           NUM_REPS);
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );

cudaEventElapsedTime( &amp;time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );</pre></div>
                           <div class="section">
                              <p class="p">Here <samp class="ph codeph">cudaEventRecord()</samp> is used to place the
                                 <samp class="ph codeph">start</samp> and <samp class="ph codeph">stop</samp> events into the
                                 default stream, stream 0. The device will record a timestamp for the
                                 event when it reaches that event in the stream. The
                                 <samp class="ph codeph">cudaEventElapsedTime()</samp> function returns the time
                                 elapsed between the recording of the <samp class="ph codeph">start</samp> and
                                 <samp class="ph codeph">stop</samp> events. This value is expressed in milliseconds
                                 and has a resolution of approximately half a microsecond. Like the
                                 other calls in this listing, their specific operation, parameters, and
                                 return values are described in the <em class="ph i">CUDA Toolkit Reference
                                    Manual</em>. Note that the timings are measured on the GPU clock, so the
                                 timing resolution is operating-system-independent.
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="bandwidth"><a name="bandwidth" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#bandwidth" name="bandwidth" shape="rect">8.2.&nbsp;Bandwidth</a></h3>
                     <div class="body conbody">
                        <p class="p">Bandwidth - the rate at which data can be transferred - is one of
                           the most important gating factors for performance. Almost all changes
                           to code should be made in the context of how they affect bandwidth. As
                           described in <a class="xref" href="index.html#memory-optimizations" shape="rect">Memory Optimizations</a> of this guide,
                           bandwidth can be dramatically affected by the choice of memory in which
                           data is stored, how the data is laid out and the order in which it is
                           accessed, as well as other factors.
                        </p>
                        <p class="p">To measure performance accurately, it is useful to calculate
                           theoretical and effective bandwidth. When the latter is much lower than
                           the former, design or implementation details are likely to reduce
                           bandwidth, and it should be the primary goal of subsequent optimization
                           efforts to increase it.
                        </p>
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> Use the effective bandwidth of your
                           computation as a metric when measuring performance and optimization
                           benefits.
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="theoretical-bandwidth-calculation"><a name="theoretical-bandwidth-calculation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#theoretical-bandwidth-calculation" name="theoretical-bandwidth-calculation" shape="rect">8.2.1.&nbsp;Theoretical Bandwidth Calculation</a></h3>
                        <div class="body conbody">
                           <p class="p">Theoretical bandwidth can be calculated using hardware
                              specifications available in the product literature. For example, the
                              NVIDIA Tesla M2090 uses GDDR5 (double data rate) RAM with a memory
                              clock rate of 1.85 GHz and a 384-bit-wide memory interface.
                           </p>
                           <p class="p">Using these data items, the peak theoretical memory bandwidth of the
                              NVIDIA Tesla M2090 is 177.6 GB/s:
                           </p>
                           <p class="p d4p_eqn_block">
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mrow>
                                    <mrow>
                                       <mrow>
                                          <mo>(</mo>
                                          <mn>1.85</mn>
                                          <mo>×</mo>
                                          <msup>
                                             <mrow>
                                                <mn>10</mn>
                                             </mrow>
                                             <mrow>
                                                <mn>9</mn>
                                             </mrow>
                                          </msup>
                                       </mrow>
                                       <mo>×</mo>
                                       <mrow>
                                          <mo>(</mo>
                                          <mn>384</mn>
                                          <mo>/</mo>
                                          <mn>8</mn>
                                          <mo>)</mo>
                                       </mrow>
                                       <mo>×</mo>
                                       <mn>2</mn>
                                       <mo>)</mo>
                                    </mrow>
                                    <mo>÷</mo>
                                    <msup>
                                       <mrow>
                                          <mn>10</mn>
                                       </mrow>
                                       <mrow>
                                          <mn>9</mn>
                                       </mrow>
                                    </msup>
                                    <mo>=</mo>
                                    <mn>177.6</mn>
                                    <mo>⁢</mo>
                                    <mtext></mtext>
                                    <mtext>GB/s</mtext>
                                    <mo>⁡</mo>
                                 </mrow>
                              </math>
                           </p>
                           <p class="p">In this calculation, the memory clock rate is converted in to Hz,
                              multiplied by the interface width (divided by 8, to convert bits to
                              bytes) and multiplied by 2 due to the double data rate. Finally, this
                              product is divided by 10<sup class="ph sup">9</sup> to convert the result to GB/s.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span> Some calculations use 1024<sup class="ph sup">3</sup> instead of
                              10<sup class="ph sup">9</sup> for the final calculation. In such a case, the bandwidth
                              would be 165.4GB/s. It is important to use the same divisor when
                              calculating theoretical and effective bandwidth so that the comparison
                              is valid.
                           </div>
                           <div class="note note"><span class="notetitle">Note:</span> When ECC is enabled, the effective maximum bandwidth is reduced
                              by approximately 20% due to the additional traffic for the memory
                              checksums, though the exact impact of ECC on bandwidth depends on the
                              memory access pattern.
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="effective-bandwidth-calculation"><a name="effective-bandwidth-calculation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#effective-bandwidth-calculation" name="effective-bandwidth-calculation" shape="rect">8.2.2.&nbsp;Effective Bandwidth Calculation</a></h3>
                        <div class="body conbody">
                           <p class="p">Effective bandwidth is calculated by timing specific program
                              activities and by knowing how data is accessed by the program. To do
                              so, use this equation:
                           </p>
                           <p class="p d4p_eqn_block">
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mrow>
                                    <mtext>Effective bandwidth</mtext>
                                    <mo>=</mo>
                                    <mrow>
                                       <mo>(</mo>
                                       <mrow>
                                          <mo>(</mo>
                                          <msub>
                                             <mrow>
                                                <mi>B</mi>
                                             </mrow>
                                             <mrow>
                                                <mi>r</mi>
                                             </mrow>
                                          </msub>
                                          <mo>+</mo>
                                          <msub>
                                             <mrow>
                                                <mi>B</mi>
                                             </mrow>
                                             <mrow>
                                                <mi>w</mi>
                                             </mrow>
                                          </msub>
                                          <mo>)</mo>
                                          <mo>÷</mo>
                                          <msup>
                                             <mrow>
                                                <mn>10</mn>
                                             </mrow>
                                             <mrow>
                                                <mn>9</mn>
                                             </mrow>
                                          </msup>
                                       </mrow>
                                       <mo>)</mo>
                                    </mrow>
                                    <mo>÷</mo>
                                    <mtext>time</mtext>
                                 </mrow>
                              </math>
                           </p>
                           <p class="p">Here, the effective bandwidth is in units of GB/s, B<sub class="ph sub">r</sub> is
                              the number of bytes read per kernel, B<sub class="ph sub">w</sub> is the number of
                              bytes written per kernel, and time is given in seconds.
                           </p>
                           <p class="p">For example, to compute the effective bandwidth of a 2048 x 2048
                              matrix copy, the following formula could be used:
                           </p>
                           <p class="p d4p_eqn_block">
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mrow>
                                    <mtext>Effective bandwidth</mtext>
                                    <mo>=</mo>
                                    <mrow>
                                       <mo>(</mo>
                                       <mrow>
                                          <mo>(</mo>
                                          <msup>
                                             <mrow>
                                                <mn>2048</mn>
                                             </mrow>
                                             <mrow>
                                                <mn>2</mn>
                                             </mrow>
                                          </msup>
                                          <mo>×</mo>
                                          <mn>4</mn>
                                          <mo>×</mo>
                                          <mn>2</mn>
                                          <mo>)</mo>
                                          <mo>÷</mo>
                                          <msup>
                                             <mrow>
                                                <mn>10</mn>
                                             </mrow>
                                             <mrow>
                                                <mn>9</mn>
                                             </mrow>
                                          </msup>
                                       </mrow>
                                       <mo>)</mo>
                                    </mrow>
                                    <mo>÷</mo>
                                    <mtext>time</mtext>
                                 </mrow>
                              </math>
                           </p>
                           <p class="p">The number of elements is multiplied by the size of each element (4
                              bytes for a float), multiplied by 2 (because of the read <em class="ph i">and</em>
                              write), divided by 10<sup class="ph sup">9</sup> (or 1,024<sup class="ph sup">3</sup>) to obtain GB
                              of memory transferred. This number is divided by the time in seconds to
                              obtain GB/s.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="throughput-reported-by-visual-profiler"><a name="throughput-reported-by-visual-profiler" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#throughput-reported-by-visual-profiler" name="throughput-reported-by-visual-profiler" shape="rect">8.2.3.&nbsp;Throughput Reported by Visual Profiler</a></h3>
                        <div class="body conbody">
                           <p class="p">For devices with <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute capability</a> of 2.0
                              or greater, the Visual Profiler can be used to collect several
                              different memory throughput measures. The following throughput metrics
                              can be displayed in the Details or Detail Graphs view:
                           </p>
                           <ul class="ul">
                              <li class="li">Requested Global Load Throughput</li>
                              <li class="li">Requested Global Store Throughput</li>
                              <li class="li">Global Load Throughput</li>
                              <li class="li">Global Store Throughput</li>
                              <li class="li">DRAM Read Throughput</li>
                              <li class="li">DRAM Write Throughput</li>
                           </ul>
                           <p class="p">The Requested Global Load Throughput and Requested Global Store
                              Throughput values indicate the global memory throughput requested by
                              the kernel and therefore correspond to the effective bandwidth obtained
                              by the calculation shown under <a class="xref" href="index.html#effective-bandwidth-calculation" shape="rect">Effective Bandwidth Calculation</a>.
                           </p>
                           <p class="p">Because the minimum memory transaction size is larger than most word
                              sizes, the actual memory throughput required for a kernel can include
                              the transfer of data not used by the kernel. For global memory
                              accesses, this actual throughput is reported by the Global Load
                              Throughput and Global Store Throughput values.
                           </p>
                           <p class="p">It's important to note that both numbers are useful. The actual
                              memory throughput shows how close the code is to the hardware limit,
                              and a comparison of the effective or requested bandwidth to the actual
                              bandwidth presents a good estimate of how much bandwidth is wasted by
                              suboptimal coalescing of memory accesses (see <a class="xref" href="index.html#coalesced-access-to-global-memory" shape="rect">Coalesced Access to Global Memory</a>). For global memory
                              accesses, this comparison of requested memory bandwidth to actual
                              memory bandwidth is reported by the Global Memory Load Efficiency and
                              Global Memory Store Efficiency metrics.
                           </p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="memory-optimizations"><a name="memory-optimizations" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#memory-optimizations" name="memory-optimizations" shape="rect">9.&nbsp;Memory Optimizations</a></h2>
                  <div class="body conbody">
                     <p class="p">Memory optimizations are the most important area for performance.
                        The goal is to maximize the use of the hardware by maximizing
                        bandwidth. Bandwidth is best served by using as much fast memory and as
                        little slow-access memory as possible. This chapter discusses the
                        various kinds of memory on the host and device and how best to set up
                        data items to use the memory effectively.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="data-transfer-between-host-and-device"><a name="data-transfer-between-host-and-device" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#data-transfer-between-host-and-device" name="data-transfer-between-host-and-device" shape="rect">9.1.&nbsp;Data Transfer Between Host and Device</a></h3>
                     <div class="body conbody">
                        <p class="p">The peak theoretical bandwidth between the device memory and the GPU
                           is much higher (177.6 GB/s on the NVIDIA Tesla M2090, for example) than
                           the peak theoretical bandwidth between host memory and device memory (8
                           GB/s on the PCIe x16 Gen2). Hence, for best overall application
                           performance, it is important to minimize data transfer between the host
                           and the device, even if that means running kernels on the GPU that do
                           not demonstrate any speedup compared with running them on the host
                           CPU.
                        </p>
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> Minimize data transfer between the host and
                           the device, even if it means running some kernels on the device that do
                           not show performance gains when compared with running them on the host
                           CPU.
                        </div>
                        <p class="p">Intermediate data structures should be created in device memory,
                           operated on by the device, and destroyed without ever being mapped by
                           the host or copied to host memory.
                        </p>
                        <p class="p">Also, because of the overhead associated with each transfer,
                           batching many small transfers into one larger transfer performs
                           significantly better than making each transfer separately, even
                           if doing so requires packing non-contiguous regions of memory into
                           a contiguous buffer and then unpacking after the transfer.
                        </p>
                        <p class="p">Finally, higher bandwidth between the host and the device is
                           achieved when using <dfn class="term">page-locked</dfn> (or <dfn class="term">pinned</dfn>)
                           memory, as discussed in the <cite class="cite">CUDA C Programming Guide</cite> and
                           the <a class="xref" href="index.html#pinned-memory" shape="rect">Pinned Memory</a> section of this document.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="pinned-memory"><a name="pinned-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#pinned-memory" name="pinned-memory" shape="rect">9.1.1.&nbsp;Pinned Memory</a></h3>
                        <div class="body conbody">
                           <p class="p">Page-locked or pinned memory transfers attain the highest bandwidth
                              between the host and the device. On PCIe x16 Gen2 cards, for example,
                              pinned memory can attain roughly 6GB/s transfer rates.
                           </p>
                           <p class="p">Pinned memory is allocated using the
                              <samp class="ph codeph">cudaHostAlloc()</samp> functions in the Runtime API. The
                              <samp class="ph codeph">bandwidthTest</samp> CUDA Sample
                              shows how to use these functions as well as how to measure memory
                              transfer performance.
                           </p>
                           <p class="p">For regions of system memory that have already been pre-allocated,
                              <samp class="ph codeph">cudaHostRegister()</samp> can be used to pin the memory
                              on-the-fly without the need to allocate a separate buffer and copy
                              the data into it.
                           </p>
                           <p class="p">Pinned memory should not be overused. Excessive use can reduce
                              overall system performance because pinned memory is a scarce resource,
                              but how much is too much is difficult to know in advance. Furthermore,
                              the pinning of system memory is a heavyweight operation compared to
                              most normal system memory allocations, so as with all optimizations,
                              test the application and the systems it runs on for optimal
                              performance parameters.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="asynchronous-transfers-and-overlapping-transfers-with-computation"><a name="asynchronous-transfers-and-overlapping-transfers-with-computation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#asynchronous-transfers-and-overlapping-transfers-with-computation" name="asynchronous-transfers-and-overlapping-transfers-with-computation" shape="rect">9.1.2.&nbsp;Asynchronous and Overlapping Transfers with Computation</a></h3>
                        <div class="body conbody">
                           <div class="section">
                              <p class="p">Data transfers between the host and the device using
                                 <samp class="ph codeph">cudaMemcpy()</samp> are blocking transfers; that is, control
                                 is returned to the host thread only after the data transfer is
                                 complete. The <samp class="ph codeph">cudaMemcpyAsync()</samp> function is a
                                 non-blocking variant of <samp class="ph codeph">cudaMemcpy()</samp> in which control
                                 is returned immediately to the host thread.  In contrast with
                                 <samp class="ph codeph">cudaMemcpy()</samp>, the asynchronous transfer version
                                 <em class="ph i">requires</em> pinned host memory (see <a class="xref" href="index.html#pinned-memory" shape="rect">Pinned Memory</a>), and it contains an additional argument, a stream ID.
                                 A <dfn class="term">stream</dfn> is simply a sequence of operations that are
                                 performed in order on the device.  Operations in different streams can
                                 be interleaved and in some cases overlapped - a property that can be
                                 used to hide data transfers between the host and the device.
                              </p>
                              <p class="p">Asynchronous transfers enable overlap of data transfers with
                                 computation in two different ways.  On all CUDA-enabled devices, it is
                                 possible to overlap host computation with asynchronous data transfers
                                 and with device computations. For example, <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__overlapping-computation-and-data-transfers" shape="rect">Overlapping computation and data transfers</a>
                                 demonstrates how host computation in the routine
                                 <samp class="ph codeph">cpuFunction()</samp> is performed while data is transferred
                                 to the device and a kernel using the device is executed.
                              </p>
                           </div>
                           <div class="example" id="asynchronous-transfers-and-overlapping-transfers-with-computation__overlapping-computation-and-data-transfers"><a name="asynchronous-transfers-and-overlapping-transfers-with-computation__overlapping-computation-and-data-transfers" shape="rect">
                                 <!-- --></a><h4 class="title sectiontitle">Overlapping computation and data transfers</h4><pre xml:space="preserve">cudaMemcpyAsync(a_d, a_h, size, cudaMemcpyHostToDevice, 0);
kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>grid, block<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(a_d);
cpuFunction();</pre></div>
                           <div class="section">
                              <p class="p">The last argument to the <samp class="ph codeph">cudaMemcpyAsync()</samp> function
                                 is the stream ID, which in this case uses the default stream, stream 0.
                                 The kernel also uses the default stream, and it will not begin
                                 execution until the memory copy completes; therefore, no explicit
                                 synchronization is needed. Because the memory copy and the kernel both
                                 return control to the host immediately, the host function
                                 <samp class="ph codeph">cpuFunction()</samp> overlaps their execution.
                              </p>
                              <p class="p">In <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__overlapping-computation-and-data-transfers" shape="rect">Overlapping computation and data transfers</a>,
                                 the memory copy and kernel execution occur sequentially. On devices
                                 that are capable of concurrent copy and compute, it is possible to
                                 overlap kernel execution on the device with data transfers between the
                                 host and the device. Whether a device has this capability is indicated
                                 by the <samp class="ph codeph">asyncEngineCount</samp> field of the
                                 <samp class="ph codeph">cudaDeviceProp</samp> structure (or listed in the output of
                                 the <samp class="ph codeph">deviceQuery</samp> CUDA Sample). On devices that have this
                                 capability, the overlap once again requires pinned host memory, and, in
                                 addition, the data transfer and kernel must use different, non-default
                                 streams (streams with non-zero stream IDs). Non-default streams are
                                 required for this overlap because memory copy, memory set functions,
                                 and kernel calls that use the default stream begin only after all
                                 preceding calls on the device (in any stream) have completed, and no
                                 operation on the device (in any stream) commences until they are
                                 finished.
                              </p>
                              <p class="p"><a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__concurrent-copy-and-execute" shape="rect">Concurrent copy and execute</a>
                                 illustrates the basic technique.
                              </p>
                           </div>
                           <div class="example" id="asynchronous-transfers-and-overlapping-transfers-with-computation__concurrent-copy-and-execute"><a name="asynchronous-transfers-and-overlapping-transfers-with-computation__concurrent-copy-and-execute" shape="rect">
                                 <!-- --></a><h4 class="title sectiontitle">Concurrent copy and execute</h4><pre xml:space="preserve">cudaStreamCreate(&amp;stream1);
cudaStreamCreate(&amp;stream2);
cudaMemcpyAsync(a_d, a_h, size, cudaMemcpyHostToDevice, stream1);
kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>grid, block, 0, stream2<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(otherData_d);</pre></div>
                           <div class="section">
                              <p class="p">In this code, two streams are created and used in the data transfer
                                 and kernel executions as specified in the last arguments of the
                                 <samp class="ph codeph">cudaMemcpyAsync</samp> call and the kernel's execution
                                 configuration.
                              </p>
                              <p class="p"><a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__concurrent-copy-and-execute" shape="rect">Concurrent copy and execute</a>
                                 demonstrates how to overlap kernel execution with asynchronous data
                                 transfer. This technique could be used when the data dependency is such
                                 that the data can be broken into chunks and transferred in multiple
                                 stages, launching multiple kernels to operate on each chunk as it
                                 arrives. <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__sequential-copy-and-execute" shape="rect">Sequential copy and execute</a>
                                 and <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__staged-concurrent-copy-and-execute" shape="rect">Staged concurrent copy and execute</a>
                                 demonstrate this. They produce equivalent results. The first segment
                                 shows the reference sequential implementation, which transfers and
                                 operates on an array of <em class="ph i">N</em> floats (where <em class="ph i">N</em> is assumed to
                                 be evenly divisible by nThreads).
                              </p>
                           </div>
                           <div class="example" id="asynchronous-transfers-and-overlapping-transfers-with-computation__sequential-copy-and-execute"><a name="asynchronous-transfers-and-overlapping-transfers-with-computation__sequential-copy-and-execute" shape="rect">
                                 <!-- --></a><h4 class="title sectiontitle">Sequential copy and execute</h4><pre xml:space="preserve">cudaMemcpy(a_d, a_h, N*<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>), dir);
kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>N/nThreads, nThreads<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(a_d);</pre></div>
                           <div class="section">
                              <p class="p"><a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__staged-concurrent-copy-and-execute" shape="rect">Staged concurrent copy and execute</a>
                                 shows how the transfer and kernel execution can be broken up into
                                 nStreams stages. This approach permits some overlapping of the data
                                 transfer and execution.
                              </p>
                           </div>
                           <div class="example" id="asynchronous-transfers-and-overlapping-transfers-with-computation__staged-concurrent-copy-and-execute"><a name="asynchronous-transfers-and-overlapping-transfers-with-computation__staged-concurrent-copy-and-execute" shape="rect">
                                 <!-- --></a><h4 class="title sectiontitle">Staged concurrent copy and execute</h4><pre xml:space="preserve">size=N*<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>)/nStreams;
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (i=0; i&lt;nStreams; i++) {
    offset = i*N/nStreams;
    cudaMemcpyAsync(a_d+offset, a_h+offset, size, dir, stream[i]);
    kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>N/(nThreads*nStreams), nThreads, 0,  
             stream[i]<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(a_d+offset);
}</pre></div>
                           <div class="section">
                              <p class="p">(In <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__staged-concurrent-copy-and-execute" shape="rect">Staged concurrent copy and execute</a>,
                                 it is assumed that <em class="ph i">N</em> is evenly divisible by
                                 <samp class="ph codeph">nThreads*nStreams</samp>.) Because execution within a stream
                                 occurs sequentially, none of the kernels will launch until the data
                                 transfers in their respective streams complete. Current GPUs can
                                 simultaneously process asynchronous data transfers and execute kernels.
                                 GPUs with a single copy engine can perform one asynchronous data
                                 transfer and execute kernels whereas GPUs with two copy engines can
                                 simultaneously perform one asynchronous data transfer from the host to
                                 the device, one asynchronous data transfer from the device to the host,
                                 and execute kernels. The number of copy engines on a GPU is given by
                                 the <samp class="ph codeph">asyncEngineCount</samp> field of the
                                 <samp class="ph codeph">cudaDeviceProp</samp> structure, which is also listed in the
                                 output of the <samp class="ph codeph">deviceQuery</samp> CUDA Sample. (It should be
                                 mentioned that it is not possible to overlap a blocking transfer with
                                 an asynchronous transfer, because the blocking transfer occurs in the
                                 default stream, so it will not begin until all previous CUDA calls
                                 complete. It will not allow any other CUDA call to begin until it has
                                 completed.) A diagram depicting the timeline of execution for the two
                                 code segments is shown in <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__timeline-comparison-for-copy-and-kernel-execution" shape="rect">Figure 1</a>,
                                 and <samp class="ph codeph">nStreams</samp> is equal to 4 for <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation__staged-concurrent-copy-and-execute" shape="rect">Staged concurrent copy and execute</a>
                                 in the bottom half of the figure.
                              </p>
                              <div class="fig fignone" id="asynchronous-transfers-and-overlapping-transfers-with-computation__timeline-comparison-for-copy-and-kernel-execution"><a name="asynchronous-transfers-and-overlapping-transfers-with-computation__timeline-comparison-for-copy-and-kernel-execution" shape="rect">
                                    <!-- --></a><span class="figcap">Figure 1. Timeline comparison for copy and kernel execution</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/timeline-comparison-for-copy-and-kernel-execution.png" alt="Timeline comparison for copy and kernel execution."></img></div><br clear="none"></br><dl class="dl">
                                    <dt class="dt dlterm">Top</dt>
                                    <dd class="dd">Sequential</dd>
                                    <dt class="dt dlterm">Bottom</dt>
                                    <dd class="dd">Concurrent</dd>
                                 </dl>
                              </div>
                              <p class="p">For this example, it is assumed that the data transfer and kernel
                                 execution times are comparable. In such cases, and when the execution
                                 time (<em class="ph i">t<sub class="ph sub">E</sub></em>) exceeds the transfer time
                                 (<em class="ph i">t<sub class="ph sub">T</sub></em>), a rough estimate for the overall time is
                                 <em class="ph i">t<sub class="ph sub">E</sub> + t<sub class="ph sub">T</sub>/nStreams</em> for the staged version
                                 versus <em class="ph i">t<sub class="ph sub">E</sub> + t<sub class="ph sub">T</sub></em> for the sequential version.
                                 If the transfer time exceeds the execution time, a rough estimate for
                                 the overall time is <em class="ph i">t<sub class="ph sub">T</sub> + t<sub class="ph sub">E</sub>/nStreams</em>.
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="zero-copy"><a name="zero-copy" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#zero-copy" name="zero-copy" shape="rect">9.1.3.&nbsp;Zero Copy</a></h3>
                        <div class="body conbody">
                           <div class="section">
                              <p class="p"><dfn class="term">Zero copy</dfn> is a feature that was added in version 2.2 of
                                 the CUDA Toolkit. It enables GPU threads to directly access host
                                 memory. For this purpose, it requires mapped pinned (non-pageable)
                                 memory. On integrated GPUs (i.e., GPUs with the integrated field of the
                                 CUDA device properties structure set to 1), mapped pinned memory is
                                 always a performance gain because it avoids superfluous copies as
                                 integrated GPU and CPU memory are physically the same.  On discrete
                                 GPUs, mapped pinned memory is advantageous only in certain cases.
                                 Because the data is not cached on the GPU, mapped pinned memory should
                                 be read or written only once, and the global loads and stores that read
                                 and write the memory should be coalesced. Zero copy can be used in
                                 place of streams because kernel-originated data transfers automatically
                                 overlap kernel execution without the overhead of setting up and
                                 determining the optimal number of streams.
                              </p>
                              <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Low Priority:</strong> Use zero-copy operations on integrated GPUs
                                 for CUDA Toolkit version 2.2 and later.
                              </div>
                              <p class="p">The host code in <a class="xref" href="index.html#zero-copy__zero-copy-host-code" shape="rect">Zero-copy host code</a>
                                 shows how zero copy is typically set up.
                              </p>
                           </div>
                           <div class="example" id="zero-copy__zero-copy-host-code"><a name="zero-copy__zero-copy-host-code" shape="rect">
                                 <!-- --></a><h4 class="title sectiontitle">Zero-copy host code</h4><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *a_h, *a_map;
...
cudaGetDeviceProperties(&amp;prop, 0);
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (!prop.canMapHostMemory) 
    exit(0);
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaHostAlloc(&amp;a_h, nBytes, cudaHostAllocMapped);
cudaHostGetDevicePointer(&amp;a_map, a_h, 0);
kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>gridSize, blockSize<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(a_map);</pre></div>
                           <div class="section">
                              <p class="p">In this code, the <samp class="ph codeph">canMapHostMemory</samp> field of the
                                 structure returned by <samp class="ph codeph">cudaGetDeviceProperties()</samp> is
                                 used to check that the device supports mapping host memory to the
                                 device's address space. Page-locked memory mapping is enabled by
                                 calling <samp class="ph codeph">cudaSetDeviceFlags()</samp> with
                                 <samp class="ph codeph">cudaDeviceMapHost</samp>. Note that
                                 <samp class="ph codeph">cudaSetDeviceFlags()</samp> must be called prior to setting a
                                 device or making a CUDA call that requires state (that is, essentially,
                                 before a context is created). Page-locked mapped host memory is
                                 allocated using <samp class="ph codeph">cudaHostAlloc()</samp>, and the pointer to
                                 the mapped device address space is obtained via the function
                                 <samp class="ph codeph">cudaHostGetDevicePointer()</samp>. In the code in <a class="xref" href="index.html#zero-copy__zero-copy-host-code" shape="rect">Zero-copy host code</a>, <samp class="ph codeph">kernel()</samp> can
                                 reference the mapped pinned host memory using the pointer
                                 <samp class="ph codeph">a_map</samp> in exactly the same was as it would if a_map
                                 referred to a location in device memory.
                              </p>
                              <div class="note note"><span class="notetitle">Note:</span> Mapped pinned host memory allows you to overlap CPU-GPU memory
                                 transfers with computation while avoiding the use of CUDA streams. But
                                 since any repeated access to such memory areas causes repeated PCIe
                                 transfers, consider creating a second area in device memory to manually
                                 cache the previously read host memory data.
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="unified-virtual-addressing"><a name="unified-virtual-addressing" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#unified-virtual-addressing" name="unified-virtual-addressing" shape="rect">9.1.4.&nbsp;Unified Virtual Addressing</a></h3>
                        <div class="body conbody">
                           <p class="p">Devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                 capability</a> 2.0 and later support a special addressing mode
                              called <dfn class="term">Unified Virtual Addressing</dfn> (UVA) on 64-bit Linux,
                              Mac OS, and Windows XP and on Windows Vista/7 when using TCC driver
                              mode. With UVA, the host memory and the device memories of all
                              installed supported devices share a single virtual address space.
                           </p>
                           <p class="p">Prior to UVA, an application had to keep track of which pointers
                              referred to device memory (and for which device) and which referred to
                              host memory as a separate bit of metadata (or as hard-coded information
                              in the program) for each pointer. Using UVA, on the other hand, the
                              physical memory space to which a pointer points can be determined
                              simply by inspecting the value of the pointer using
                              <samp class="ph codeph">cudaPointerGetAttributes()</samp>.
                           </p>
                           <p class="p">Under UVA, pinned host memory allocated with
                              <samp class="ph codeph">cudaHostAlloc()</samp> will have identical host and device
                              pointers, so it is not necessary to call
                              <samp class="ph codeph">cudaHostGetDevicePointer()</samp> for such allocations. Host
                              memory allocations pinned after-the-fact via
                              <samp class="ph codeph">cudaHostRegister()</samp>, however, will continue to have
                              different device pointers than their host pointers, so
                              <samp class="ph codeph">cudaHostGetDevicePointer()</samp> remains necessary in that
                              case.
                           </p>
                           <p class="p">UVA is also a necessary precondition for enabling peer-to-peer (P2P)
                              transfer of data directly across the PCIe bus for supported GPUs in
                              supported configurations, bypassing host memory.
                           </p>
                           <p class="p">See the <cite class="cite">CUDA C Programming Guide</cite> for further
                              explanations and software requirements for UVA and P2P.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="device-memory-spaces"><a name="device-memory-spaces" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#device-memory-spaces" name="device-memory-spaces" shape="rect">9.2.&nbsp;Device Memory Spaces</a></h3>
                     <div class="body conbody">
                        <p class="p">CUDA devices use several memory spaces, which have different
                           characteristics that reflect their distinct usages in CUDA
                           applications.  These memory spaces include global, local, shared,
                           texture, and registers, as shown in <a class="xref" href="index.html#device-memory-spaces__memory-spaces-cuda-device" shape="rect">Figure 2</a>.
                        </p>
                        <div class="fig fignone" id="device-memory-spaces__memory-spaces-cuda-device"><a name="device-memory-spaces__memory-spaces-cuda-device" shape="rect">
                              <!-- --></a><span class="figcap">Figure 2. Memory spaces on a CUDA device</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/memory-spaces-on-cuda-device.png" alt="Memory spaces on a CUDA device."></img></div><br clear="none"></br></div>
                        <p class="p">Of these different memory spaces, global memory is the most
                           plentiful; see <cite class="cite">Features and Technical Specifications</cite> of
                           the <cite class="cite">CUDA C Programming Guide</cite> for the amounts of memory
                           available in each memory space at each <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                              capability</a> level.
                           Global, local, and texture memory have the greatest access latency,
                           followed by constant memory, shared memory, and the register file.
                        </p>
                        <p class="p">The various principal traits of the memory types are shown in <a class="xref" href="index.html#device-memory-spaces__salient-features-device-memory" shape="rect">Table 1</a>.
                        </p>
                        <div class="tablenoborder"><a name="device-memory-spaces__salient-features-device-memory" shape="rect">
                              <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="device-memory-spaces__salient-features-device-memory" class="table" frame="border" border="1" rules="all">
                              <caption><span class="tablecap">Table 1. Salient Features of Device Memory</span></caption>
                              <thead class="thead" align="left">
                                 <tr class="row">
                                    <th class="entry" valign="top" width="12.5%" id="d54e2211" rowspan="1" colspan="1">Memory</th>
                                    <th class="entry" valign="top" width="12.5%" id="d54e2214" rowspan="1" colspan="1">Location on/off chip</th>
                                    <th class="entry" valign="top" width="12.5%" id="d54e2217" rowspan="1" colspan="1">Cached</th>
                                    <th class="entry" valign="top" width="12.5%" id="d54e2220" rowspan="1" colspan="1">Access</th>
                                    <th class="entry" valign="top" width="25%" id="d54e2223" rowspan="1" colspan="1">Scope</th>
                                    <th class="entry" valign="top" width="25%" id="d54e2227" rowspan="1" colspan="1">Lifetime</th>
                                 </tr>
                              </thead>
                              <tbody class="tbody">
                                 <tr class="row">
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2211" rowspan="1" colspan="1">Register</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2214" rowspan="1" colspan="1">On</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2217" rowspan="1" colspan="1">n/a</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2220" rowspan="1" colspan="1">R/W</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2223" rowspan="1" colspan="1">1 thread</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2227" rowspan="1" colspan="1">Thread</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2211" rowspan="1" colspan="1">Local</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2214" rowspan="1" colspan="1">Off</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2217" rowspan="1" colspan="1">Yes††</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2220" rowspan="1" colspan="1">R/W</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2223" rowspan="1" colspan="1">1 thread</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2227" rowspan="1" colspan="1">Thread</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2211" rowspan="1" colspan="1">Shared</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2214" rowspan="1" colspan="1">On</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2217" rowspan="1" colspan="1">n/a</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2220" rowspan="1" colspan="1">R/W</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2223" rowspan="1" colspan="1">All threads in block</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2227" rowspan="1" colspan="1">Block</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2211" rowspan="1" colspan="1">Global</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2214" rowspan="1" colspan="1">Off</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2217" rowspan="1" colspan="1">†</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2220" rowspan="1" colspan="1">R/W</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2223" rowspan="1" colspan="1">All threads + host</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2227" rowspan="1" colspan="1">Host allocation</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2211" rowspan="1" colspan="1">Constant</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2214" rowspan="1" colspan="1">Off</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2217" rowspan="1" colspan="1">Yes</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2220" rowspan="1" colspan="1">R</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2223" rowspan="1" colspan="1">All threads + host</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2227" rowspan="1" colspan="1">Host allocation</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2211" rowspan="1" colspan="1">Texture</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2214" rowspan="1" colspan="1">Off</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2217" rowspan="1" colspan="1">Yes</td>
                                    <td class="entry" valign="top" width="12.5%" headers="d54e2220" rowspan="1" colspan="1">R</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2223" rowspan="1" colspan="1">All threads + host</td>
                                    <td class="entry" valign="top" width="25%" headers="d54e2227" rowspan="1" colspan="1">Host allocation</td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" colspan="6" valign="top" headers="d54e2211 d54e2214 d54e2217 d54e2220 d54e2223 d54e2227" rowspan="1"><sup class="ph sup">†</sup> Cached
                                       in L1 and L2 by default on devices of compute capability 2.x;
                                       cached only in L2 by default on devices of higher compute
                                       capabilities, though some allow opt-in to caching in L1 as well
                                       via compilation flags.
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" colspan="6" valign="top" headers="d54e2211 d54e2214 d54e2217 d54e2220 d54e2223 d54e2227" rowspan="1"><sup class="ph sup">††</sup> Cached
                                       in L1 and L2 by default on devices of compute capability 2.x
                                       and 3.x; devices of compute capability 5.x cache locals only in L2.
                                    </td>
                                 </tr>
                              </tbody>
                           </table>
                        </div>
                        <p class="p">In the case of texture access, if a texture reference is bound to a
                           linear array in global memory, then the device code can write to the
                           underlying array. Texture references that are bound to CUDA arrays can
                           be written to via surface-write operations by binding a surface to the
                           same underlying CUDA array storage).  Reading from a texture while
                           writing to its underlying global memory array in the same kernel launch
                           should be avoided because the texture caches are read-only and are not
                           invalidated when the associated global memory is modified.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="coalesced-access-to-global-memory"><a name="coalesced-access-to-global-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#coalesced-access-to-global-memory" name="coalesced-access-to-global-memory" shape="rect">9.2.1.&nbsp;Coalesced Access to Global Memory</a></h3>
                        <div class="body conbody">
                           <p class="p">Perhaps the single most important performance consideration in
                              programming for CUDA-capable GPU architectures is the coalescing of
                              global memory accesses. Global memory loads and stores by threads of a
                              warp are
                              coalesced by the device into as few as one transaction when certain
                              access requirements are met.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> Ensure global memory accesses are coalesced
                              whenever possible.
                           </div>
                           <p class="p">The access requirements for coalescing depend on the compute
                              capability of the device and are documented in the <cite class="cite">CUDA C
                                 Programming Guide</cite>.
                           </p>
                           <p class="p">For devices of compute capability 2.x, the requirements can be
                              summarized quite easily: the concurrent accesses of the threads of a
                              warp will coalesce into a number of transactions equal to the number of
                              cache lines necessary to service all of the threads of the warp. By
                              default, all accesses are cached through L1, which as 128-byte lines.
                              For scattered access patterns, to reduce overfetch, it can sometimes be
                              useful to cache only in L2, which caches shorter 32-byte segments (see
                              the <cite class="cite">CUDA C Programming Guide</cite>).
                           </p>
                           <p class="p">For devices of compute capability 3.x, accesses to global
                              memory are cached only in L2; L1 is reserved for local memory accesses.
                              Some devices of compute capability 3.5, 3.7, or 5.2 allow opt-in
                              caching of globals in L1 as well.
                           </p>
                           <p class="p">Accessing memory in a coalesced way is even more important when ECC
                              is turned on. Scattered accesses increase ECC memory transfer overhead,
                              especially when writing data to the global memory.
                           </p>
                           <p class="p">Coalescing concepts are illustrated in the following simple
                              examples. These examples assume compute capability 2.x. These examples
                              assume that accesses are cached through L1, which is the default
                              behavior on those devices, and that accesses are for 4-byte words,
                              unless otherwise noted.
                           </p>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="simple-access-pattern"><a name="simple-access-pattern" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#simple-access-pattern" name="simple-access-pattern" shape="rect">9.2.1.1.&nbsp;A Simple Access Pattern</a></h3>
                           <div class="body conbody">
                              <p class="p">The first and simplest case of coalescing can be achieved by any
                                 CUDA-enabled device: the <em class="ph i">k</em>-th thread accesses the <em class="ph i">k</em>-th
                                 word in a cache line. Not all threads need to participate.
                              </p>
                              <p class="p">For example, if the threads of a warp access adjacent 4-byte words
                                 (e.g., adjacent <samp class="ph codeph">float</samp> values), a single 128B L1 cache
                                 line and therefore a single coalesced transaction will service that
                                 memory access. Such a pattern is shown in <a class="xref" href="index.html#simple-access-pattern__coalesced-access" shape="rect">Figure 3</a>.
                              </p>
                              <div class="fig fignone" id="simple-access-pattern__coalesced-access"><a name="simple-access-pattern__coalesced-access" shape="rect">
                                    <!-- --></a><span class="figcap">Figure 3. Coalesced access - all threads access one cache line</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/coalesced-access.png" alt="Coalesced access - all threads access one cache line."></img></div><br clear="none"></br></div>
                              <p class="p">This access pattern results in a single 128-byte L1 transaction,
                                 indicated by the red rectangle.
                              </p>
                              <p class="p">If some words of the line had not been requested by any thread (such
                                 as if several threads had accessed the same word or if some threads did
                                 not participate in the access), all data in the cache line is fetched
                                 anyway. Furthermore, if accesses by the threads of the warp had been
                                 permuted within this segment, still only one 128-byte L1 transaction
                                 would have been performed by a device with <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                    capability</a> 2.x.
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="sequential-but-misaligned-access-pattern"><a name="sequential-but-misaligned-access-pattern" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#sequential-but-misaligned-access-pattern" name="sequential-but-misaligned-access-pattern" shape="rect">9.2.1.2.&nbsp;A Sequential but Misaligned Access Pattern</a></h3>
                           <div class="body conbody">
                              <p class="p">If sequential threads in a warp access memory that is sequential but
                                 not aligned with the cache lines, two 128-byte L1 cache will be
                                 requested, as shown in <a class="xref" href="index.html#sequential-but-misaligned-access-pattern__unaligned-sequential-addresses-two-128byte-l1-cache" shape="rect">Figure 4</a>.
                              </p>
                              <div class="fig fignone" id="sequential-but-misaligned-access-pattern__unaligned-sequential-addresses-two-128byte-l1-cache"><a name="sequential-but-misaligned-access-pattern__unaligned-sequential-addresses-two-128byte-l1-cache" shape="rect">
                                    <!-- --></a><span class="figcap">Figure 4. Unaligned sequential addresses that fit into two
                                    128-byte L1-cache lines</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/unaligned-sequential-addresses.png" alt="Unaligned sequential addresses that fit into two                         128-byte L1-cache lines."></img></div><br clear="none"></br></div>
                              <p class="p">For non-caching transactions (i.e., those that bypass L1 and use
                                 only the L2 cache), a similar effect is seen, except at the level of
                                 the 32-byte L2 segments. In <a class="xref" href="index.html#sequential-but-misaligned-access-pattern__misaligned-sequential-addresses-fall-5-32-byte-L2-cache-seqments" shape="rect">Figure 5</a>,
                                 we see an example of this: the same access pattern from <a class="xref" href="index.html#sequential-but-misaligned-access-pattern__unaligned-sequential-addresses-two-128byte-l1-cache" shape="rect">Figure 4</a>
                                 is used, but now L1 caching is disabled, so now five 32-byte L2
                                 segments are needed to satisfy the request.
                              </p>
                              <div class="fig fignone" id="sequential-but-misaligned-access-pattern__misaligned-sequential-addresses-fall-5-32-byte-L2-cache-seqments"><a name="sequential-but-misaligned-access-pattern__misaligned-sequential-addresses-fall-5-32-byte-L2-cache-seqments" shape="rect">
                                    <!-- --></a><span class="figcap">Figure 5. Misaligned sequential addresses that fall within five
                                    32-byte L2-cache segments</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/misaligned-sequential-addresses.png" alt="Misaligned sequential addresses that fall within five                         32-byte L2-cache segments."></img></div><br clear="none"></br></div>
                              <p class="p">Memory allocated through the CUDA Runtime API, such as via
                                 <samp class="ph codeph">cudaMalloc()</samp>, is guaranteed to be aligned to at least
                                 256 bytes. Therefore, choosing sensible thread block sizes, such as
                                 multiples of the warp size (i.e., 32 on current GPUs), facilitates
                                 memory accesses by warps that are aligned to cache lines. (Consider
                                 what would happen to the memory addresses accessed by the second,
                                 third, and subsequent thread blocks if the thread block size was not a
                                 multiple of warp size, for example.)
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="effects-of-misaligned-accesses"><a name="effects-of-misaligned-accesses" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#effects-of-misaligned-accesses" name="effects-of-misaligned-accesses" shape="rect">9.2.1.3.&nbsp;Effects of Misaligned Accesses</a></h3>
                           <div class="body conbody">
                              <div class="section">
                                 <p class="p">It is easy and informative to explore the ramifications of
                                    misaligned accesses using a simple copy kernel, such as the one in
                                    <a class="xref" href="index.html#effects-of-misaligned-accesses__copy-kernel-illustrates-misaligned-accesses" shape="rect">A copy kernel that illustrates misaligned accesses</a>.
                                 </p>
                              </div>
                              <div class="example" id="effects-of-misaligned-accesses__copy-kernel-illustrates-misaligned-accesses"><a name="effects-of-misaligned-accesses__copy-kernel-illustrates-misaligned-accesses" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">A copy kernel that illustrates misaligned accesses</h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> offsetCopy(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *odata, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* idata, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> offset)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> xid = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x + offset;
    odata[xid] = idata[xid];
}</pre></div>
                              <div class="section">
                                 <p class="p">In <a class="xref" href="index.html#effects-of-misaligned-accesses__copy-kernel-illustrates-misaligned-accesses" shape="rect">A copy kernel that illustrates misaligned accesses</a>,
                                    data is copied from the input array <samp class="ph codeph">idata</samp> to the
                                    output array, both of which exist in global memory. The kernel is
                                    executed within a loop in host code that varies the parameter
                                    <samp class="ph codeph">offset</samp> from 0 to 32. (<a class="xref" href="index.html#sequential-but-misaligned-access-pattern__unaligned-sequential-addresses-two-128byte-l1-cache" shape="rect">Figure 4</a>
                                    and <a class="xref" href="index.html#sequential-but-misaligned-access-pattern__unaligned-sequential-addresses-two-128byte-l1-cache" shape="rect">Figure 4</a>
                                    correspond to misalignments in the cases of caching and non-caching
                                    memory accesses, respectively.) The effective bandwidth for the copy
                                    with various offsets on an NVIDIA Tesla M2090 (<a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                       capability</a> 2.0, with ECC turned on, as it is by default) is shown in <a class="xref" href="index.html#effects-of-misaligned-accesses__performance-offsetcopy-kernel" shape="rect">Figure 6</a>.
                                 </p>
                                 <div class="fig fignone" id="effects-of-misaligned-accesses__performance-offsetcopy-kernel"><a name="effects-of-misaligned-accesses__performance-offsetcopy-kernel" shape="rect">
                                       <!-- --></a><span class="figcap">Figure 6. Performance of offsetCopy kernel</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/performance-of-offsetcopy-kernel.png" alt="Performance of offsetCopy kernel."></img></div><br clear="none"></br></div>
                                 <p class="p">For the NVIDIA Tesla M2090, global memory accesses with no offset or
                                    with offsets that are multiples of 32 words result in a single L1 cache
                                    line transaction or 4 L2 cache segment loads (for non-L1-caching
                                    loads). The achieved bandwidth is approximately 130GB/s. Otherwise,
                                    either two L1 cache lines (caching mode) or four to five L2 cache
                                    segments (non-caching mode) are loaded per warp, resulting in
                                    approximately 4/5<sup class="ph sup">th</sup> of the memory throughput achieved with
                                    no offsets.
                                 </p>
                                 <p class="p">An interesting point is that we might expect the caching case to
                                    perform worse than the non-caching case for this sample, given that
                                    each warp in the caching case fetches twice as many bytes as it
                                    requires, whereas in the non-caching case, only 5/4 as many bytes as
                                    required are fetched per warp. In this particular example, that effect
                                    is not apparent, however, because adjacent warps reuse the cache lines
                                    their neighbors fetched. So while the impact is still evident in the
                                    case of caching loads, it is not as great as we might have expected. It
                                    would have been more so if adjacent warps had not exhibited such a high
                                    degree of reuse of the over-fetched cache lines.
                                 </p>
                              </div>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="strided-accesses"><a name="strided-accesses" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#strided-accesses" name="strided-accesses" shape="rect">9.2.1.4.&nbsp;Strided Accesses</a></h3>
                           <div class="body conbody">
                              <div class="section">
                                 <p class="p">As seen above, in the case of misaligned sequential accesses, the
                                    caches of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                       capability</a> 2.x devices help a lot to achieve
                                    reasonable performance. It may be different with non-unit-strided
                                    accesses, however, and this is a pattern that occurs frequently when
                                    dealing with multidimensional data or matrices. For this reason,
                                    ensuring that as much as possible of the data in each cache line
                                    fetched is actually used is an important part of performance
                                    optimization of memory accesses on these devices.
                                 </p>
                                 <p class="p">To illustrate the effect of strided access on effective bandwidth,
                                    see the kernel <samp class="ph codeph">strideCopy()</samp> in <a class="xref" href="index.html#strided-accesses__kernel-illustrate-non-unit-stride-data-copy" shape="rect">A kernel to illustrate non-unit stride data copy</a>,
                                    which copies data with a stride of stride elements between threads from
                                    <samp class="ph codeph">idata</samp> to <samp class="ph codeph">odata</samp>.
                                 </p>
                              </div>
                              <div class="example" id="strided-accesses__kernel-illustrate-non-unit-stride-data-copy"><a name="strided-accesses__kernel-illustrate-non-unit-stride-data-copy" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">A kernel to illustrate non-unit stride data copy</h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> strideCopy(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *odata, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* idata, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> stride)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> xid = (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x*<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x)*stride;
    odata[xid] = idata[xid];
}</pre></div>
                              <div class="section">
                                 <p class="p"><a class="xref" href="index.html#strided-accesses__adjacent-threads-accessing-memory-with-stride-of-2" shape="rect">Figure 7</a>
                                    illustrates such a situation; in this case, threads within a warp
                                    access words in memory with a stride of 2. This action leads to a load
                                    of two L1 cache lines (or eight L2 cache segments in non-caching mode)
                                    per warp on the Tesla M2090 (compute capability 2.0).
                                 </p>
                                 <div class="fig fignone" id="strided-accesses__adjacent-threads-accessing-memory-with-stride-of-2"><a name="strided-accesses__adjacent-threads-accessing-memory-with-stride-of-2" shape="rect">
                                       <!-- --></a><span class="figcap">Figure 7. Adjacent threads accessing memory with a stride of 2</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/adjacent-threads-accessing-memory-with-stride-of-2.png" alt="Adjacent threads accessing memory with a stride of 2."></img></div><br clear="none"></br></div>
                                 <p class="p">A stride of 2 results in a 50% of load/store efficiency since half
                                    the elements in the transaction are not used and represent wasted
                                    bandwidth. As the stride increases, the effective bandwidth decreases
                                    until the point where 32 lines of cache are loaded for the 32 threads
                                    in a warp, as indicated in <a class="xref" href="index.html#strided-accesses__performance-of-stridecopy-kernel" shape="rect">Figure 8</a>.
                                 </p>
                                 <div class="fig fignone" id="strided-accesses__performance-of-stridecopy-kernel"><a name="strided-accesses__performance-of-stridecopy-kernel" shape="rect">
                                       <!-- --></a><span class="figcap">Figure 8. Performance of strideCopy kernel</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/performance-of-stridecopy-kernel.png" alt="Performance of strideCopy kernel."></img></div><br clear="none"></br></div>
                                 <p class="p">As illustrated in <a class="xref" href="index.html#strided-accesses__performance-of-stridecopy-kernel" shape="rect">Figure 8</a>,
                                    non-unit-stride global memory accesses should be avoided whenever
                                    possible. One method for doing so utilizes shared memory, which is
                                    discussed in the next section.
                                 </p>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="shared-memory"><a name="shared-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory" name="shared-memory" shape="rect">9.2.2.&nbsp;Shared Memory</a></h3>
                        <div class="body conbody">
                           <p class="p">Because it is on-chip, shared memory has much higher bandwidth and
                              lower latency than local and global memory - provided there are no bank
                              conflicts between the threads, as detailed in the following
                              section.
                           </p>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="shared-memory-and-memory-banks"><a name="shared-memory-and-memory-banks" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory-and-memory-banks" name="shared-memory-and-memory-banks" shape="rect">9.2.2.1.&nbsp;Shared Memory and Memory Banks</a></h3>
                           <div class="body conbody">
                              <p class="p">To achieve high memory bandwidth for concurrent accesses, shared
                                 memory is divided into equally sized memory modules
                                 (<dfn class="term">banks</dfn>) that can be accessed simultaneously. Therefore,
                                 any memory load or store of <em class="ph i">n</em> addresses that spans <em class="ph i">n</em>
                                 distinct memory banks can be serviced simultaneously, yielding an
                                 effective bandwidth that is <em class="ph i">n</em> times as high as the bandwidth of
                                 a single bank.
                              </p>
                              <p class="p">However, if multiple addresses of a memory request map to the same
                                 memory bank, the accesses are serialized. The hardware splits a memory
                                 request that has bank conflicts into as many separate conflict-free
                                 requests as necessary, decreasing the effective bandwidth by a factor
                                 equal to the number of separate memory requests. The one exception here
                                 is when multiple threads in a warp address the same shared memory
                                 location, resulting in a broadcast. Devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                    capability</a> 2.x and higher have the additional ability to
                                 multicast shared memory accesses (i.e. to send copies of the same value
                                 to several threads of the warp).
                              </p>
                              <p class="p">To minimize bank conflicts, it is important to understand how memory
                                 addresses map to memory banks and how to optimally schedule memory
                                 requests.
                              </p>
                              <div class="section">
                                 <h5 class="title sectiontitle">Compute Capability 2.x</h5>
                                 <p class="p">On devices of compute capability 2.x, each bank has a bandwidth of
                                    32 bits every two clock cycles, and successive 32-bit words are
                                    assigned to successive banks. The warp size is 32 threads and the
                                    number of banks is also 32, so bank conflicts can occur between any
                                    threads in the warp. See <cite class="cite">Compute Capability 2.x</cite> in the
                                    <cite class="cite">CUDA C Programming Guide</cite> for further details.
                                 </p>
                              </div>
                              <div class="section">
                                 <h5 class="title sectiontitle">Compute Capability 3.x</h5>
                                 <p class="p">On devices of compute capability 3.x, each bank has a bandwidth of
                                    64 bits every clock cycle (*). There are two different banking modes:
                                    either successive 32-bit words (in 32-bit mode) or successive 64-bit
                                    words (64-bit mode) are assigned to successive banks. The warp size is
                                    32 threads and the number of banks is also 32, so bank conflicts can
                                    occur between any threads in the warp. See <cite class="cite">Compute Capability
                                       3.x</cite> in the <cite class="cite">CUDA C Programming Guide</cite> for further
                                    details.
                                 </p>
                                 <div class="note note"><span class="notetitle">Note:</span> (*) However, devices of compute capability 3.x typically have
                                    lower clock frequencies than devices of compute capability 2.x
                                    for improved power efficiency.
                                 </div>
                              </div>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="shared-memory-in-matrix-multiplication-c-ab"><a name="shared-memory-in-matrix-multiplication-c-ab" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory-in-matrix-multiplication-c-ab" name="shared-memory-in-matrix-multiplication-c-ab" shape="rect">9.2.2.2.&nbsp;Shared Memory in Matrix Multiplication (C=AB)</a></h3>
                           <div class="body conbody">
                              <div class="section">
                                 <p class="p">Shared memory enables cooperation between threads in a block. When
                                    multiple threads in a block use the same data from global memory,
                                    shared memory can be used to access the data from global memory only
                                    once. Shared memory can also be used to avoid uncoalesced memory
                                    accesses by loading and storing data in a coalesced pattern from global
                                    memory and then reordering it in shared memory. Aside from memory bank
                                    conflicts, there is no penalty for non-sequential or unaligned accesses
                                    by a warp in shared memory.
                                 </p>
                                 <p class="p">The use of shared memory is illustrated via the simple example of a
                                    matrix multiplication C = AB for the case with A of dimension Mxw, B of
                                    dimension wxN, and C of dimension MxN.  To keep the kernels simple, M
                                    and N are multiples of 32, and w is 32 for devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                       capability</a> 2.0 or higher.
                                 </p>
                                 <p class="p">A natural decomposition of the problem is to use a block and tile size
                                    of wxw threads. Therefore, in terms of wxw tiles, A is a column matrix,
                                    B is a row matrix, and C is their outer product; see <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__block-column-matrix-A-multiplied-block-row-matrix-B-product-matrix-c" title="Block-column matrix (A) multiplied by block-row matrix (B) with resulting product matrix (C)." shape="rect">Figure 9</a>.
                                    A grid of N/w by M/w blocks is launched, where each thread block
                                    calculates the elements of a different tile in C from a single tile of
                                    A and a single tile of B.
                                 </p>
                                 <div class="fig fignone" id="shared-memory-in-matrix-multiplication-c-ab__block-column-matrix-A-multiplied-block-row-matrix-B-product-matrix-c"><a name="shared-memory-in-matrix-multiplication-c-ab__block-column-matrix-A-multiplied-block-row-matrix-B-product-matrix-c" shape="rect">
                                       <!-- --></a><span class="figcap">Figure 9. Block-column matrix multiplied by block-row matrix</span>. <span class="desc figdesc">Block-column matrix (A) multiplied by block-row matrix (B) with
                                       resulting product matrix (C).</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/matrix-multiplication-block-column-by-block-row.png" alt="Block-column matrix multiplied by block-row matrix."></img></div><br clear="none"></br></div>
                                 <p class="p">To do this, the <samp class="ph codeph">simpleMultiply</samp> kernel (<a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__unoptimized-matrix-multiplication" shape="rect">Unoptimized matrix multiplication</a>)
                                    calculates the output elements of a tile of matrix C.
                                 </p>
                              </div>
                              <div class="example" id="shared-memory-in-matrix-multiplication-c-ab__unoptimized-matrix-multiplication"><a name="shared-memory-in-matrix-multiplication-c-ab__unoptimized-matrix-multiplication" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">Unoptimized matrix multiplication</h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> simpleMultiply(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *a, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* b, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *c,
                               <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> N)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> row = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.y * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.y + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> col = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> sum = 0.0f;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i = 0; i &lt; TILE_DIM; i++) {
        sum += a[row*TILE_DIM+i] * b[i*N+col];
    }
    c[row*N+col] = sum;
}</pre></div>
                              <div class="section">
                                 <p class="p">In <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__unoptimized-matrix-multiplication" shape="rect">Unoptimized matrix multiplication</a>,
                                    <samp class="ph codeph">a</samp>, <samp class="ph codeph">b</samp>, and <samp class="ph codeph">c</samp> are
                                    pointers to global memory for the matrices A, B, and C, respectively;
                                    <samp class="ph codeph">blockDim.x</samp>, <samp class="ph codeph">blockDim.y</samp>, and
                                    <samp class="ph codeph">TILE_DIM</samp> are all equal to w. Each thread in the
                                    wxw-thread block calculates one element in a tile of C.
                                    <samp class="ph codeph">row</samp> and <samp class="ph codeph">col</samp> are the row and column of
                                    the element in C being calculated by a particular thread. The
                                    <samp class="ph codeph">for</samp> loop over <samp class="ph codeph">i</samp> multiplies a row of A
                                    by a column of B, which is then written to C.
                                 </p>
                                 <p class="p">The effective bandwidth of this kernel is only 6.6GB/s on an NVIDIA
                                    Tesla K20X (with ECC off). To analyze performance, it is necessary to
                                    consider how warps access global memory in the <samp class="ph codeph">for</samp>
                                    loop. Each warp of threads calculates one row of a tile of C, which
                                    depends on a single row of A and an entire tile of B as illustrated in
                                    <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__computing-row-c-tile-c-row-a-tile-b" title="Computing a row of a tile in C using one row of A and an entire tile of B." shape="rect">Figure 10</a>.
                                 </p>
                                 <div class="fig fignone" id="shared-memory-in-matrix-multiplication-c-ab__computing-row-c-tile-c-row-a-tile-b"><a name="shared-memory-in-matrix-multiplication-c-ab__computing-row-c-tile-c-row-a-tile-b" shape="rect">
                                       <!-- --></a><span class="figcap">Figure 10. Computing a row of a tile</span>. <span class="desc figdesc">Computing a row of a tile in C using one row of A and an entire
                                       tile of B.</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/computing-row-of-tile.png" alt="Computing a row of a tile."></img></div><br clear="none"></br></div>
                                 <p class="p">For each iteration <em class="ph i">i</em> of the <samp class="ph codeph">for</samp> loop, the
                                    threads in a warp read a row of the B tile, which is a sequential and
                                    coalesced access for all compute capabilities.
                                 </p>
                                 <p class="p">However, for each iteration <em class="ph i">i</em>, all threads in a warp read the
                                    same value from global memory for matrix A, as the index
                                    <samp class="ph codeph">row*TILE_DIM+i</samp> is constant within a warp. Even though
                                    such an access requires only 1 transaction on devices of compute
                                    capability 2.0 or higher, there is wasted bandwidth in the transaction, because
                                    only one 4-byte word out of 32 words in the cache line is used. We can
                                    reuse this cache line in subsequent iterations of the loop, and we
                                    would eventually utilize all 32 words; however, when many warps execute
                                    on the same multiprocessor simultaneously, as is generally the case,
                                    the cache line may easily be evicted from the cache between iterations
                                    <em class="ph i">i</em> and <em class="ph i">i+1</em>.
                                 </p>
                                 <p class="p">The performance on a device of any compute capability can be
                                    improved by reading a tile of A into shared memory as shown in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__using-shared-memory-improve-global-memory-load-efficiency-matrix-multiplication" shape="rect">Using shared memory to improve the global memory load efficiency in matrix multiplication</a>.
                                 </p>
                              </div>
                              <div class="example" id="shared-memory-in-matrix-multiplication-c-ab__using-shared-memory-improve-global-memory-load-efficiency-matrix-multiplication"><a name="shared-memory-in-matrix-multiplication-c-ab__using-shared-memory-improve-global-memory-load-efficiency-matrix-multiplication" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">Using shared memory to improve the global memory load efficiency
                                    in matrix multiplication
                                 </h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> coalescedMultiply(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *a, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* b, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *c,
                                  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> N)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__shared__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> aTile[TILE_DIM][TILE_DIM];

    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> row = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.y * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.y + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> col = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> sum = 0.0f;
    aTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = a[row*TILE_DIM+<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i = 0; i &lt; TILE_DIM; i++) {
        sum += aTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][i]* b[i*N+col];
    }
    c[row*N+col] = sum;
}
</pre></div>
                              <div class="section">
                                 <p class="p">In <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__using-shared-memory-improve-global-memory-load-efficiency-matrix-multiplication" shape="rect">Using shared memory to improve the global memory load efficiency in matrix multiplication</a>,
                                    each element in a tile of A is read from global memory only once, in a
                                    fully coalesced fashion (with no wasted bandwidth), to shared memory.
                                    Within each iteration of the <samp class="ph codeph">for</samp> loop, a value in
                                    shared memory is broadcast to all threads in a warp. No
                                    <samp class="ph codeph">__syncthreads()</samp>synchronization barrier call is needed
                                    after reading the tile of A into shared memory because only threads
                                    within the warp that write the data into shared memory read the data
                                    (Note: in lieu of <samp class="ph codeph">__syncthreads()</samp>, the
                                    <samp class="ph codeph">__shared__ array</samp> may need to be marked as
                                    <samp class="ph codeph">volatile</samp> for correctness on devices of compute
                                    capability 2.0 or higher; see the <cite class="cite">NVIDIA Fermi Compatibility Guide</cite>).
                                    This kernel has an effective bandwidth of 7.8GB/s on an NVIDIA Tesla
                                    K20X. This illustrates the use of the shared memory as a
                                    <dfn class="term">user-managed cache</dfn> when the hardware L1 cache eviction
                                    policy does not match up well with the needs of the application
                                    or when L1 cache is not used for reads from global memory.
                                 </p>
                                 <p class="p">A further improvement can be made to how <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__using-shared-memory-improve-global-memory-load-efficiency-matrix-multiplication" shape="rect">Using shared memory to improve the global memory load efficiency in matrix multiplication</a>
                                    deals with matrix B. In calculating each of the rows of a tile of
                                    matrix C, the entire tile of B is read. The repeated reading of the B
                                    tile can be eliminated by reading it into shared memory once (<a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__improvement-reading-additional-data-shared-memory" shape="rect">Improvement by reading additional data into shared memory</a>).
                                 </p>
                              </div>
                              <div class="example" id="shared-memory-in-matrix-multiplication-c-ab__improvement-reading-additional-data-shared-memory"><a name="shared-memory-in-matrix-multiplication-c-ab__improvement-reading-additional-data-shared-memory" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">Improvement by reading additional data into shared memory</h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> sharedABMultiply(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *a, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* b, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *c,
                                 <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> N)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__shared__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> aTile[TILE_DIM][TILE_DIM],
                     bTile[TILE_DIM][TILE_DIM];
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> row = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.y * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.y + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> col = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> sum = 0.0f;
    aTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = a[row*TILE_DIM+<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];
    bTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = b[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y*N+col];
    __syncthreads();
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i = 0; i &lt; TILE_DIM; i++) {
        sum += aTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][i]* bTile[i][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];
    }
    c[row*N+col] = sum;
}</pre></div>
                              <div class="section">
                                 <p class="p">Note that in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__improvement-reading-additional-data-shared-memory" shape="rect">Improvement by reading additional data into shared memory</a>,
                                    a <samp class="ph codeph">__syncthreads()</samp> call is required after reading the B
                                    tile because a warp reads data from shared memory that were written to
                                    shared memory by different warps. The effective bandwidth of this
                                    routine is 14.9 GB/s on an NVIDIA Tesla K20X. Note that the
                                    performance improvement is not due to improved coalescing in either
                                    case, but to avoiding redundant transfers from global memory.
                                 </p>
                                 <p class="p">The results of the various optimizations are summarized in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__performance-improvements-optimizing-c-ab-matrix" shape="rect">Table 2</a>.
                                 </p>
                                 <div class="tablenoborder"><a name="shared-memory-in-matrix-multiplication-c-ab__performance-improvements-optimizing-c-ab-matrix" shape="rect">
                                       <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="shared-memory-in-matrix-multiplication-c-ab__performance-improvements-optimizing-c-ab-matrix" class="table" frame="border" border="1" rules="all">
                                       <caption><span class="tablecap">Table 2. Performance Improvements Optimizing C = AB Matrix
                                             Multiply</span></caption>
                                       <thead class="thead" align="left">
                                          <tr class="row">
                                             <th class="entry" valign="top" width="50%" id="d54e2992" rowspan="1" colspan="1">Optimization</th>
                                             <th class="entry" valign="top" width="50%" id="d54e2995" rowspan="1" colspan="1">NVIDIA Tesla K20X</th>
                                          </tr>
                                       </thead>
                                       <tbody class="tbody">
                                          <tr class="row">
                                             <td class="entry" valign="top" width="50%" headers="d54e2992" rowspan="1" colspan="1">No optimization</td>
                                             <td class="entry" valign="top" width="50%" headers="d54e2995" rowspan="1" colspan="1">6.6 GB/s</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="50%" headers="d54e2992" rowspan="1" colspan="1">Coalesced using shared memory to store a tile of
                                                A
                                             </td>
                                             <td class="entry" valign="top" width="50%" headers="d54e2995" rowspan="1" colspan="1">7.8 GB/s</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="50%" headers="d54e2992" rowspan="1" colspan="1">Using shared memory to eliminate redundant reads
                                                of a tile of B
                                             </td>
                                             <td class="entry" valign="top" width="50%" headers="d54e2995" rowspan="1" colspan="1">14.9 GB/s</td>
                                          </tr>
                                       </tbody>
                                    </table>
                                 </div>
                                 <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Medium Priority:</strong> Use shared memory to avoid redundant
                                    transfers from global memory.
                                 </div>
                              </div>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="shared-memory-in-matrix-multiplication-c-aa"><a name="shared-memory-in-matrix-multiplication-c-aa" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory-in-matrix-multiplication-c-aa" name="shared-memory-in-matrix-multiplication-c-aa" shape="rect">9.2.2.3.&nbsp;Shared Memory in Matrix Multiplication (C=AA<sup class="ph sup">T</sup>)</a></h3>
                           <div class="body conbody">
                              <div class="section">
                                 <p class="p">A variant of the previous matrix multiplication can be used to
                                    illustrate how strided accesses to global memory, as well as shared
                                    memory bank conflicts, are handled. This variant simply uses the
                                    transpose of A in place of B, so C = AA<sup class="ph sup">T</sup>.
                                 </p>
                                 <p class="p">A simple implementation for C = AA<sup class="ph sup">T</sup> is shown in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-aa__unoptimized-handling-strided-accesses-global-memory" shape="rect">Unoptimized handling of strided accesses to global memory</a></p>
                              </div>
                              <div class="example" id="shared-memory-in-matrix-multiplication-c-aa__unoptimized-handling-strided-accesses-global-memory"><a name="shared-memory-in-matrix-multiplication-c-aa__unoptimized-handling-strided-accesses-global-memory" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">Unoptimized handling of strided accesses to global memory</h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> simpleMultiply(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *a, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *c, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> M)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> row = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.y * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.y + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> col = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> sum = 0.0f;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i = 0; i &lt; TILE_DIM; i++) {
        sum += a[row*TILE_DIM+i] * a[col*TILE_DIM+i];
    }
    c[row*M+col] = sum;
}</pre></div>
                              <div class="section">
                                 <p class="p">In <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-aa__unoptimized-handling-strided-accesses-global-memory" shape="rect">Unoptimized handling of strided accesses to global memory</a>,
                                    the <em class="ph i">row</em>-th, <em class="ph i">col</em>-th element of C is obtained by taking the
                                    dot product of the <dfn class="term">row</dfn>-th and <dfn class="term">col</dfn>-th rows of
                                    A. The effective bandwidth for this kernel is 3.64 GB/s on an NVIDIA
                                    Tesla M2090. These results are substantially lower than the
                                    corresponding measurements for the C = AB kernel. The difference is in
                                    how threads in a half warp access elements of A in the second term,
                                    <samp class="ph codeph">a[col*TILE_DIM+i]</samp>, for each iteration
                                    <samp class="ph codeph">i</samp>. For a warp of threads, <samp class="ph codeph">col</samp>
                                    represents sequential columns of the transpose of A, and therefore
                                    <samp class="ph codeph">col*TILE_DIM</samp> represents a strided access of global
                                    memory with a stride of w, resulting in plenty of wasted bandwidth.
                                 </p>
                                 <p class="p">The way to avoid strided access is to use shared memory as before,
                                    except in this case a warp reads a row of A into a column of a shared
                                    memory tile, as shown in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-aa__optimized-version-coalesced-reads-global-memory" shape="rect">An optimized handling of strided accesses using coalesced reads from global memory</a>.
                                 </p>
                              </div>
                              <div class="example" id="shared-memory-in-matrix-multiplication-c-aa__optimized-version-coalesced-reads-global-memory"><a name="shared-memory-in-matrix-multiplication-c-aa__optimized-version-coalesced-reads-global-memory" shape="rect">
                                    <!-- --></a><h5 class="title sectiontitle">An optimized handling of strided accesses using coalesced reads
                                    from global memory
                                 </h5><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> coalescedMultiply(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *a, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *c, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> M)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__shared__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> aTile[TILE_DIM][TILE_DIM],
                     transposedTile[TILE_DIM][TILE_DIM];
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> row = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.y * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.y + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> col = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> sum = 0.0f;
    aTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = a[row*TILE_DIM+<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];
    transposedTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y] =
        a[(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x*<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y)*TILE_DIM +
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];  
    __syncthreads();
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i = 0; i &lt; TILE_DIM; i++) {
        sum += aTile[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.y][i]* transposedTile[i][<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];
    }
    c[row*M+col] = sum;
}</pre></div>
                              <div class="section">
                                 <p class="p"><a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-aa__optimized-version-coalesced-reads-global-memory" shape="rect">An optimized handling of strided accesses using coalesced reads from global memory</a>
                                    uses the shared <samp class="ph codeph">transposedTile</samp> to avoid uncoalesced
                                    accesses in the second term in the dot product and the shared
                                    <samp class="ph codeph">aTile</samp> technique from the previous example to avoid
                                    uncoalesced accesses in the first term. The effective bandwidth of this
                                    kernel is 27.5 GB/s on an NVIDIA Tesla M2090.These results are slightly
                                    lower than those obtained by the final kernel for C = AB. The cause of
                                    the difference is shared memory bank conflicts.
                                 </p>
                                 <p class="p">The reads of
                                    elements in <samp class="ph codeph">transposedTile</samp> within the for loop are
                                    free of conflicts, because threads of each half warp read across rows
                                    of the tile, resulting in unit stride across the banks. However, bank
                                    conflicts occur when copying the tile from global memory into shared
                                    memory. To enable the loads from global memory to be coalesced, data
                                    are read from global memory sequentially. However, this requires
                                    writing to shared memory in columns, and because of the use of wxw
                                    tiles in shared memory, this results in a stride between threads of w
                                    banks - every thread of the warp hits the same bank. (Recall that w is
                                    selected as 32 for devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                       capability</a> 2.0 or higher.) These many-way bank conflicts
                                    are very expensive.  The simple remedy is to pad the shared memory
                                    array so that it has an extra column, as in the following line of
                                    code.
                                 </p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__shared__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> transposedTile[TILE_DIM][TILE_DIM+1];</pre><p class="p">This padding eliminates the conflicts entirely, because now the
                                    stride between threads is w+1 banks (i.e., 33 for current devices),
                                    which, due to modulo arithmetic used to compute
                                    bank indices, is equivalent to a unit stride. After this change, the
                                    effective bandwidth is 39.2 GB/s on an NVIDIA Tesla M2090, which is
                                    comparable to the results from the last C = AB kernel.
                                 </p>
                                 <p class="p">The results of these optimizations are summarized in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-aa__performance-improvements-optimizing-c" shape="rect">Table 3</a>.
                                 </p>
                                 <div class="tablenoborder"><a name="shared-memory-in-matrix-multiplication-c-aa__performance-improvements-optimizing-c" shape="rect">
                                       <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="shared-memory-in-matrix-multiplication-c-aa__performance-improvements-optimizing-c" class="table" frame="border" border="1" rules="all">
                                       <caption><span class="tablecap">Table 3. Performance Improvements Optimizing C = AA<sup class="ph sup">T</sup> Matrix Multiplication</span></caption>
                                       <thead class="thead" align="left">
                                          <tr class="row">
                                             <th class="entry" valign="top" width="50%" id="d54e3176" rowspan="1" colspan="1">Optimization</th>
                                             <th class="entry" valign="top" width="50%" id="d54e3179" rowspan="1" colspan="1">NVIDIA Tesla M2090</th>
                                          </tr>
                                       </thead>
                                       <tbody class="tbody">
                                          <tr class="row">
                                             <td class="entry" valign="top" width="50%" headers="d54e3176" rowspan="1" colspan="1">No optimization</td>
                                             <td class="entry" valign="top" width="50%" headers="d54e3179" rowspan="1" colspan="1">3.6 GB/s</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="50%" headers="d54e3176" rowspan="1" colspan="1">Using shared memory to coalesce global reads</td>
                                             <td class="entry" valign="top" width="50%" headers="d54e3179" rowspan="1" colspan="1">27.5 GB/s</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="50%" headers="d54e3176" rowspan="1" colspan="1">Removing bank conflicts</td>
                                             <td class="entry" valign="top" width="50%" headers="d54e3179" rowspan="1" colspan="1">39.2 GB/s</td>
                                          </tr>
                                       </tbody>
                                    </table>
                                 </div>
                                 <p class="p">These results should be compared with those in <a class="xref" href="index.html#shared-memory-in-matrix-multiplication-c-ab__performance-improvements-optimizing-c-ab-matrix" shape="rect">Table 2</a>.
                                    As can be seen from these tables, judicious use of shared memory can
                                    dramatically improve performance.
                                 </p>
                                 <p class="p">The examples in this section have illustrated three reasons to use
                                    shared memory:
                                 </p>
                                 <ul class="ul">
                                    <li class="li">To enable coalesced accesses to global memory, especially to
                                       avoid large strides (for general matrices, strides are much
                                       larger than 32)
                                    </li>
                                    <li class="li">To eliminate (or reduce) redundant loads from global
                                       memory
                                    </li>
                                    <li class="li">To avoid wasted bandwidth</li>
                                 </ul>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="local-memory"><a name="local-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#local-memory" name="local-memory" shape="rect">9.2.3.&nbsp;Local Memory</a></h3>
                        <div class="body conbody">
                           <p class="p">Local memory is so named because its scope is local to the thread,
                              not because of its physical location. In fact, local memory is
                              off-chip. Hence, access to local memory is as expensive as access to
                              global memory.  In other words, the term
                              <dfn class="term">local</dfn> in the name does not imply faster access.
                           </p>
                           <p class="p">Local memory is used only to hold automatic variables. This is done
                              by the <samp class="ph codeph">nvcc</samp> compiler when it determines that there is
                              insufficient register space to hold the variable. Automatic variables
                              that are likely to be placed in local memory are large structures or
                              arrays that would consume too much register space and arrays that the
                              compiler determines may be indexed dynamically.
                           </p>
                           <p class="p">Inspection of the PTX assembly code (obtained by compiling with
                              <samp class="ph codeph">-ptx</samp> or <samp class="ph codeph">-keep</samp> command-line options to
                              <samp class="ph codeph">nvcc</samp>) reveals whether a variable has been placed in
                              local memory during the first compilation phases. If it has, it will be
                              declared using the <samp class="ph codeph">.local</samp> mnemonic and accessed using
                              the <samp class="ph codeph">ld.local</samp> and <samp class="ph codeph">st.local</samp> mnemonics.
                              If it has not, subsequent compilation phases might still decide
                              otherwise, if they find the variable consumes too much register space
                              for the targeted architecture. There is no way to check this for a
                              specific variable, but the compiler reports total local memory usage
                              per kernel (lmem) when run with the<samp class="ph codeph"> --ptxas-options=-v</samp>
                              option.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="texture-memory"><a name="texture-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#texture-memory" name="texture-memory" shape="rect">9.2.4.&nbsp;Texture Memory</a></h3>
                        <div class="body conbody">
                           <p class="p">The read-only texture memory space is cached. Therefore, a texture
                              fetch costs one device memory read only on a cache miss; otherwise, it
                              just costs one read from the texture cache. The texture cache is
                              optimized for 2D spatial locality, so threads of the same warp that
                              read texture addresses that are close together will achieve best
                              performance. Texture memory is also designed for streaming fetches with
                              a constant latency; that is, a cache hit reduces DRAM bandwidth demand,
                              but not fetch latency.
                           </p>
                           <p class="p">In certain addressing situations, reading device memory through
                              texture fetching can be an advantageous alternative to reading device
                              memory from global or constant memory.
                           </p>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="additional-texture-capabilities"><a name="additional-texture-capabilities" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#additional-texture-capabilities" name="additional-texture-capabilities" shape="rect">9.2.4.1.&nbsp;Additional Texture Capabilities</a></h3>
                           <div class="body conbody">
                              <p class="p">If textures are fetched using <samp class="ph codeph">tex1D()</samp>,<samp class="ph codeph">
                                    tex2D()</samp>, or <samp class="ph codeph">tex3D()</samp> rather than
                                 <samp class="ph codeph">tex1Dfetch()</samp>, the hardware provides other capabilities
                                 that might be useful for some applications such as image processing, as
                                 shown in <a class="xref" href="index.html#additional-texture-capabilities__useful-features-tex1D-tex2D-tex3D-fetches" shape="rect">Table 4</a>.
                              </p>
                              <div class="tablenoborder"><a name="additional-texture-capabilities__useful-features-tex1D-tex2D-tex3D-fetches" shape="rect">
                                    <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="additional-texture-capabilities__useful-features-tex1D-tex2D-tex3D-fetches" class="table" frame="border" border="1" rules="all">
                                    <caption><span class="tablecap">Table 4. Useful Features for tex1D(), tex2D(), and tex3D() Fetches</span></caption>
                                    <thead class="thead" align="left">
                                       <tr class="row">
                                          <th class="entry" valign="top" width="20%" id="d54e3345" rowspan="1" colspan="1">Feature</th>
                                          <th class="entry" valign="top" width="40%" id="d54e3348" rowspan="1" colspan="1">Use</th>
                                          <th class="entry" valign="top" width="40%" id="d54e3351" rowspan="1" colspan="1">Caveat</th>
                                       </tr>
                                    </thead>
                                    <tbody class="tbody">
                                       <tr class="row">
                                          <td class="entry" valign="top" width="20%" headers="d54e3345" rowspan="1" colspan="1">Filtering</td>
                                          <td class="entry" valign="top" width="40%" headers="d54e3348" rowspan="1" colspan="1">Fast, low-precision interpolation between
                                             texels
                                          </td>
                                          <td class="entry" valign="top" width="40%" headers="d54e3351" rowspan="1" colspan="1">Valid only if the texture reference returns
                                             floating-point data
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="20%" headers="d54e3345" rowspan="1" colspan="1">Normalized texture coordinates</td>
                                          <td class="entry" valign="top" width="40%" headers="d54e3348" rowspan="1" colspan="1">Resolution-independent coding</td>
                                          <td class="entry" valign="top" width="40%" headers="d54e3351" rowspan="1" colspan="1">None</td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="20%" headers="d54e3345" rowspan="1" colspan="1">Addressing modes </td>
                                          <td class="entry" valign="top" width="40%" headers="d54e3348" rowspan="1" colspan="1">Automatic handling of boundary cases<sup class="ph sup">1</sup></td>
                                          <td class="entry" valign="top" width="40%" headers="d54e3351" rowspan="1" colspan="1">Can be used only with normalized texture coordinates</td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" colspan="3" valign="top" headers="d54e3345 d54e3348 d54e3351" rowspan="1"><sup class="ph sup">1</sup> The
                                             automatic handling of boundary cases in the bottom row
                                             of <a class="xref" href="index.html#additional-texture-capabilities__useful-features-tex1D-tex2D-tex3D-fetches" shape="rect">Table 4</a>
                                             refers to how a texture coordinate is resolved when it
                                             falls outside the valid addressing range. There are two
                                             options: <dfn class="term">clamp</dfn> and <dfn class="term">wrap</dfn>. If
                                             <em class="ph i">x</em> is the coordinate and <em class="ph i">N</em> is the number
                                             of texels for a one-dimensional texture, then with
                                             clamp, <em class="ph i">x</em> is replaced by <em class="ph i">0</em> if <em class="ph i">x</em>
                                             &lt; 0 and by 1-1/<em class="ph i">N</em> if 1 <u class="ph u">&lt;</u><em class="ph i">x</em>.
                                             With wrap, <em class="ph i">x</em> is replaced by <em class="ph i">frac(x)</em> where
                                             <em class="ph i">frac(x) = x - floor(x)</em>. Floor returns the
                                             largest integer less than or equal to <em class="ph i">x</em>. So, in
                                             clamp mode where <em class="ph i">N</em> = 1, an <em class="ph i">x</em> of 1.3 is
                                             clamped to 1.0; whereas in wrap mode, it is converted
                                             to 0.3
                                          </td>
                                       </tr>
                                    </tbody>
                                 </table>
                              </div>
                              <p class="p">Within a kernel call, the texture cache is not kept coherent with
                                 respect to global memory writes, so texture fetches from addresses that
                                 have been written via global stores in the same kernel call return
                                 undefined data.  That is, a thread can safely read a memory location
                                 via texture if the location has been updated by a previous kernel call
                                 or memory copy, but not if it has been previously updated by the same
                                 thread or another thread within the same kernel call.
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="constant-memory"><a name="constant-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#constant-memory" name="constant-memory" shape="rect">9.2.5.&nbsp;Constant Memory</a></h3>
                        <div class="body conbody">
                           <p class="p">There is a total of 64 KB constant memory on a device. The constant memory space is cached. 
                              As a result, a read from constant memory costs one memory read from device memory only on a 
                              cache miss; otherwise, it just costs one read from the constant cache. Accesses to different 
                              addresses by threads within a warp are serialized, thus the cost scales linearly with the 
                              number of unique addresses read by all threads within a warp. As such, the constant cache is 
                              best when threads in the same warp accesses only a few distinct locations. If all threads
                              of a warp access the same location, then constant memory can be as fast as a register access.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="registers"><a name="registers" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#registers" name="registers" shape="rect">9.2.6.&nbsp;Registers</a></h3>
                        <div class="body conbody">
                           <p class="p">Generally, accessing a register consumes zero extra clock cycles per
                              instruction, but delays may occur due to register read-after-write
                              dependencies and register memory bank conflicts.
                           </p>
                           <p class="p">The latency of read-after-write dependencies is approximately 24
                              cycles, but this latency is completely hidden on multiprocessors that
                              have sufficient warps of threads concurrent per multiprocessor.  For devices of
                              compute capability 2.0, which have 32 CUDA cores per multiprocessor,
                              as many as 768 threads (24 warps) might be required to completely hide
                              latency, and so on for devices of higher compute capabilities.
                           </p>
                           <p class="p">The compiler and hardware thread scheduler will schedule
                              instructions as optimally as possible to avoid register memory bank
                              conflicts. They achieve the best results when the number of threads per
                              block is a multiple of 64.  Other than following this rule, an
                              application has no direct control over these bank conflicts. In
                              particular, there is no register-related reason to pack data into
                              <samp class="ph codeph">float4</samp> or <samp class="ph codeph">int4</samp> types.
                           </p>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="register-pressure"><a name="register-pressure" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#register-pressure" name="register-pressure" shape="rect">9.2.6.1.&nbsp;Register Pressure</a></h3>
                           <div class="body conbody">
                              <p class="p">Register pressure occurs when there are not enough registers
                                 available for a given task. Even though each multiprocessor contains
                                 thousands of 32-bit registers (see <cite class="cite">Features and Technical
                                    Specifications</cite> of the <cite class="cite">CUDA C Programming Guide</cite>),
                                 these are partitioned among concurrent threads. To prevent the compiler
                                 from allocating too many registers, use the
                                 <samp class="ph codeph">-maxrregcount=N</samp> compiler command-line option (see
                                 <a class="xref" href="index.html#nvcc" shape="rect">nvcc</a>) or the launch bounds kernel definition qualifier
                                 (see <cite class="cite">Execution Configuration</cite> of the <cite class="cite">CUDA C
                                    Programming Guide</cite>) to control the maximum number of registers to
                                 allocated per thread.
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="allocation"><a name="allocation" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#allocation" name="allocation" shape="rect">9.3.&nbsp;Allocation</a></h3>
                     <div class="body conbody">
                        <p class="p">Device memory allocation and de-allocation via
                           <samp class="ph codeph">cudaMalloc()</samp> and <samp class="ph codeph">cudaFree()</samp> are
                           expensive operations, so device memory should be reused and/or
                           sub-allocated by the application wherever possible to minimize the
                           impact of allocations on overall performance.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="numa-best-practices"><a name="numa-best-practices" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#numa-best-practices" name="numa-best-practices" shape="rect">9.4.&nbsp;NUMA Best Practices</a></h3>
                     <div class="body conbody">
                        <p dir="ltr" class="p" id="numa-best-practices__docs-internal-guid-1d9676b1-7fff-26f6-fa76-9631624df7b0"><a name="numa-best-practices__docs-internal-guid-1d9676b1-7fff-26f6-fa76-9631624df7b0" shape="rect">
                              <!-- --></a>Some recent
                           Linux distributions enable automatic NUMA balancing (or “<a class="xref" href="https://lwn.net/Articles/488709/" target="_blank" shape="rect"><u class="ph u">AutoNUMA</u></a>”) by default. In some instances, operations
                           performed by automatic NUMA balancing may degrade the performance of applications
                           running on NVIDIA GPUs. For optimal performance, users should manually tune the NUMA
                           characteristics of their application. 
                        </p>
                        <p dir="ltr" class="p">The optimal NUMA tuning will depend on the characteristics and desired
                           hardware affinities of each application and node, but in general applications
                           computing on NVIDIA GPUs are advised to choose a policy that disables automatic NUMA
                           balancing. For example, on IBM Newell POWER9 nodes (where the CPUs correspond to
                           NUMA nodes 0 and 8), use: 
                        </p><pre class="pre screen" xml:space="preserve">numactl --membind=0,8 </pre><p dir="ltr" class="p">to bind memory allocations to the CPUs.</p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="execution-configuration-optimizations"><a name="execution-configuration-optimizations" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#execution-configuration-optimizations" name="execution-configuration-optimizations" shape="rect">10.&nbsp;Execution Configuration Optimizations</a></h2>
                  <div class="body conbody">
                     <p class="p">One of the keys to good performance is to keep the multiprocessors
                        on the device as busy as possible. A device in which work is poorly
                        balanced across the multiprocessors will deliver suboptimal
                        performance. Hence, it's important to design your application to use
                        threads and blocks in a way that maximizes hardware utilization and to
                        limit practices that impede the free distribution of work. A key
                        concept in this effort is occupancy, which is explained in the
                        following sections.
                     </p>
                     <p class="p">Hardware utilization can also be improved in some cases by designing
                        your application so that multiple, independent kernels can execute at
                        the same time.  Multiple kernels executing at the same time is known as
                        concurrent kernel execution. Concurrent kernel execution is described
                        below.
                     </p>
                     <p class="p">Another important concept is the management of system resources
                        allocated for a particular task. How to manage this resource
                        utilization is discussed in the final sections of this chapter.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="occupancy"><a name="occupancy" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#occupancy" name="occupancy" shape="rect">10.1.&nbsp;Occupancy</a></h3>
                     <div class="body conbody">
                        <p class="p">Thread instructions are executed sequentially in CUDA, and, as a
                           result, executing other warps when one warp is paused or stalled is the
                           only way to hide latencies and keep the hardware busy. Some metric
                           related to the number of active warps on a multiprocessor is therefore
                           important in determining how effectively the hardware is kept busy.
                           This metric is <dfn class="term">occupancy</dfn>.
                        </p>
                        <p class="p">Occupancy is the ratio of the number of active warps per
                           multiprocessor to the maximum number of possible active warps. (To
                           determine the latter number, see the <samp class="ph codeph">deviceQuery</samp> CUDA
                           Sample or refer to <cite class="cite">Compute
                              Capabilities</cite> in the <cite class="cite">CUDA C Programming Guide</cite>.)
                           Another way to view occupancy is the percentage of the hardware's
                           ability to process warps that is actively in use.
                        </p>
                        <p class="p">Higher occupancy does not always equate to higher performance-there
                           is a point above which additional occupancy does not improve
                           performance. However, low occupancy always interferes with the ability
                           to hide memory latency, resulting in performance degradation.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="calculating-occupancy"><a name="calculating-occupancy" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#calculating-occupancy" name="calculating-occupancy" shape="rect">10.1.1.&nbsp;Calculating Occupancy</a></h3>
                        <div class="body conbody">
                           <p class="p">One of several factors that determine occupancy is register
                              availability.  Register storage enables threads to keep local variables
                              nearby for low-latency access. However, the set of registers (known as
                              the <dfn class="term">register file</dfn>) is a limited commodity that all threads
                              resident on a multiprocessor must share. Registers are allocated to an
                              entire block all at once. So, if each thread block uses many registers,
                              the number of thread blocks that can be resident on a multiprocessor is
                              reduced, thereby lowering the occupancy of the multiprocessor. The
                              maximum number of registers per thread can be set manually at
                              compilation time per-file using the <samp class="ph codeph">-maxrregcount</samp>
                              option or per-kernel using the <samp class="ph codeph">__launch_bounds__</samp>
                              qualifier (see <a class="xref" href="index.html#register-pressure" shape="rect">Register Pressure</a>).
                           </p>
                           <p class="p">For purposes of calculating occupancy, the number of registers used
                              by each thread is one of the key factors.  For example, devices with
                              <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                 capability</a> 1.1 have 8,192 32-bit registers per
                              multiprocessor and can have a maximum of 768 simultaneous threads
                              resident (24 warps x 32 threads per warp). This means that in one of
                              these devices, for a multiprocessor to have 100% occupancy, each thread
                              can use at most 10 registers.  However, this approach of determining
                              how register count affects occupancy does not take into account the
                              register allocation granularity. For example, on a device of compute
                              capability 1.1, a kernel with 128-thread blocks using 12 registers per
                              thread results in an occupancy of 83% with 5 active 128-thread blocks
                              per multi-processor, whereas a kernel with 256-thread blocks using the
                              same 12 registers per thread results in an occupancy of 66% because
                              only two 256-thread blocks can reside on a multiprocessor. Furthermore,
                              register allocations are rounded up to the nearest 256 registers per
                              block on devices with compute capability 1.1.
                           </p>
                           <p class="p">The number of registers available, the maximum number of
                              simultaneous threads resident on each multiprocessor, and the register
                              allocation granularity vary over different compute capabilities.
                              Because of these nuances in register allocation and the fact that a
                              multiprocessor's shared memory is also partitioned between resident
                              thread blocks, the exact relationship between register usage and
                              occupancy can be difficult to determine. The <samp class="ph codeph">--ptxas
                                 options=v</samp> option of <samp class="ph codeph">nvcc</samp> details the number
                              of registers used per thread for each kernel. See <cite class="cite">Hardware
                                 Multithreading</cite> of the <cite class="cite">CUDA C Programming Guide</cite> for
                              the register allocation formulas for devices of various compute
                              capabilities and <cite class="cite">Features and Technical Specifications</cite> of
                              the <cite class="cite">CUDA C Programming Guide</cite> for the total number of
                              registers available on those devices. Alternatively, NVIDIA provides an
                              occupancy calculator in the form of an Excel spreadsheet that enables
                              developers to hone in on the optimal balance and to test different
                              possible scenarios more easily. This spreadsheet, shown in <a class="xref" href="index.html#calculating-occupancy__cuda-occupancy-calculator-usage-project-gpu-multi-occupancy" shape="rect">Figure 11</a>,
                              is called <samp class="ph codeph">CUDA_Occupancy_Calculator.xls</samp> and is located
                              in the tools subdirectory of the CUDA Toolkit installation.
                           </p>
                           <div class="fig fignone" id="calculating-occupancy__cuda-occupancy-calculator-usage-project-gpu-multi-occupancy"><a name="calculating-occupancy__cuda-occupancy-calculator-usage-project-gpu-multi-occupancy" shape="rect">
                                 <!-- --></a><span class="figcap">Figure 11. Using the CUDA Occupancy Calculator to project GPU
                                 multiprocessor occupancy</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/using-cuda-occupancy-calculator-usage.png" alt="Using the CUDA Occupancy Calculator to project GPU                          multiprocessor occupancy."></img></div><br clear="none"></br></div>
                           <p class="p">In addition to the calculator spreadsheet, occupancy can be
                              determined using the NVIDIA Visual Profiler's Achieved Occupancy
                              metric. The Visual Profiler also calculates occupancy as part of the
                              Multiprocessor stage of application analysis.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="concurrent-kernel-execution"><a name="concurrent-kernel-execution" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#concurrent-kernel-execution" name="concurrent-kernel-execution" shape="rect">10.2.&nbsp;Concurrent Kernel Execution</a></h3>
                     <div class="body conbody">
                        <div class="section">
                           <p class="p">As described in <a class="xref" href="index.html#asynchronous-transfers-and-overlapping-transfers-with-computation" shape="rect">Asynchronous and Overlapping Transfers with Computation</a>,
                              CUDA streams can be used to overlap kernel execution with data
                              transfers. On devices that are capable of concurrent kernel execution,
                              streams can also be used to execute multiple kernels simultaneously to
                              more fully take advantage of the device's multiprocessors. Whether a
                              device has this capability is indicated by the
                              <samp class="ph codeph">concurrentKernels</samp> field of the
                              <samp class="ph codeph">cudaDeviceProp</samp> structure (or listed in the output of
                              the <samp class="ph codeph">deviceQuery</samp> CUDA Sample). Non-default streams
                              (streams other than stream 0) are required for concurrent execution
                              because kernel calls that use the default stream begin only after all
                              preceding calls on the device (in any stream) have completed, and no
                              operation on the device (in any stream) commences until they are
                              finished.
                           </p>
                           <p class="p">The following example illustrates the basic technique. Because
                              <samp class="ph codeph">kernel1</samp> and <samp class="ph codeph">kernel2</samp> are executed in
                              different, non-default streams, a capable device can execute the
                              kernels at the same time.
                           </p><pre xml:space="preserve">cudaStreamCreate(&amp;stream1);
cudaStreamCreate(&amp;stream2);
kernel1<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>grid, block, 0, stream1<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(data_1);
kernel2<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>grid, block, 0, stream2<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(data_2);</pre></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="multiple-contexts"><a name="multiple-contexts" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#multiple-contexts" name="multiple-contexts" shape="rect">10.3.&nbsp;Multiple contexts</a></h3>
                     <div class="body conbody">
                        <p class="p">CUDA work occurs within a process space for a particular GPU known
                           as a <dfn class="term">context</dfn>. The context encapsulates kernel launches
                           and memory allocations for that GPU as well as supporting constructs
                           such as the page tables. The context is explicit in the CUDA Driver
                           API but is entirely implicit in the CUDA Runtime API, which creates
                           and manages contexts automatically.
                        </p>
                        <p class="p">With the CUDA Driver API, a CUDA application process can potentially
                           create more than one context for a given GPU. If multiple CUDA
                           application processes access the same GPU concurrently, this almost
                           always implies multiple contexts, since a context is tied to a
                           particular host process unless CUDA Multi-Process Service is in
                           use.
                        </p>
                        <p class="p">While multiple contexts (and their associated resources such as
                           global memory allocations) can be allocated concurrently on a given
                           GPU, only one of these contexts can execute work at any given moment on
                           that GPU; contexts sharing the same GPU are time-sliced. Creating
                           additional contexts incurs memory overhead for per-context data and
                           time overhead for context switching. Furthermore, the need for context
                           switching can reduce utilization when work from several contexts could
                           otherwise execute concurrently (see also <a class="xref" href="index.html#concurrent-kernel-execution" shape="rect">Concurrent Kernel Execution</a>).
                        </p>
                        <p class="p">Therefore, it is best to avoid multiple contexts per GPU within the
                           same CUDA application. To assist with this, the CUDA Driver API
                           provides methods to access and manage a special context on each GPU
                           called the <dfn class="term">primary context</dfn>. These are the same contexts
                           used implicitly by the CUDA Runtime when there is not already a current
                           context for a thread.
                        </p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// When initializing the program/library</span>
CUcontext ctx;
cuDevicePrimaryCtxRetain(&amp;ctx, dev);

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// When the program/library launches work</span>
cuCtxPushCurrent(ctx);
kernel<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>...<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(...);
cuCtxPopCurrent(&amp;ctx);

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// When the program/library is finished with the context</span>
cuDevicePrimaryCtxRelease(dev);</pre><div class="note note"><span class="notetitle">Note:</span> NVIDIA-SMI can be used to configure a GPU for <a class="xref" href="http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-modes" target="_blank" shape="rect">exclusive process mode</a>, which
                           limits the number of contexts per GPU to one. This context can be
                           current to as many threads as desired within the creating process, and
                           <samp class="ph codeph">cuDevicePrimaryCtxRetain</samp> will fail if a non-primary
                           context that was created with the CUDA driver API already exists on the
                           device.
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="hiding-register-dependencies"><a name="hiding-register-dependencies" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#hiding-register-dependencies" name="hiding-register-dependencies" shape="rect">10.4.&nbsp;Hiding Register Dependencies</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Medium Priority:</strong> To hide latency arising from register
                           dependencies, maintain sufficient numbers of active threads per
                           multiprocessor (i.e., sufficient occupancy).
                        </div>
                        <p class="p">Register dependencies arise when an instruction uses a result stored
                           in a register written by an instruction before it. The latency on
                           current CUDA-enabled GPUs is approximately 24 cycles, so threads must
                           wait 24 cycles before using an arithmetic result. However, this latency
                           can be completely hidden by the execution of threads in other warps.
                           See <a class="xref" href="index.html#registers" shape="rect">Registers</a> for details.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="thread-and-block-heuristics"><a name="thread-and-block-heuristics" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#thread-and-block-heuristics" name="thread-and-block-heuristics" shape="rect">10.5.&nbsp;Thread and Block Heuristics</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Medium Priority:</strong> The number of threads per block should be
                           a multiple of 32 threads, because this provides optimal computing
                           efficiency and facilitates coalescing.
                        </div>
                        <p class="p">The dimension and size of blocks per grid and the dimension and size
                           of threads per block are both important factors. The multidimensional
                           aspect of these parameters allows easier mapping of multidimensional
                           problems to CUDA and does not play a role in performance. As a result,
                           this section discusses size but not dimension.
                        </p>
                        <p class="p">Latency hiding and occupancy depend on the number of active warps
                           per multiprocessor, which is implicitly determined by the execution
                           parameters along with resource (register and shared memory)
                           constraints. Choosing execution parameters is a matter of striking a
                           balance between latency hiding (occupancy) and resource
                           utilization.
                        </p>
                        <p class="p">Choosing the execution configuration parameters should be done in
                           tandem; however, there are certain heuristics that apply to each
                           parameter individually.  When choosing the first execution
                           configuration parameter-the number of blocks per grid, or <dfn class="term">grid
                              size</dfn> - the primary concern is keeping the entire GPU busy. The
                           number of blocks in a grid should be larger than the number of
                           multiprocessors so that all multiprocessors have at least one block to
                           execute. Furthermore, there should be multiple active blocks per
                           multiprocessor so that blocks that aren't waiting for a
                           <samp class="ph codeph">__syncthreads()</samp> can keep the hardware busy. This
                           recommendation is subject to resource availability; therefore, it
                           should be determined in the context of the second execution parameter -
                           the number of threads per block, or <dfn class="term">block size</dfn> - as well
                           as shared memory usage. To scale to future devices, the number of
                           blocks per kernel launch should be in the thousands.
                        </p>
                        <p class="p">When choosing the block size, it is important to remember that
                           multiple concurrent blocks can reside on a multiprocessor, so occupancy
                           is not determined by block size alone. In particular, a larger block
                           size does not imply a higher occupancy. For example, on a device of
                           compute capability 1.1 or lower, a kernel with a maximum block size of
                           512 threads results in an occupancy of 66 percent because the maximum
                           number of threads per multiprocessor on such a device is 768. Hence,
                           only a single block can be active per multiprocessor. However, a kernel
                           with 256 threads per block on such a device can result in 100 percent
                           occupancy with three resident active blocks.
                        </p>
                        <p class="p">As mentioned in <a class="xref" href="index.html#occupancy" shape="rect">Occupancy</a>, higher
                           occupancy does not always equate to better performance. For example,
                           improving occupancy from 66 percent to 100 percent generally does not
                           translate to a similar increase in performance. A lower occupancy
                           kernel will have more registers available per thread than a higher
                           occupancy kernel, which may result in less register spilling to local
                           memory. Typically, once an occupancy of 50 percent has been reached,
                           additional increases in occupancy do not translate into improved
                           performance. It is in some cases possible to fully cover latency with
                           even fewer warps, notably via instruction-level parallelism (ILP); for
                           discussion, see <a class="xref" href="http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf" target="_blank" shape="rect">http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf</a>.
                        </p>
                        <p class="p">There are many such factors involved in selecting block size, and
                           inevitably some experimentation is required. However, a few rules of
                           thumb should be followed:
                        </p>
                        <ul class="ul">
                           <li class="li">Threads per block should be a multiple of warp size to avoid wasting
                              computation on under-populated warps and to facilitate coalescing.
                           </li>
                           <li class="li">A minimum of 64 threads per block should be used, and only if there are
                              multiple concurrent blocks per multiprocessor.
                           </li>
                           <li class="li">Between 128 and 256 threads per block is a better choice and a good
                              initial range for experimentation with different block sizes.
                           </li>
                           <li class="li">Use several (3 to 4) smaller thread blocks rather than one large thread
                              block per multiprocessor if latency affects performance. This is
                              particularly beneficial to kernels that frequently call
                              <samp class="ph codeph">__syncthreads()</samp>.
                           </li>
                        </ul>
                        <p class="p">Note that when a thread block allocates more registers than are
                           available on a multiprocessor, the kernel launch fails, as it will when
                           too much shared memory or too many threads are requested.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="effects-of-shared-memory"><a name="effects-of-shared-memory" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#effects-of-shared-memory" name="effects-of-shared-memory" shape="rect">10.6.&nbsp;Effects of Shared Memory</a></h3>
                     <div class="body conbody">
                        <p class="p">Shared memory can be helpful in several situations, such as helping
                           to coalesce or eliminate redundant access to global memory.  However,
                           it also can act as a constraint on occupancy. In many cases, the amount
                           of shared memory required by a kernel is related to the block size that
                           was chosen, but the mapping of threads to shared memory elements does
                           not need to be one-to-one.  For example, it may be desirable to use a
                           32x32 element shared memory array in a kernel, but because the maximum
                           number of threads per block is 512, it is not possible to launch a
                           kernel with 32x32 threads per block.  In such cases, kernels with 32x16
                           or 32x8 threads can be launched with each thread processing two or four
                           elements, respectively, of the shared memory array. The approach of
                           using a single thread to process multiple elements of a shared memory
                           array can be beneficial even if limits such as threads per block are
                           not an issue. This is because some operations common to each element
                           can be performed by the thread once, amortizing the cost over the
                           number of shared memory elements processed by the thread.
                        </p>
                        <p class="p">A useful technique to determine the sensitivity of performance to
                           occupancy is through experimentation with the amount of dynamically
                           allocated shared memory, as specified in the third parameter of the
                           execution configuration.  By simply increasing this parameter (without
                           modifying the kernel), it is possible to effectively reduce the
                           occupancy of the kernel and measure its effect on performance.
                        </p>
                        <p class="p">As mentioned in the previous section, once an occupancy of more than
                           50 percent has been reached, it generally does not pay to optimize
                           parameters to obtain higher occupancy ratios. The previous technique
                           can be used to determine whether such a plateau has been reached.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="instruction-optimization"><a name="instruction-optimization" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#instruction-optimization" name="instruction-optimization" shape="rect">11.&nbsp;Instruction Optimization</a></h2>
                  <div class="body conbody">
                     <p class="p">Awareness of how instructions are executed often permits low-level
                        optimizations that can be useful, especially in code that is run
                        frequently (the so-called hot spot in a program). Best practices
                        suggest that this optimization be performed after all higher-level
                        optimizations have been completed. 
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="arithmetic-instructions"><a name="arithmetic-instructions" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#arithmetic-instructions" name="arithmetic-instructions" shape="rect">11.1.&nbsp;Arithmetic Instructions</a></h3>
                     <div class="body conbody">
                        <p class="p">Single-precision floats provide the best performance, and their use
                           is highly encouraged. The throughput of individual arithmetic operations
                           is detailed in the <cite class="cite">CUDA C Programming Guide</cite>.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="division-and-modulo-operations"><a name="division-and-modulo-operations" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#division-and-modulo-operations" name="division-and-modulo-operations" shape="rect">11.1.1.&nbsp;Division Modulo Operations</a></h3>
                        <div class="body conbody">
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Low Priority:</strong> Use shift operations to avoid expensive
                              division and modulo calculations.
                           </div>
                           <p class="p">Integer division and modulo operations are particularly costly and
                              should be avoided or replaced with bitwise operations whenever
                              possible: If
                              
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mstyle displaystyle="true">
                                    <mi> n </mi>
                                 </mstyle>
                              </math>
                              is a power of 2, (
                              
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mstyle displaystyle="true">
                                    <mi> i </mi>
                                    <mo> / </mo>
                                    <mi> n </mi>
                                 </mstyle>
                              </math>
                              ) is equivalent to (
                              
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mstyle displaystyle="true">
                                    <mi> i </mi>
                                    <mo> ≫ </mo>
                                    <mi> log2 </mi>
                                    <mfenced>
                                       <mrow>
                                          <mi> n </mi>
                                       </mrow>
                                    </mfenced>
                                 </mstyle>
                              </math>
                              ) and (
                              
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mstyle displaystyle="true">
                                    <mi> i </mi>
                                    <mo> % </mo>
                                    <mi> n </mi>
                                 </mstyle>
                              </math>
                              ) is equivalent to (
                              
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <mstyle displaystyle="true">
                                    <mi> i </mi>
                                    <mo> &amp; </mo>
                                    <mfenced>
                                       <mrow>
                                          <mi> n </mi>
                                          <mo> - </mo>
                                          <mn> 1 </mn>
                                       </mrow>
                                    </mfenced>
                                 </mstyle>
                              </math>
                              ).
                           </p>
                           <p class="p">The compiler will perform these conversions if n is literal. (For
                              further information, refer to <cite class="cite">Performance Guidelines</cite> in
                              the <cite class="cite">CUDA C Programming Guide</cite>).
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="reciprocal-square-root"><a name="reciprocal-square-root" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#reciprocal-square-root" name="reciprocal-square-root" shape="rect">11.1.2.&nbsp;Reciprocal Square Root</a></h3>
                        <div class="body conbody">
                           <p class="p">The reciprocal square root should always be invoked explicitly as
                              <samp class="ph codeph">rsqrtf()</samp> for single precision and
                              <samp class="ph codeph">rsqrt()</samp> for double precision. The compiler optimizes
                              <samp class="ph codeph">1.0f/sqrtf(x)</samp> into <samp class="ph codeph">rsqrtf()</samp> only when
                              this does not violate IEEE-754 semantics.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="other-arithmetic-instructions"><a name="other-arithmetic-instructions" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#other-arithmetic-instructions" name="other-arithmetic-instructions" shape="rect">11.1.3.&nbsp;Other Arithmetic Instructions</a></h3>
                        <div class="body conbody">
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Low Priority:</strong> Avoid automatic conversion of doubles to
                              floats.
                           </div>
                           <p class="p">The compiler must on occasion insert conversion instructions,
                              introducing additional execution cycles. This is the case for:
                           </p>
                           <ul class="ul">
                              <li class="li">Functions operating on <samp class="ph codeph">char</samp> or
                                 <samp class="ph codeph">short</samp> whose operands generally need to be
                                 converted to an <samp class="ph codeph">int</samp></li>
                              <li class="li">Double-precision floating-point constants (defined without any
                                 type suffix) used as input to single-precision floating-point
                                 computations
                              </li>
                           </ul>
                           <p class="p">The latter case can be avoided by using single-precision
                              floating-point constants, defined with an <samp class="ph codeph">f</samp> suffix
                              such as <samp class="ph codeph">3.141592653589793f</samp>, <samp class="ph codeph">1.0f</samp>,
                              <samp class="ph codeph">0.5f</samp>. This suffix has accuracy implications in
                              addition to its ramifications on performance. The effects on accuracy
                              are discussed in <a class="xref" href="index.html#promotions-to-doubles-and-truncations-to-floats" shape="rect">Promotions to Doubles and Truncations to Floats</a>.
                              Note that this distinction is particularly important to performance on
                              devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                 capability</a> 2.x.
                           </p>
                           <p class="p">For single-precision code, use of the float type and the
                              single-precision math functions are highly recommended. When compiling
                              for devices without native double-precision support such as devices of
                              compute capability 1.2 and earlier, each double-precision
                              floating-point variable is converted to single-precision floating-point
                              format (but retains its size of 64 bits) and double-precision
                              arithmetic is demoted to single-precision arithmetic.
                           </p>
                           <p class="p">It should also be noted that the CUDA math library's complementary
                              error function, <samp class="ph codeph">erfcf()</samp>, is particularly fast with
                              full single-precision accuracy.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="exponentiation-small-fractions"><a name="exponentiation-small-fractions" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#exponentiation-small-fractions" name="exponentiation-small-fractions" shape="rect">11.1.4.&nbsp;Exponentiation With Small Fractional Arguments</a></h3>
                        <div class="body conbody">
                           <p class="p">For some fractional exponents, exponentiation can be accelerated significantly compared to the use of <samp class="ph codeph">pow()</samp> by using square roots,
                              cube roots, and their inverses. For those exponentiations where the exponent is not exactly representable as a floating-point
                              number, such as 1/3,
                              this can also provide much more accurate results, as use of <samp class="ph codeph">pow()</samp> magnifies the initial representational error.
                           </p>
                           <p class="p">The formulas in the table below are valid for <samp class="ph codeph">x &gt;= 0, x != -0</samp>, that is, <samp class="ph codeph">signbit(x) == 0</samp>.
                           </p>
                           <div class="tablenoborder"><a name="exponentiation-small-fractions__exponentiation-small-fractions-double" shape="rect">
                                 <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="exponentiation-small-fractions__exponentiation-small-fractions-double" class="table" frame="border" border="1" rules="all">
                                 <caption><span class="tablecap">Table 5. Formulae for exponentiation by small fractions</span></caption>
                                 <thead class="thead" align="left">
                                    <tr class="row">
                                       <th class="entry" valign="top" width="20%" id="d54e4167" rowspan="1" colspan="1">Computation</th>
                                       <th class="entry" valign="top" width="40%" id="d54e4170" rowspan="1" colspan="1">Formula</th>
                                       <th class="entry" valign="top" width="20%" id="d54e4173" rowspan="1" colspan="1">ulps (double) <sup class="ph sup">1</sup></th>
                                       <th class="entry" valign="top" width="20%" id="d54e4178" rowspan="1" colspan="1">ulps (single) <sup class="ph sup">2</sup></th>
                                    </tr>
                                 </thead>
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">1/9</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rcbrt(rcbrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/1</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-1/9</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = cbrt(rcbrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/1</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">1/6</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rcbrt(rsqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/2</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-1/6</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rcbrt(sqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/2</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">1/4</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rsqrt(rsqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/2</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-1/4</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = sqrt(rsqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">1/3</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = cbrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/1</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-1/3</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rcbrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/1</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">1/2</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = sqrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">0</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">0/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-1/2</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rsqrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/2</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">2/3</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = cbrt(x); r = r*r</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">3/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-2/3</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rcbrt(x); r = r*r</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">3/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">3/4</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = sqrt(x); r = r*sqrt(r)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/6</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-3/4</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = rsqrt(x); r = r*sqrt(r)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">4/5</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">7/6</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = x*rcbrt(rsqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/2</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-7/6</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = (1/x) * rcbrt(sqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">3/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">5/4</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = x*rsqrt(rsqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">3/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-5/4</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = (1/x)*sqrt(rsqrt(x))</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">3/5</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">4/3</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = x*cbrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/2</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-4/3</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = (1/x)*rcbrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">2/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">3/2</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = x*sqrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">1</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">1/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="20%" headers="d54e4167" rowspan="1" colspan="1">x<sup class="ph sup">-3/2</sup></td>
                                       <td class="entry" valign="top" width="40%" headers="d54e4170" rowspan="1" colspan="1"><samp class="ph codeph">r = (1/x)*sqrt(x)</samp></td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4173" rowspan="1" colspan="1">2</td>
                                       <td class="entry" valign="top" width="20%" headers="d54e4178" rowspan="1" colspan="1">3/3</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" colspan="4" valign="top" headers="d54e4167 d54e4170 d54e4173 d54e4178" rowspan="1"><sup class="ph sup">1</sup> Compared to correctly rounded result
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" colspan="4" valign="top" headers="d54e4167 d54e4170 d54e4173 d54e4178" rowspan="1">
                                          <p class="p"><sup class="ph sup">2</sup>Compared to correctly rounded result
                                          </p>
                                          <p class="p">1st: -prec-sqrt=true -prec-div=true</p>
                                          <p class="p">2nd: -prec-sqrt=false -prec-div=false</p>
                                       </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="math-libraries"><a name="math-libraries" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#math-libraries" name="math-libraries" shape="rect">11.1.5.&nbsp;Math Libraries</a></h3>
                        <div class="body conbody">
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Medium Priority:</strong> Use the fast math library whenever speed
                              trumps precision.
                           </div>
                           <p class="p">Two types of runtime math operations are supported. They can be
                              distinguished by their names: some have names with prepended
                              underscores, whereas others do not (e.g.,
                              <samp class="ph codeph">__functionName()</samp> versus
                              <samp class="ph codeph">functionName()</samp>). Functions following the
                              <samp class="ph codeph">__functionName()</samp> naming convention map directly to the
                              hardware level. They are faster but provide somewhat lower accuracy
                              (e.g., <samp class="ph codeph">__sinf(x)</samp> and <samp class="ph codeph">__expf(x)</samp>).
                              Functions following <samp class="ph codeph">functionName()</samp> naming convention
                              are slower but have higher accuracy (e.g., <samp class="ph codeph">sinf(x)</samp> and
                              <samp class="ph codeph">expf(x)</samp>). The throughput of
                              <samp class="ph codeph">__sinf(x)</samp>, <samp class="ph codeph">__cosf(x)</samp>, and<samp class="ph codeph">
                                 __expf(x)</samp> is much greater than that of
                              <samp class="ph codeph">sinf(x)</samp>, <samp class="ph codeph">cosf(x)</samp>, and
                              <samp class="ph codeph">expf(x)</samp>. The latter become even more expensive (about
                              an order of magnitude slower) if the magnitude of the argument
                              <samp class="ph codeph">x</samp> needs to be reduced. Moreover, in such cases, the
                              argument-reduction code uses local memory, which can affect performance
                              even more because of the high latency of local memory. More details are
                              available in the <em class="ph i">CUDA C Programming Guide</em>.
                           </p>
                           <p class="p">Note also that whenever sine and cosine of the same argument are
                              computed, the <samp class="ph codeph">sincos</samp> family of instructions should be
                              used to optimize performance:
                           </p>
                           <ul class="ul">
                              <li class="li"><samp class="ph codeph">__sincosf()</samp> for single-precision fast math
                                 (see next paragraph)
                              </li>
                              <li class="li"><samp class="ph codeph">sincosf()</samp> for regular single-precision
                              </li>
                              <li class="li"><samp class="ph codeph">sincos()</samp> for double precision
                              </li>
                           </ul>
                           <p class="p">The <samp class="ph codeph">-use_fast_math</samp> compiler option of
                              <samp class="ph codeph">nvcc</samp> coerces every <samp class="ph codeph">functionName()</samp>
                              call to the equivalent <samp class="ph codeph">__functionName()</samp> call. It also
                              disables single-precision denormal support and lowers the precision of
                              single-precision division in general. This is an aggressive
                              optimization that can both reduce numerical accuracy and alter special
                              case handling. A more robust approach is to selectively introduce calls
                              to fast intrinsic functions only if merited by performance gains and
                              where altered behavior can be tolerated. Note this switch is effective
                              only on single-precision floating point.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Medium Priority:</strong> Prefer faster, more specialized math
                              functions over slower, more general ones when possible.
                           </div>
                           <p class="p">For small integer powers (e.g., <em class="ph i">x<sup class="ph sup">2</sup></em> or
                              <em class="ph i">x<sup class="ph sup">3</sup></em>), explicit multiplication is almost certainly
                              faster than the use of general exponentiation routines such as
                              <samp class="ph codeph">pow()</samp>. While compiler optimization improvements
                              continually seek to narrow this gap, explicit multiplication (or the
                              use of an equivalent purpose-built inline function or macro) can have a
                              significant advantage. This advantage is increased when several powers
                              of the same base are needed (e.g., where both <em class="ph i">x<sup class="ph sup">2</sup></em> and
                              <em class="ph i">x<sup class="ph sup">5</sup></em> are calculated in close proximity), as this aids
                              the compiler in its common sub-expression elimination (CSE)
                              optimization.
                           </p>
                           <p class="p">For exponentiation using base 2 or 10, use the functions
                              <samp class="ph codeph">exp2()</samp> or <samp class="ph codeph">expf2()</samp> and
                              <samp class="ph codeph">exp10()</samp> or <samp class="ph codeph">expf10()</samp> rather than the
                              functions <samp class="ph codeph">pow()</samp> or <samp class="ph codeph">powf()</samp>. Both
                              <samp class="ph codeph">pow()</samp> and <samp class="ph codeph">powf()</samp> are heavy-weight
                              functions in terms of register pressure and instruction count due to
                              the numerous special cases arising in general exponentiation and the
                              difficulty of achieving good accuracy across the entire ranges of the
                              base and the exponent. The functions <samp class="ph codeph">exp2()</samp>,
                              <samp class="ph codeph">exp2f()</samp>, <samp class="ph codeph">exp10()</samp>, and
                              <samp class="ph codeph">exp10f()</samp>, on the other hand, are similar to
                              <samp class="ph codeph">exp()</samp> and <samp class="ph codeph">expf()</samp> in terms of
                              performance, and can be as much as ten times faster than their
                              <samp class="ph codeph">pow()</samp>/<samp class="ph codeph">powf()</samp> equivalents.
                           </p>
                           <p class="p">For exponentiation with an exponent of 1/3, use the
                              <samp class="ph codeph">cbrt()</samp> or <samp class="ph codeph">cbrtf()</samp> function rather
                              than the generic exponentiation functions <samp class="ph codeph">pow()</samp> or
                              <samp class="ph codeph">powf()</samp>, as the former are significantly faster than
                              the latter. Likewise, for exponentation with an exponent of -1/3, use
                              <samp class="ph codeph">rcbrt()</samp> or <samp class="ph codeph">rcbrtf()</samp>.
                           </p>
                           <p class="p">Replace <samp class="ph codeph">sin(π*&lt;expr&gt;)</samp> with
                              <samp class="ph codeph">sinpi(&lt;expr&gt;)</samp>,
                              <samp class="ph codeph">cos(π*&lt;expr&gt;)</samp> with
                              <samp class="ph codeph">cospi(&lt;expr&gt;)</samp>, and
                              <samp class="ph codeph">sincos(π*&lt;expr&gt;)</samp> with
                              <samp class="ph codeph">sincospi(&lt;expr&gt;)</samp>. This is advantageous with
                              regard to both accuracy and performance. As a particular example, to
                              evaluate the sine function in degrees instead of radians, use
                              <samp class="ph codeph">sinpi(x/180.0)</samp>. Similarly, the single-precision
                              functions <samp class="ph codeph">sinpif()</samp>, <samp class="ph codeph">cospif()</samp>, and
                              <samp class="ph codeph">sincospif()</samp> should replace calls to
                              <samp class="ph codeph">sinf()</samp>, <samp class="ph codeph">cosf()</samp>, and
                              <samp class="ph codeph">sincosf()</samp> when the function argument is of the form
                              <samp class="ph codeph">π*&lt;expr&gt;</samp>. (The performance advantage
                              <samp class="ph codeph">sinpi()</samp> has over <samp class="ph codeph">sin()</samp> is due to
                              simplified argument reduction; the accuracy advantage is because
                              <samp class="ph codeph">sinpi()</samp> multiplies by <samp class="ph codeph">π</samp> only
                              implicitly, effectively using an infinitely precise mathematical
                              <samp class="ph codeph">π</samp> rather than a single- or double-precision
                              approximation thereof.)
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="precision-related-compiler-flags"><a name="precision-related-compiler-flags" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#precision-related-compiler-flags" name="precision-related-compiler-flags" shape="rect">11.1.6.&nbsp;Precision-related Compiler Flags</a></h3>
                        <div class="body conbody">
                           <p class="p">By default, the <samp class="ph codeph">nvcc</samp> compiler generates
                              IEEE-compliant code for devices of <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute
                                 capability</a> 2.x, but
                              it also provides options to generate code that somewhat less accurate
                              but faster and that is closer to the code generated for earlier
                              devices:
                           </p>
                           <ul class="ul">
                              <li class="li"><samp class="ph codeph">-ftz=true</samp> (denormalized numbers are flushed to
                                 zero)
                              </li>
                              <li class="li"><samp class="ph codeph">-prec-div=false</samp> (less precise division)
                              </li>
                              <li class="li"><samp class="ph codeph">-prec-sqrt=false</samp> (less precise square
                                 root)
                              </li>
                           </ul>
                           <p class="p">Another, more aggressive, option is <samp class="ph codeph">-use_fast_math</samp>,
                              which coerces every <samp class="ph codeph">functionName()</samp> call to the
                              equivalent <samp class="ph codeph">__functionName()</samp> call. This makes the code
                              run faster at the cost of diminished precision and accuracy. See <a class="xref" href="index.html#math-libraries" shape="rect">Math Libraries</a>.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="memory-instructions"><a name="memory-instructions" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#memory-instructions" name="memory-instructions" shape="rect">11.2.&nbsp;Memory Instructions</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> Minimize the use of global memory. Prefer
                           shared memory access where possible.
                        </div>
                        <p class="p">Memory instructions include any instruction that reads from or
                           writes to shared, local, or global memory. When accessing uncached
                           local or global memory, there are 400 to 600 clock cycles of memory
                           latency.
                        </p>
                        <p class="p">As an example, the assignment operator in the following sample code
                           has a high throughput, but, crucially, there is a latency of 400 to 600
                           clock cycles to read data from global memory:
                        </p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__shared__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> shared[32];
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> device[32]; 
shared[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = device[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x];</pre><p class="p">Much of this global memory latency can be hidden by the thread
                           scheduler if there are sufficient independent arithmetic instructions
                           that can be issued while waiting for the global memory access to
                           complete. However, it is best to avoid accessing global memory whenever
                           possible.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="control-flow"><a name="control-flow" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#control-flow" name="control-flow" shape="rect">12.&nbsp;Control Flow</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="branching-and-divergence"><a name="branching-and-divergence" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#branching-and-divergence" name="branching-and-divergence" shape="rect">12.1.&nbsp;Branching and Divergence</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> Avoid different execution paths within the
                           same warp.
                        </div>
                        <p class="p">Flow control instructions (<samp class="ph codeph">if</samp>,
                           <samp class="ph codeph">switch</samp>, <samp class="ph codeph">do</samp>, <samp class="ph codeph">for</samp>,
                           <samp class="ph codeph">while</samp>) can significantly affect the instruction
                           throughput by causing threads of the same warp to diverge; that is, to
                           follow different execution paths. If this happens, the different
                           execution paths must be executed separately; this increases the total number of
                           instructions executed for this warp.
                        </p>
                        <p class="p">To obtain best performance in cases where the control flow depends
                           on the thread ID, the controlling condition should be written so as to
                           minimize the number of divergent warps.
                        </p>
                        <p class="p">This is possible because the distribution of the warps across the
                           block is deterministic as mentioned in <cite class="cite">SIMT Architecture</cite>
                           of the <cite class="cite">CUDA C Programming Guide</cite>. A trivial example is when
                           the controlling condition depends only on (<samp class="ph codeph">threadIdx</samp> /
                           <samp class="ph codeph">WSIZE</samp>) where <samp class="ph codeph">WSIZE</samp> is the warp
                           size.
                        </p>
                        <p class="p">In this case, no warp diverges because the controlling condition is
                           perfectly aligned with the warps.
                        </p>
                        <p class="p">For branches including just a few instructions, warp divergence generally
                           results in marginal performance losses. For example, the compiler may use
                           predication to avoid an actual branch. Instead, all instructions are scheduled,
                           but a per-thread condition code or predicate controls which threads execute the
                           instructions. Threads with a false predicate do not write results, and
                           also do not evaluate addresses or read operands.
                        </p>
                        <p class="p">Starting with the Volta architecture, Independent Thread Scheduling
                           allows a warp to remain diverged outside of the data-dependent conditional block.
                           An explicit <samp class="ph codeph">__syncwarp()</samp> can be used to guarantee that
                           the warp has reconverged for subsequent instructions.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="branch-predication"><a name="branch-predication" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#branch-predication" name="branch-predication" shape="rect">12.2.&nbsp;Branch Predication</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Low Priority:</strong> Make it easy for the compiler to use branch
                           predication in lieu of loops or control statements.
                        </div>
                        <p class="p">Sometimes, the compiler may unroll loops or optimize out
                           <samp class="ph codeph">if</samp> or <samp class="ph codeph">switch</samp> statements by using
                           branch predication instead. In these cases, no warp can ever diverge.
                           The programmer can also control loop unrolling using
                        </p><pre xml:space="preserve">#pragma unroll</pre><p class="p">For more information on this pragma, refer to the <cite class="cite">CUDA C
                              Programming Guide</cite>.
                        </p>
                        <p class="p">When using branch predication, none of the instructions whose
                           execution depends on the controlling condition is skipped. Instead,
                           each such instruction is associated with a per-thread condition code or
                           predicate that is set to true or false according to the controlling
                           condition. Although each of these instructions is scheduled for
                           execution, only the instructions with a true predicate are actually
                           executed. Instructions with a false predicate do not write results, and
                           they also do not evaluate addresses or read operands.
                        </p>
                        <p class="p">The compiler replaces a branch instruction with predicated
                           instructions only if the number of instructions controlled by the
                           branch condition is less than or equal to a certain threshold: If the
                           compiler determines that the condition is likely to produce many
                           divergent warps, this threshold is 7; otherwise it is 4.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="loop-counters-signed-vs-unsigned"><a name="loop-counters-signed-vs-unsigned" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#loop-counters-signed-vs-unsigned" name="loop-counters-signed-vs-unsigned" shape="rect">12.3.&nbsp;Loop Counters Signed vs. Unsigned</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">Low Medium Priority:</strong> Use signed integers rather than
                           unsigned integers as loop counters.
                        </div>
                        <p class="p">In the C language standard, unsigned integer overflow semantics are
                           well defined, whereas signed integer overflow causes undefined results.
                           Therefore, the compiler can optimize more aggressively with signed
                           arithmetic than it can with unsigned arithmetic. This is of particular
                           note with loop counters: since it is common for loop counters to have
                           values that are always positive, it may be tempting to declare the
                           counters as unsigned. For slightly better performance, however, they
                           should instead be declared as signed.
                        </p>
                        <p class="p">For example, consider the following code:</p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (i = 0; i &lt; n; i++) {
    out[i] = in[offset + stride*i];
}</pre><p class="p">Here, the sub-expression <samp class="ph codeph">stride*i</samp> could overflow a
                           32-bit integer, so if <samp class="ph codeph">i</samp> is declared as unsigned, the
                           overflow semantics prevent the compiler from using some optimizations
                           that might otherwise have applied, such as strength reduction. If
                           instead <samp class="ph codeph">i</samp> is declared as signed, where the overflow
                           semantics are undefined, the compiler has more leeway to use these
                           optimizations.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="synchronizing-divergent-threads"><a name="synchronizing-divergent-threads" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#synchronizing-divergent-threads" name="synchronizing-divergent-threads" shape="rect">12.4.&nbsp;Synchronizing Divergent Threads in a Loop</a></h3>
                     <div class="body conbody">
                        <div class="note note"><span class="notetitle">Note:</span><strong class="ph b">High Priority:</strong> Avoid the use of
                           <samp class="ph codeph">__syncthreads()</samp> inside divergent code.
                        </div>
                        <p class="p">Synchronizing threads inside potentially divergent code (e.g., a
                           loop over an input array) can cause unanticipated errors. Care must be
                           taken to ensure that all threads are converged at the point where
                           <samp class="ph codeph">__syncthreads()</samp> is called. The following example
                           illustrates how to do this properly for 1D blocks:
                        </p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">unsigned</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> imax = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x * ((nelements + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x - 1)/ <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x);

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i = threadidx.x; i &lt; imax; i += <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (i &lt; nelements)
    {
        ...
    }

    __syncthreads();

    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (i &lt; nelements)
    {
        ...
    }
}</pre><p class="p">In this example, the loop has been carefully written to have the
                           same number of iterations for each thread, avoiding divergence
                           (<samp class="ph codeph">imax</samp> is the number of elements rounded up to a
                           multiple of the block size). Guards have been added inside the loop to
                           prevent out-of-bound accesses. At the point of the
                           <samp class="ph codeph">__syncthreads()</samp>, all threads are converged.
                        </p>
                        <p class="p">Similar care must be taken when invoking
                           <samp class="ph codeph">__syncthreads()</samp> from a device function called from
                           potentially divergent code. A straightforward method of solving this
                           issue is to call the device function from non-divergent code and pass a
                           <samp class="ph codeph">thread_active</samp> flag as a parameter to the device
                           function. This <samp class="ph codeph">thread_active</samp> flag would be used to
                           indicate which threads should participate in the computation inside the
                           device function, allowing all threads to participate in the
                           <samp class="ph codeph">__syncthreads()</samp>.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="deploying-cuda-applications"><a name="deploying-cuda-applications" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#deploying-cuda-applications" name="deploying-cuda-applications" shape="rect">13.&nbsp;Deploying CUDA Applications</a></h2>
                  <div class="body conbody">
                     <p class="p">Having completed the GPU acceleration of one or more components of
                        the application it is possible to compare the outcome with the original
                        expectation. Recall that the initial <dfn class="term">assess</dfn> step allowed
                        the developer to determine an upper bound for the potential speedup
                        attainable by accelerating given hotspots.
                     </p>
                     <p class="p">Before tackling other hotspots to improve the total speedup, the
                        developer should consider taking the partially parallelized
                        implementation and carry it through to production. This is important
                        for a number of reasons; for example, it allows the user to profit
                        from their investment as early as possible (the speedup may be partial
                        but is still valuable), and it minimizes risk for the developer and
                        the user by providing an evolutionary rather than revolutionary set of
                        changes to the application.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="understanding-programming-environment"><a name="understanding-programming-environment" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#understanding-programming-environment" name="understanding-programming-environment" shape="rect">14.&nbsp;Understanding the Programming Environment</a></h2>
                  <div class="body conbody">
                     <p class="p">With each generation of NVIDIA processors, new features are added to
                        the GPU that CUDA can leverage. Consequently, it's important to
                        understand the characteristics of the architecture.
                     </p>
                     <p class="p">Programmers should be aware of two version numbers. The first is the
                        <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute capability</a>, and the
                        second is the version number of the CUDA Runtime and CUDA Driver APIs.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cuda-compute-capability"><a name="cuda-compute-capability" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cuda-compute-capability" name="cuda-compute-capability" shape="rect">14.1.&nbsp;CUDA Compute Capability</a></h3>
                     <div class="body conbody">
                        <p class="p">The <dfn class="term">compute capability</dfn> describes the features of the
                           hardware and reflects the set of instructions supported by the device
                           as well as other specifications, such as the maximum number of threads
                           per block and the number of registers per multiprocessor. Higher
                           compute capability versions are supersets of lower (that is, earlier)
                           versions, so they are backward compatible.
                        </p>
                        <p class="p">The compute capability of the GPU in the device can be queried
                           programmatically as illustrated in the 
                           <samp class="ph codeph">deviceQuery</samp> CUDA Sample. The output for that program is
                           shown in <a class="xref" href="index.html#cuda-compute-capability__sample-cuda-configuration-data-reported-devicequery" shape="rect">Figure 12</a>.
                           This information is obtained by calling
                           <samp class="ph codeph">cudaGetDeviceProperties()</samp> and accessing the
                           information in the structure it returns.
                        </p>
                        <div class="fig fignone" id="cuda-compute-capability__sample-cuda-configuration-data-reported-devicequery"><a name="cuda-compute-capability__sample-cuda-configuration-data-reported-devicequery" shape="rect">
                              <!-- --></a><span class="figcap">Figure 12. Sample CUDA configuration data reported by deviceQuery</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/sample-cuda-configuration-data.png" alt="Sample CUDA configuration data reported by deviceQuery."></img></div><br clear="none"></br></div>
                        <p class="p">The major and minor revision numbers of the compute capability are
                           shown on the third and fourth lines of <a class="xref" href="index.html#cuda-compute-capability__sample-cuda-configuration-data-reported-devicequery" shape="rect">Figure 12</a>.
                           Device 0 of this system has compute capability 1.1.
                        </p>
                        <p class="p">More details about the compute capabilities of various GPUs are in
                           <cite class="cite">CUDA-Enabled GPUs</cite> and <cite class="cite">Compute Capabilities</cite> of
                           the <cite class="cite">CUDA C Programming Guide</cite>. In particular, developers
                           should note the number of multiprocessors on the device, the number of
                           registers and the amount of memory available, and any special
                           capabilities of the device.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="additional-hardware-data"><a name="additional-hardware-data" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#additional-hardware-data" name="additional-hardware-data" shape="rect">14.2.&nbsp;Additional Hardware Data</a></h3>
                     <div class="body conbody">
                        <p class="p">Certain hardware features are not described by the compute
                           capability. For example, the ability to overlap kernel execution with
                           asynchronous data transfers between the host and the device is
                           available on most but not all GPUs with <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute capability</a> 1.1. In such
                           cases, call <samp class="ph codeph">cudaGetDeviceProperties()</samp> to determine
                           whether the device is capable of a certain feature. For example, the
                           <samp class="ph codeph">asyncEngineCount</samp> field of the device property
                           structure indicates whether overlapping kernel execution and data
                           transfers is possible (and, if so, how many concurrent transfers are
                           possible); likewise, the <samp class="ph codeph">canMapHostMemory</samp> field
                           indicates whether zero-copy data transfers can be performed.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="which-compute-capability-to-target"><a name="which-compute-capability-to-target" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#which-compute-capability-to-target" name="which-compute-capability-to-target" shape="rect">14.3.&nbsp;Which Compute Capability Target</a></h3>
                     <div class="body conbody">
                        <p class="p">When in doubt about the <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute capability</a> of the hardware that will
                           be present at runtime, it is best to assume a compute capability of 2.0
                           as defined in the <cite class="cite">CUDA C Programming Guide</cite> section on
                           <cite class="cite">Technical and Feature Specifications</cite>.
                        </p>
                        <p class="p">To target specific versions of NVIDIA hardware and CUDA software,
                           use the <samp class="ph codeph">-arch</samp>, <samp class="ph codeph">-code</samp>, and
                           <samp class="ph codeph">-gencode</samp> options of <samp class="ph codeph">nvcc</samp>. Code that
                           uses the warp shuffle operation, for example, must be compiled
                           with <samp class="ph codeph">-arch=sm_30</samp> (or higher compute capability).
                        </p>
                        <p class="p">See <a class="xref" href="index.html#building-for-maximum-compatibility" shape="rect">Building for Maximum Compatibility</a>
                           for further discussion of the flags used for building code for multiple
                           generations of CUDA-capable device simultaneously.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cuda-runtime"><a name="cuda-runtime" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cuda-runtime" name="cuda-runtime" shape="rect">14.4.&nbsp;CUDA Runtime</a></h3>
                     <div class="body conbody">
                        <p class="p">The host runtime component of the CUDA software environment can be
                           used only by host functions. It provides functions to handle the
                           following:
                        </p>
                        <ul class="ul">
                           <li class="li">Device management</li>
                           <li class="li">Context management</li>
                           <li class="li">Memory management</li>
                           <li class="li">Code module management</li>
                           <li class="li">Execution control</li>
                           <li class="li">Texture reference management</li>
                           <li class="li">Interoperability with OpenGL and Direct3D</li>
                        </ul>
                        <p class="p">As compared to the lower-level CUDA Driver API, the CUDA Runtime
                           greatly eases device management by providing implicit initialization,
                           context management, and device code module management. The C/C++ host
                           code generated by <samp class="ph codeph">nvcc</samp> utilizes the CUDA Runtime, so
                           applications that link to this code will depend on the CUDA Runtime;
                           similarly, any code that uses the <samp class="ph codeph">cuBLAS</samp>,
                           <samp class="ph codeph">cuFFT</samp>, and other CUDA Toolkit libraries will also
                           depend on the CUDA Runtime, which is used internally by these
                           libraries.
                        </p>
                        <p class="p">The functions that make up the CUDA Runtime API are explained in the
                           <cite class="cite">CUDA Toolkit Reference Manual</cite>.
                        </p>
                        <p class="p">The CUDA Runtime handles kernel loading and setting up kernel
                           parameters and launch configuration before the kernel is launched. The
                           implicit driver version checking, code initialization, CUDA context
                           management, CUDA module management (cubin to function mapping), kernel
                           configuration, and parameter passing are all performed by the CUDA
                           Runtime.
                        </p>
                        <p class="p">It comprises two principal parts:</p>
                        <ul class="ul">
                           <li class="li">A C-style function interface
                              (<samp class="ph codeph">cuda_runtime_api.h</samp>).
                           </li>
                           <li class="li">C++-style convenience wrappers
                              (<samp class="ph codeph">cuda_runtime.h</samp>) built on top of the
                              C-style functions.
                           </li>
                        </ul>
                        <p class="p">For more information on the Runtime API, refer to <cite class="cite">CUDA C
                              Runtime</cite> of the <cite class="cite">CUDA C Programming Guide</cite>.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="cuda-compatibility-and-upgrades"><a name="cuda-compatibility-and-upgrades" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#cuda-compatibility-and-upgrades" name="cuda-compatibility-and-upgrades" shape="rect">15.&nbsp;CUDA Compatibility and Upgrades</a></h2>
                  <div class="body conbody">
                     <p class="p">The enterprise users have a choice of upgrading the CUDA Toolkit with greater
                        flexibility. See below. 
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cuda-runtime-and-driver-api-version"><a name="cuda-runtime-and-driver-api-version" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cuda-runtime-and-driver-api-version" name="cuda-runtime-and-driver-api-version" shape="rect">15.1.&nbsp;CUDA Runtime and Driver API Version</a></h3>
                     <div class="body conbody">
                        <p class="p">The CUDA Driver API and the CUDA Runtime are two of the programming
                           interfaces to CUDA. Their version number enables developers to check
                           the features associated with these APIs and decide whether an
                           application requires a newer (later) version than the one currently
                           installed. This is important because the CUDA Driver API is
                           <dfn class="term">backward compatible but not forward compatible</dfn>, meaning
                           that applications, plug-ins, and libraries (including the CUDA Runtime)
                           compiled against a particular version of the Driver API will continue
                           to work on subsequent (later) driver releases. However, applications,
                           plug-ins, and libraries (including the CUDA Runtime) compiled against a
                           particular version of the Driver API may not work on earlier versions
                           of the driver, as illustrated in <a class="xref" href="index.html#cuda-runtime-and-driver-api-version__compatibility-of-cuda-versions" shape="rect">Figure 13</a>.
                        </p>
                        <div class="fig fignone" id="cuda-runtime-and-driver-api-version__compatibility-of-cuda-versions"><a name="cuda-runtime-and-driver-api-version__compatibility-of-cuda-versions" shape="rect">
                              <!-- --></a><span class="figcap">Figure 13. Compatibility of CUDA versions</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="../common/graphics/compatibility-of-cuda-versions.png" alt="Compatibility of CUDA versions."></img></div><br clear="none"></br></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="standard-upgrade-path"><a name="standard-upgrade-path" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#standard-upgrade-path" name="standard-upgrade-path" shape="rect">15.2.&nbsp;Standard Upgrade Path</a></h3>
                     <div class="body conbody">
                        <p class="p">The standard upgrade path from CUDA 9 to CUDA 10 is shown in the <a class="xref" href="index.html#standard-upgrade-path__standard-upgrade-cuda" shape="rect">Figure 14</a>.
                        </p>
                        <div class="fig fignone" id="standard-upgrade-path__standard-upgrade-cuda"><a name="standard-upgrade-path__standard-upgrade-cuda" shape="rect">
                              <!-- --></a><span class="figcap">Figure 14. Standard Upgrade Path</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" width="350" src="../common/graphics/standard_upgrade_cuda.png" alt="Compatibility of CUDA versions."></img></div><br clear="none"></br></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="flexible-upgrade-path"><a name="flexible-upgrade-path" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#flexible-upgrade-path" name="flexible-upgrade-path" shape="rect">15.3.&nbsp;Flexible Upgrade Path</a></h3>
                     <div class="body conbody">
                        <p class="p">Beginning with CUDA 10, enterprise users with Tesla GPUs have the option to upgrade
                           to newer versions of CUDA with greater flexibility. With this option the user<em class="ph i">
                              does not have</em> to update the GPU Kernel Mode Driver components as long as
                           they are validated on specific enterprise driver branches. See <a class="xref" href="index.html#flexible-upgrade-path__flexible-upgrade-cuda" shape="rect">Figure 15</a>.
                        </p>
                        <p class="p"></p>
                        <div class="fig fignone" id="flexible-upgrade-path__flexible-upgrade-cuda"><a name="flexible-upgrade-path__flexible-upgrade-cuda" shape="rect">
                              <!-- --></a><span class="figcap">Figure 15. Flexible Upgrade Path</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" width="350" src="../common/graphics/flexible-upgrade-cuda.png" alt="Compatibility of CUDA versions."></img></div><br clear="none"></br></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cuda-compatibility-package"><a name="cuda-compatibility-package" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cuda-compatibility-package" name="cuda-compatibility-package" shape="rect">15.4.&nbsp;CUDA Compatibility Platform Package</a></h3>
                     <div class="body conbody">
                        <div class="p">The flexible upgrade is accomplished by making use of the files in the CUDA
                           Compatibility Platform package in the CUDA 10 distribution. This package is
                           comprised of three files, namely: <a name="cuda-compatibility-package__ol_vnd_y5z_cfb" shape="rect">
                              <!-- --></a><ol class="ol" id="cuda-compatibility-package__ol_vnd_y5z_cfb">
                              <li class="li">CUDA Driver (libcuda.so), </li>
                              <li class="li">The Fatbinary Loader (libnvidia-fatbinaryloader.so), and </li>
                              <li class="li">JIT Compiler (libnvidia-ptxjitcompiler.so) 
                                 <p class="p"></p>
                              </li>
                           </ol>
                        </div>
                        <p class="p">After the installation, the users and the system administrators of the compatibility
                           platform should configure the system loader to pick up the new set of user-mode
                           components. This is typically accomplished by either setting the environment
                           variable<samp class="ph codeph"> LD_LIBRARY_PATH</samp> or updating the
                           <samp class="ph codeph">ld.so.conf</samp> file and then running the <samp class="ph codeph">ldconfig</samp>
                           to ensure that the updates to the <samp class="ph codeph">ld.so.conf</samp> are properly linked.
                           This will allow the CUDA 10 Toolkit to run on the existing kernel mode driver
                           components, that is, without upgrading these components to the CUDA 10 versions. 
                        </p>
                        <p class="p">The organization of the CUDA Compatibility Platform package should follow as shown in
                           <a class="xref" href="index.html#cuda-compatibility-package__cuda-compatibility-package" shape="rect">Figure 16</a>. 
                        </p>
                        <div class="fig fignone" id="cuda-compatibility-package__cuda-compatibility-package"><a name="cuda-compatibility-package__cuda-compatibility-package" shape="rect">
                              <!-- --></a><span class="figcap">Figure 16. CUDA Compatibility Platform Package</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" width="450" src="../common/graphics/cuda-compatibility-platform-package.png" alt="Compatibility of CUDA versions."></img></div><br clear="none"></br></div>
                        <div class="p"><strong class="ph b">The following applies for the CUDA Compatibility Platform Package:</strong><a name="cuda-compatibility-package__ul_qmj_lvz_cfb" shape="rect">
                              <!-- --></a><ul class="ul" id="cuda-compatibility-package__ul_qmj_lvz_cfb">
                              <li class="li">The libraries in the CUDA Compatibility Platform package are intended to be
                                 used alongside existing driver installations. 
                              </li>
                              <li class="li">The CUDA Compatibility Platform package files should be located in a single
                                 path and should not be split up.
                              </li>
                              <li class="li">The CUDA Compatibility Platform package is versioned according to the
                                 runtime version that it supports. 
                              </li>
                           </ul>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="extended-nvidia-smi"><a name="extended-nvidia-smi" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#extended-nvidia-smi" name="extended-nvidia-smi" shape="rect">15.5.&nbsp;Extended nvidia-smi</a></h3>
                     <div class="body conbody">
                        <p class="p">In order to help both the administrator and the users, the
                           <samp class="ph codeph">nvidia-smi</samp> is enhanced to show the CUDA version in its display.
                           It will use the currently configured paths to determine which CUDA version is being
                           used. 
                        </p>
                        <p class="p">For more information on the Runtime API, refer to <cite class="cite">CUDA C
                              Runtime</cite> of the <cite class="cite">CUDA C Programming Guide</cite>.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="preparing-for-deployment"><a name="preparing-for-deployment" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#preparing-for-deployment" name="preparing-for-deployment" shape="rect">16.&nbsp;Preparing for Deployment</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="testing-for-cuda-availability"><a name="testing-for-cuda-availability" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#testing-for-cuda-availability" name="testing-for-cuda-availability" shape="rect">16.1.&nbsp;Testing for CUDA Availability</a></h3>
                     <div class="body conbody">
                        <div class="section">
                           <p class="p">When deploying a CUDA application, it is often desirable to
                              ensure that the an application will continue to function properly
                              even if the target machine does not have a CUDA-capable GPU and/or
                              a sufficient version of the NVIDIA Driver installed. (Developers
                              targeting a single machine with known configuration may choose to
                              skip this section.)
                           </p>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Detecting a CUDA-Capable GPU</h3>
                           <p class="p">When an application will be deployed to target machines of
                              arbitrary/unknown configuration, the application should explicitly test
                              for the existence of a CUDA-capable GPU in order to take appropriate
                              action when no such device is available. The
                              <samp class="ph codeph">cudaGetDeviceCount()</samp> function can be used to query for
                              the number of available devices. Like all CUDA Runtime API functions,
                              this function will fail gracefully and return
                              <samp class="ph codeph">cudaErrorNoDevice</samp> to the application if there is no
                              CUDA-capable GPU or <samp class="ph codeph">cudaErrorInsufficientDriver</samp> if
                              there is not an appropriate version of the NVIDIA Driver installed. If
                              <samp class="ph codeph">cudaGetDeviceCount()</samp> reports an error, the
                              application should fall back to an alternative code path.
                           </p>
                           <p class="p">A system with multiple GPUs may contain GPUs of different hardware
                              versions and capabilities. When using multiple GPUs from the same
                              application, it is recommended to use GPUs of the same type, rather
                              than mixing hardware generations. The
                              <samp class="ph codeph">cudaChooseDevice()</samp> function can be used to select the
                              device that most closely matches a desired set of features.
                           </p>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Detecting Hardware and Software Configuration</h3>
                           <p class="p">When an application depends on the availability of certain hardware
                              or software capabilities to enable certain functionality, the CUDA API
                              can be queried for details about the configuration of the available
                              device and for the installed software versions.
                           </p>
                           <p class="p">The <samp class="ph codeph">cudaGetDeviceProperties()</samp> function reports
                              various features of the available devices, including the <a class="xref" href="index.html#cuda-compute-capability" shape="rect">CUDA Compute Capability</a> of the device (see also the
                              <cite class="cite">Compute Capabilities</cite> section of the <cite class="cite">CUDA C
                                 Programming Guide</cite>). See <a class="xref" href="index.html#cuda-runtime-and-driver-api-version" shape="rect">CUDA Runtime and Driver API Version</a> for details on how to
                              query the available CUDA software API versions.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="error-handling"><a name="error-handling" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#error-handling" name="error-handling" shape="rect">16.2.&nbsp;Error Handling</a></h3>
                     <div class="body conbody">
                        <p class="p">All CUDA Runtime API calls return an error code of type
                           <samp class="ph codeph">cudaError_t</samp>; the return value will be equal to
                           <samp class="ph codeph">cudaSuccess</samp> if no errors have occurred. (The
                           exceptions to this are kernel launches, which return void, and
                           <samp class="ph codeph">cudaGetErrorString()</samp>, which returns a character string
                           describing the <samp class="ph codeph">cudaError_t</samp> code that was passed into
                           it.) The CUDA Toolkit libraries (<samp class="ph codeph">cuBLAS</samp>,
                           <samp class="ph codeph">cuFFT</samp>, etc.) likewise return their own sets of error
                           codes.
                        </p>
                        <p class="p">Since some CUDA API calls and all kernel launches are asynchronous
                           with respect to the host code, errors may be reported to the host
                           asynchronously as well; often this occurs the next time the host and
                           device synchronize with each other, such as during a call to
                           <samp class="ph codeph">cudaMemcpy()</samp> or to
                           <samp class="ph codeph">cudaDeviceSynchronize()</samp>.
                        </p>
                        <p class="p">Always check the error return values on all CUDA API functions, even
                           for functions that are not expected to fail, as this will allow the
                           application to detect and recover from errors as soon as possible
                           should they occur. Applications that do not check for CUDA API errors
                           could at times run to completion without having noticed that the data
                           calculated by the GPU is incomplete, invalid, or uninitialized.
                        </p>
                        <div class="note note"><span class="notetitle">Note:</span> The CUDA Toolkit Samples provide several helper functions for
                           error checking with the various CUDA APIs; these helper functions are
                           located in the <samp class="ph codeph">samples/common/inc/helper_cuda.h</samp> file
                           in the CUDA Toolkit.
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="building-for-maximum-compatibility"><a name="building-for-maximum-compatibility" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#building-for-maximum-compatibility" name="building-for-maximum-compatibility" shape="rect">16.3.&nbsp;Building for Maximum Compatibility</a></h3>
                     <div class="body conbody">
                        <div class="section">
                           <p class="p">Each generation of CUDA-capable device has an associated <em class="ph i">compute
                                 capability</em> version that indicates the feature set supported by the
                              device (see <a class="xref" href="index.html#cuda-compute-capability" shape="rect">CUDA Compute Capability</a>). One or more
                              <a class="xref" href="index.html#cuda-compute-capability" shape="rect">compute capability</a>
                              versions can be specified to the nvcc compiler while
                              building a file; compiling for the native compute capability for the
                              target GPU(s) of the application is important to ensure that
                              application kernels achieve the best possible performance and are able
                              to use the features that are available on a given generation of GPU.
                           </p>
                           <p class="p">When an application is built for multiple compute capabilities
                              simultaneously (using several instances of the
                              <samp class="ph codeph">-gencode</samp> flag to nvcc), the binaries for the specified
                              compute capabilities are combined into the executable, and the CUDA
                              Driver selects the most appropriate binary at runtime according to the
                              compute capability of the present device. If an appropriate native
                              binary (<em class="ph i">cubin</em>) is not available, but the intermediate <em class="ph i">PTX</em>
                              code (which targets an abstract virtual instruction set and is used for
                              forward-compatibility) is available, then the kernel will be compiled
                              <em class="ph i">Just In Time</em> (JIT) (see <a class="xref" href="index.html#compiler-jit-cache-management" shape="rect">Compiler JIT Cache Management Tools</a>) from the PTX to the native
                              cubin for the device. If the PTX is also not available, then the kernel
                              launch will fail.
                           </p>
                        </div>
                        <div class="example">
                           <h3 class="title sectiontitle">Windows</h3><pre class="pre screen" xml:space="preserve">nvcc.exe -ccbin "C:\vs2008\VC\bin"
  -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT"
  -gencode=arch=compute_20,code=sm_20
  -gencode=arch=compute_30,code=sm_30
  -gencode=arch=compute_35,code=sm_35
  -gencode=arch=compute_50,code=sm_50
  -gencode=arch=compute_50,code=compute_50
  --compile -o "Release\mykernel.cu.obj" "mykernel.cu"</pre></div>
                        <div class="example">
                           <h3 class="title sectiontitle">Mac/Linux</h3><pre class="pre screen" xml:space="preserve">/usr/local/cuda/bin/nvcc
  -gencode=arch=compute_20,code=sm_20
  -gencode=arch=compute_30,code=sm_30
  -gencode=arch=compute_35,code=sm_35
  -gencode=arch=compute_50,code=sm_50
  -gencode=arch=compute_50,code=compute_50
  -O2 -o mykernel.o -c mykernel.cu</pre></div>
                        <div class="section">
                           <p class="p">Alternatively, the <samp class="ph codeph">nvcc</samp> command-line option
                              <samp class="ph codeph">-arch=sm_XX</samp> can be used as a shorthand equivalent to
                              the following more explicit <samp class="ph codeph">-gencode=</samp> command-line
                              options described above:
                           </p><pre class="pre screen" xml:space="preserve">  -gencode=arch=compute_XX,code=sm_XX
  -gencode=arch=compute_XX,code=compute_XX</pre><p class="p">However, while the <samp class="ph codeph">-arch=sm_XX</samp> command-line option
                              does result in inclusion of a PTX back-end target by default (due to
                              the <samp class="ph codeph">code=compute_XX</samp> target it implies), it can only
                              specify a single target <samp class="ph codeph">cubin</samp> architecture at a time,
                              and it is not possible to use multiple <samp class="ph codeph">-arch=</samp> options
                              on the same <samp class="ph codeph">nvcc</samp> command line, which is why the
                              examples above use <samp class="ph codeph">-gencode=</samp> explicitly.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="distributing-cuda-runtime-and-libraries"><a name="distributing-cuda-runtime-and-libraries" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#distributing-cuda-runtime-and-libraries" name="distributing-cuda-runtime-and-libraries" shape="rect">16.4.&nbsp;Distributing the CUDA Runtime and Libraries</a></h3>
                     <div class="body conbody">
                        <div class="section">
                           <p class="p">CUDA applications are built against the CUDA Runtime library, which
                              handles device, memory, and kernel management. Unlike the <a class="xref" href="index.html#cuda-runtime-and-driver-api-version__compatibility-of-cuda-versions" shape="rect">CUDA
                                 Driver</a>, the CUDA Runtime guarantees neither forward nor backward
                              binary compatibility across versions. It is therefore best to <a class="xref" href="index.html#redistribution" shape="rect">redistribute</a> the CUDA Runtime library
                              with the application when using dynamic linking or else to statically
                              link against the CUDA Runtime. This will ensure that the executable
                              will be able to run even if the user does not have the same CUDA
                              Toolkit installed that the application was built against.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span> When statically linking to the CUDA Runtime, multiple versions of
                              the runtime can peacably coexist in the same application process
                              simultaneously; for example, if an application uses one version of the
                              CUDA Runtime, and a plugin to that application is statically linked to
                              a different version, that is perfectly acceptable, as long as the
                              installed NVIDIA Driver is sufficient for both.
                           </div>
                        </div>
                        <div class="section" id="distributing-cuda-runtime-and-libraries__static-cudart"><a name="distributing-cuda-runtime-and-libraries__static-cudart" shape="rect">
                              <!-- --></a><h3 class="title sectiontitle">Statically-linked CUDA Runtime</h3>
                           <p class="p">The easiest option is to statically link against the CUDA Runtime.
                              This is the default if using <samp class="ph codeph">nvcc</samp> to link in CUDA 5.5
                              and later. Static linking makes the executable slightly larger, but it
                              ensures that the correct version of runtime library functions are
                              included in the application binary without requiring separate
                              redistribution of the CUDA Runtime library.
                           </p>
                        </div>
                        <div class="section" id="distributing-cuda-runtime-and-libraries__dynamic-cudart"><a name="distributing-cuda-runtime-and-libraries__dynamic-cudart" shape="rect">
                              <!-- --></a><h3 class="title sectiontitle">Dynamically-linked CUDA Runtime</h3>
                           <p class="p">If static linking against the CUDA Runtime is impractical for some
                              reason, then a dynamically-linked version of the CUDA Runtime library
                              is also available.  (This was the default and only option provided in
                              CUDA versions 5.0 and earlier.)
                           </p>
                           <p class="p">To use dynamic linking with the CUDA Runtime when using the
                              <samp class="ph codeph">nvcc</samp> from CUDA 5.5 or later to link the application,
                              add the <samp class="ph codeph">--cudart=shared</samp> flag to the link command line;
                              otherwise the <a class="xref" href="index.html#distributing-cuda-runtime-and-libraries__static-cudart" shape="rect">statically-linked
                                 CUDA Runtime library</a> is used by default.
                           </p>
                           <p class="p">After the application is dynamically linked against the CUDA
                              Runtime, this version of the runtime library should be <a class="xref" href="index.html#redistribution" shape="rect">bundled with</a> the application. It can be
                              copied into the same directory as the application executable or into a
                              subdirectory of that installation path.
                           </p>
                        </div>
                        <div class="section" id="distributing-cuda-runtime-and-libraries__otherlibs"><a name="distributing-cuda-runtime-and-libraries__otherlibs" shape="rect">
                              <!-- --></a><h3 class="title sectiontitle">Other CUDA Libraries</h3>
                           <p class="p">Although the CUDA Runtime provides the option of static linking, the
                              other libraries included in the CUDA Toolkit (cuBLAS, cuFFT, etc.) are
                              available only in dynamically-linked form. As with the <a class="xref" href="index.html#distributing-cuda-runtime-and-libraries__dynamic-cudart" shape="rect">dynamically-linked
                                 version of the CUDA Runtime library</a>, these libraries should be
                              <a class="xref" href="index.html#redistribution" shape="rect">bundled with</a> the application
                              executable when distributing that application.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="redistribution"><a name="redistribution" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#redistribution" name="redistribution" shape="rect">16.4.1.&nbsp;CUDA Toolkit Library Redistribution</a></h3>
                        <div class="body conbody">
                           <p class="p">The CUDA Toolkit's End-User License Agreement (EULA) allows for
                              redistribution of many of the CUDA libraries under certain terms and
                              conditions. This allows applications that depend on these libraries
                              <a class="xref" href="index.html#redistribution--which-files" shape="rect">to redistribute the exact
                                 versions</a> of the libraries against which they were built and
                              tested, thereby avoiding any trouble for end users who might have a
                              different version of the CUDA Toolkit (or perhaps none at all)
                              installed on their machines. Please refer to the EULA for details.
                           </p>
                           <div class="note note"><span class="notetitle">Note:</span> This does <em class="ph i">not</em> apply to the NVIDIA Driver; the end user
                              must still download and install an NVIDIA Driver appropriate to their
                              GPU(s) and operating system.
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="redistribution--which-files"><a name="redistribution--which-files" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#redistribution--which-files" name="redistribution--which-files" shape="rect">16.4.1.1.&nbsp;Which Files to Redistribute</a></h3>
                           <div class="body conbody">
                              <div class="section">
                                 <p class="p">When redistributing the dynamically-linked versions of one or more
                                    CUDA libraries, it is important to identify the exact files that need
                                    to be redistributed. The following examples use the cuBLAS library from
                                    CUDA Toolkit 5.5 as an illustration:
                                 </p>
                              </div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Linux</h5>
                                 <p class="p">In a shared library on Linux, there is a string field called the
                                    <samp class="ph codeph">SONAME</samp> that indicates the binary compatibility level
                                    of the library. The <samp class="ph codeph">SONAME</samp> of the library against
                                    which the application was built must match the filename of the library
                                    that is redistributed with the application.
                                 </p>
                                 <div class="p">For example, in the standard CUDA Toolkit installation, the files
                                    <samp class="ph codeph">libcublas.so</samp> and <samp class="ph codeph">libcublas.so.5.5</samp> are
                                    both symlinks pointing to a specific build of cuBLAS, which is named
                                    like <samp class="ph codeph">libcublas.so.5.5.<em class="ph i">x</em></samp>, where <em class="ph i">x</em> is the
                                    build number (e.g., <samp class="ph codeph">libcublas.so.5.5.17</samp>). However, the
                                    <samp class="ph codeph">SONAME</samp> of this library is given as
                                    "<samp class="ph codeph">libcublas.so.5.5</samp>":
                                    <pre class="pre screen" xml:space="preserve"><strong class="ph b">$</strong> objdump -p /usr/local/cuda/lib64/libcublas.so | grep SONAME
   SONAME               <strong class="ph b">libcublas.so.5.5</strong></pre></div>
                                 <p class="p">Because of this, even if <samp class="ph codeph">-lcublas</samp> (with no version
                                    number specified) is used when linking the application, the
                                    <samp class="ph codeph">SONAME</samp> found at link time implies that
                                    "<samp class="ph codeph">libcublas.so.5.5</samp>" is the name of the file that the
                                    dynamic loader will look for when loading the application and therefore
                                    must be the name of the file (or a symlink to the same) that is
                                    redistributed with the application.
                                 </p>
                                 <div class="p">The <samp class="ph codeph">ldd</samp> tool is useful for identifying the exact
                                    filenames of the libraries that the application expects to find at
                                    runtime as well as the path, if any, of the copy of that library that
                                    the dynamic loader would select when loading the application given the
                                    current library search path:
                                    <pre class="pre screen" xml:space="preserve"><strong class="ph b">$</strong> ldd a.out | grep libcublas
   <strong class="ph b">libcublas.so.5.5</strong> =&gt; /usr/local/cuda/lib64/libcublas.so.5.5</pre></div>
                              </div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Mac</h5>
                                 <p class="p">In a shared library on Mac OS X, there is a field called the
                                    <samp class="ph codeph">install name</samp> that indicates the expected installation
                                    path and filename the library; the CUDA libraries also use this
                                    filename to indicate binary compatibility. The value of this field is
                                    propagated into an application built against the library and is used to
                                    locate the library of the correct version at runtime.
                                 </p>
                                 <p class="p">For example, if the install name of the cuBLAS library is given as
                                    <samp class="ph codeph">@rpath/libcublas.5.5.dylib</samp>, then the library is
                                    version 5.5 and the copy of this library redistributed with the
                                    application must be named <samp class="ph codeph">libcublas.5.5.dylib</samp>, even
                                    though only <samp class="ph codeph">-lcublas</samp> (with no version number
                                    specified) is used at link time. Furthermore, this file should be
                                    installed into the <samp class="ph codeph">@rpath</samp> of the application; see
                                    <a class="xref" href="index.html#redistribution--where-to-install" shape="rect">Where to Install Redistributed CUDA Libraries</a>.
                                 </p>
                                 <div class="p">To view a library's install name, use the <samp class="ph codeph">otool -L</samp>
                                    command:
                                    <pre class="pre screen" xml:space="preserve"><strong class="ph b">$</strong> otool -L a.out
a.out:
        @rpath/<strong class="ph b">libcublas.5.5.dylib</strong> (...)</pre></div>
                              </div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Windows</h5>
                                 <p class="p">The binary compatibility version of the CUDA libraries on Windows
                                    is indicated as part of the filename.
                                 </p>
                                 <p class="p">For example, a 64-bit application linked to cuBLAS 5.5 will look for
                                    <samp class="ph codeph">cublas64_55.dll</samp> at runtime, so this is the file that
                                    should be redistributed with that application, even though
                                    <samp class="ph codeph">cublas.lib</samp> is the file that the application is linked
                                    against. For 32-bit applications, the file would be
                                    <samp class="ph codeph">cublas32_55.dll</samp>.
                                 </p>
                                 <div class="p">To verify the exact DLL filename that the application expects
                                    to find at runtime, use the <samp class="ph codeph">dumpbin</samp> tool from the
                                    Visual Studio command prompt:
                                    <pre class="pre screen" xml:space="preserve"><strong class="ph b">$</strong> dumpbin /IMPORTS a.exe
Microsoft (R) COFF/PE Dumper Version 10.00.40219.01
Copyright (C) Microsoft Corporation.  All rights reserved.


Dump of file a.exe

File Type: EXECUTABLE IMAGE

  Section contains the following imports:

    ...
    <strong class="ph b">cublas64_55.dll</strong>
    ...</pre></div>
                              </div>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="redistribution--where-to-install"><a name="redistribution--where-to-install" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#redistribution--where-to-install" name="redistribution--where-to-install" shape="rect">16.4.1.2.&nbsp;Where to Install Redistributed CUDA Libraries</a></h3>
                           <div class="body conbody">
                              <div class="section">
                                 <p class="p">Once the correct library files are identified for redistribution,
                                    they must be configured for installation into a location where the
                                    application will be able to find them.
                                 </p>
                                 <p class="p">On Windows, if the CUDA Runtime or other dynamically-linked CUDA
                                    Toolkit library is placed in the same directory as the executable,
                                    Windows will locate it automatically. On Linux and Mac, the
                                    <samp class="ph codeph">-rpath</samp> linker option should be used to instruct the
                                    executable to search its local path for these libraries before
                                    searching the system paths:
                                 </p>
                              </div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Linux/Mac</h5><pre class="pre screen" xml:space="preserve">nvcc -I $(CUDA_HOME)/include
  <strong class="ph b">-Xlinker "-rpath '$ORIGIN'" --cudart=shared</strong>
  -o myprogram myprogram.cu</pre></div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Windows</h5><pre class="pre screen" xml:space="preserve">nvcc.exe -ccbin "C:\vs2008\VC\bin"
  -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT" <strong class="ph b">--cudart=shared</strong>
  -o "Release\myprogram.exe" "myprogram.cu"</pre></div>
                              <div class="section">
                                 <div class="note note"><span class="notetitle">Note:</span> It may be necessary to adjust the value of <samp class="ph codeph">-ccbin</samp>
                                    to reflect the location of your Visual Studio installation.
                                 </div>
                                 <p class="p">To specify an alternate path where the libraries will be
                                    distributed, use linker options similar to those below:
                                 </p>
                              </div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Linux/Mac</h5><pre class="pre screen" xml:space="preserve">nvcc -I $(CUDA_HOME)/include
  <strong class="ph b">-Xlinker "-rpath '$ORIGIN/lib'" --cudart=shared</strong>
  -o myprogram myprogram.cu</pre></div>
                              <div class="example">
                                 <h5 class="title sectiontitle">Windows</h5><pre class="pre screen" xml:space="preserve">nvcc.exe -ccbin "C:\vs2008\VC\bin"
  -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT <strong class="ph b">/DELAY</strong>" <strong class="ph b">--cudart=shared</strong>
  -o "Release\myprogram.exe" "myprogram.cu"</pre></div>
                              <div class="section">
                                 <p class="p">For Linux and Mac, the <samp class="ph codeph">-rpath</samp> option is used as
                                    before. For Windows, the <samp class="ph codeph">/DELAY</samp> option is used; this
                                    requires that the application call <samp class="ph codeph">SetDllDirectory()</samp>
                                    before the first call to any CUDA API function in order to specify the
                                    directory containing the CUDA DLLs.
                                 </p>
                                 <div class="note note"><span class="notetitle">Note:</span> For Windows 8, <samp class="ph codeph">SetDefaultDLLDirectories()</samp> and
                                    <samp class="ph codeph">AddDllDirectory()</samp> should be used instead of
                                    <samp class="ph codeph">SetDllDirectory()</samp>. Please see the MSDN documentation
                                    for these routines for more information.
                                 </div>
                              </div>
                           </div>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="deployment-infrastructure-tools"><a name="deployment-infrastructure-tools" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#deployment-infrastructure-tools" name="deployment-infrastructure-tools" shape="rect">17.&nbsp;Deployment Infrastructure Tools</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="nvidia-smi"><a name="nvidia-smi" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvidia-smi" name="nvidia-smi" shape="rect">17.1.&nbsp;Nvidia-SMI</a></h3>
                     <div class="body conbody">
                        <p class="p">The NVIDIA System Management Interface (<samp class="ph codeph">nvidia-smi</samp>)
                           is a command line utility that aids in the management and monitoring of
                           NVIDIA GPU devices. This utility allows administrators to query GPU
                           device state and, with the appropriate privileges, permits
                           administrators to modify GPU device state. <samp class="ph codeph">nvidia-smi</samp>
                           is targeted at Tesla and certain Quadro GPUs, though limited support is
                           also available on other NVIDIA GPUs.  <samp class="ph codeph">nvidia-smi</samp> ships
                           with NVIDIA GPU display drivers on Linux, and with 64-bit Windows
                           Server 2008 R2 and Windows 7.  <samp class="ph codeph">nvidia-smi</samp> can output
                           queried information as XML or as human-readable plain text either to
                           standard output or to a file. See the nvidia-smi documenation for
                           details. Please note that new versions of nvidia-smi are not guaranteed
                           to be backward-compatible with previous versions.
                        </p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="queryable-state"><a name="queryable-state" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#queryable-state" name="queryable-state" shape="rect">17.1.1.&nbsp;Queryable state</a></h3>
                        <div class="body conbody">
                           <dl class="dl">
                              <dt class="dt dlterm">ECC error counts</dt>
                              <dd class="dd">Both correctable single-bit and detectable double-bit
                                 errors are reported. Error counts are provided for both the
                                 current boot cycle and the lifetime of the GPU.
                              </dd>
                              <dt class="dt dlterm">GPU utilization</dt>
                              <dd class="dd">Current utilization rates are reported for both the compute
                                 resources of the GPU and the memory interface.
                              </dd>
                              <dt class="dt dlterm">Active compute process</dt>
                              <dd class="dd">The list of active processes running on the GPU is
                                 reported, along with the corresponding process name/ID and
                                 allocated GPU memory.
                              </dd>
                              <dt class="dt dlterm">Clocks and performance state</dt>
                              <dd class="dd">Max and current clock rates are reported for several
                                 important clock domains, as well as the current GPU
                                 performance state (<dfn class="term">pstate</dfn>).
                              </dd>
                              <dt class="dt dlterm">Temperature and fan speed</dt>
                              <dd class="dd">The current GPU core temperature is reported, along with
                                 fan speeds for products with active cooling.
                              </dd>
                              <dt class="dt dlterm">Power management</dt>
                              <dd class="dd">The current board power draw and power limits are reported
                                 for products that report these measurements.
                              </dd>
                              <dt class="dt dlterm">Identification</dt>
                              <dd class="dd">Various dynamic and static information is reported,
                                 including board serial numbers, PCI device IDs,
                                 VBIOS/Inforom version numbers and product names.
                              </dd>
                           </dl>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="modifiable-state"><a name="modifiable-state" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#modifiable-state" name="modifiable-state" shape="rect">17.1.2.&nbsp;Modifiable state</a></h3>
                        <div class="body conbody">
                           <dl class="dl">
                              <dt class="dt dlterm">ECC mode</dt>
                              <dd class="dd">Enable and disable ECC reporting.</dd>
                              <dt class="dt dlterm">ECC reset</dt>
                              <dd class="dd">Clear single-bit and double-bit ECC error counts.</dd>
                              <dt class="dt dlterm">Compute mode</dt>
                              <dd class="dd">Indicate whether compute processes can run on the GPU and
                                 whether they run exclusively or concurrently with other
                                 compute processes.
                              </dd>
                              <dt class="dt dlterm">Persistence mode</dt>
                              <dd class="dd">Indicate whether the NVIDIA driver stays loaded when no
                                 applications are connected to the GPU. It is best to enable
                                 this option in most circumstances.
                              </dd>
                              <dt class="dt dlterm">GPU reset</dt>
                              <dd class="dd">Reinitialize the GPU hardware and software state via a
                                 secondary bus reset.
                              </dd>
                           </dl>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="nvml"><a name="nvml" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvml" name="nvml" shape="rect">17.2.&nbsp;NVML</a></h3>
                     <div class="body conbody">
                        <p class="p">The NVIDIA Management Library (NVML) is a C-based interface that
                           provides direct access to the queries and commands exposed via
                           <samp class="ph codeph">nvidia-smi</samp> intended as a platform for building
                           3rd-party system management applications. The NVML API is available on
                           the NVIDIA developer website as part of the Tesla Deployment Kit
                           through a single header file and is accompanied by PDF documentation,
                           stub libraries, and sample applications; see <a class="xref" href="http://developer.nvidia.com/tesla-deployment-kit" target="_blank" shape="rect">http://developer.nvidia.com/tesla-deployment-kit</a>.  Each new version of NVML is
                           backward-compatible.
                        </p>
                        <p class="p">An additional set of Perl and Python bindings are provided for the
                           NVML API. These bindings expose the same features as the C-based
                           interface and also provide backwards compatibility. The Perl bindings
                           are provided via CPAN and the Python bindings via PyPI.
                        </p>
                        <p class="p">All of these products (<samp class="ph codeph">nvidia-smi</samp>, NVML, and the
                           NVML language bindings) are updated with each new CUDA release and
                           provide roughly the same functionality.
                        </p>
                        <p class="p">See <a class="xref" href="http://developer.nvidia.com/nvidia-management-library-nvml" target="_blank" shape="rect">http://developer.nvidia.com/nvidia-management-library-nvml</a> for additional information.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cluster-management-tools"><a name="cluster-management-tools" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cluster-management-tools" name="cluster-management-tools" shape="rect">17.3.&nbsp;Cluster Management Tools</a></h3>
                     <div class="body conbody">
                        <p class="p">Managing your GPU cluster will help achieve maximum GPU utilization
                           and help you and your users extract the best possible performance. Many
                           of the industry's most popular cluster management tools now support
                           CUDA GPUs via NVML. For a listing of some of these tools, see <a class="xref" href="http://developer.nvidia.com/cluster-management" target="_blank" shape="rect">http://developer.nvidia.com/cluster-management</a>.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="compiler-jit-cache-management"><a name="compiler-jit-cache-management" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#compiler-jit-cache-management" name="compiler-jit-cache-management" shape="rect">17.4.&nbsp;Compiler JIT Cache Management Tools</a></h3>
                     <div class="body conbody">
                        <p class="p">Any PTX device code loaded by an application at runtime is compiled
                           further to binary code by the device driver. This is called
                           <dfn class="term">just-in-time compilation</dfn> (<dfn class="term">JIT</dfn>). Just-in-time
                           compilation increases application load time but allows applications to
                           benefit from latest compiler improvements. It is also the only way for
                           applications to run on devices that did not exist at the time the
                           application was compiled.
                        </p>
                        <p class="p">When JIT compilation of PTX device code is used, the NVIDIA driver
                           caches the resulting binary code on disk. Some aspects of this behavior
                           such as cache location and maximum cache size can be controlled via the
                           use of environment variables; see <cite class="cite">Just in Time Compilation</cite>
                           of the <cite class="cite">CUDA C Programming Guide</cite>.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cuda-visible-devices"><a name="cuda-visible-devices" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cuda-visible-devices" name="cuda-visible-devices" shape="rect">17.5.&nbsp;CUDA_VISIBLE_DEVICES</a></h3>
                     <div class="body conbody">
                        <p class="p">It is possible to rearrange the collection of installed CUDA devices
                           that will be visible to and enumerated by a CUDA application prior to
                           the start of that application by way of the
                           <samp class="ph codeph">CUDA_VISIBLE_DEVICES</samp> environment variable.
                        </p>
                        <p class="p">Devices to be made visible to the application should be included as
                           a comma-separated list in terms of the system-wide list of enumerable
                           devices. For example, to use only devices 0 and 2 from the system-wide
                           list of devices, set <samp class="ph codeph">CUDA_VISIBLE_DEVICES=0,2</samp> before
                           launching the application. The application will then enumerate these
                           devices as device 0 and device 1, respectively.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="recommendations-and-best-practices-appendix"><a name="recommendations-and-best-practices-appendix" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#recommendations-and-best-practices-appendix" name="recommendations-and-best-practices-appendix" shape="rect">A.&nbsp;Recommendations and Best Practices</a></h2>
                  <div class="body conbody">
                     <p class="p">This appendix contains a summary of the recommendations for
                        optimization that are explained in this document.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="overall-performance-optimization-strategies"><a name="overall-performance-optimization-strategies" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#overall-performance-optimization-strategies" name="overall-performance-optimization-strategies" shape="rect">A.1.&nbsp;Overall Performance Optimization Strategies</a></h3>
                     <div class="body conbody">
                        <p class="p">Performance optimization revolves around three basic strategies:</p>
                        <ul class="ul">
                           <li class="li">Maximizing parallel execution</li>
                           <li class="li">Optimizing memory usage to achieve maximum memory
                              bandwidth
                           </li>
                           <li class="li">Optimizing instruction usage to achieve maximum instruction
                              throughput
                           </li>
                        </ul>
                        <p class="p">Maximizing parallel execution starts with structuring the algorithm
                           in a way that exposes as much data parallelism as possible. Once the
                           parallelism of the algorithm has been exposed, it needs to be mapped to
                           the hardware as efficiently as possible. This is done by carefully
                           choosing the execution configuration of each kernel launch. The
                           application should also maximize parallel execution at a higher level
                           by explicitly exposing concurrent execution on the device through
                           streams, as well as maximizing concurrent execution between the host
                           and the device.
                        </p>
                        <p class="p">Optimizing memory usage starts with minimizing data transfers
                           between the host and the device because those transfers have much lower
                           bandwidth than internal device data transfers. Kernel access to global
                           memory also should be minimized by maximizing the use of shared memory
                           on the device. Sometimes, the best optimization might even be to avoid
                           any data transfer in the first place by simply recomputing the data
                           whenever it is needed.
                        </p>
                        <p class="p">The effective bandwidth can vary by an order of magnitude depending
                           on the access pattern for each type of memory. The next step in
                           optimizing memory usage is therefore to organize memory accesses
                           according to the optimal memory access patterns. This optimization is
                           especially important for global memory accesses, because latency of
                           access costs hundreds of clock cycles. Shared memory accesses, in
                           counterpoint, are usually worth optimizing only when there exists a
                           high degree of bank conflicts.
                        </p>
                        <p class="p">As for optimizing instruction usage, the use of arithmetic
                           instructions that have low throughput should be avoided. This suggests
                           trading precision for speed when it does not affect the end result,
                           such as using intrinsics instead of regular functions or single
                           precision instead of double precision. Finally, particular attention
                           must be paid to control flow instructions due to the SIMT (single
                           instruction multiple thread) nature of the device.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="nvcc-compiler-switches"><a name="nvcc-compiler-switches" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#nvcc-compiler-switches" name="nvcc-compiler-switches" shape="rect">B.&nbsp;nvcc Compiler Switches</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="nvcc"><a name="nvcc" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvcc" name="nvcc" shape="rect">B.1.&nbsp;nvcc</a></h3>
                     <div class="body conbody">
                        <p class="p">The NVIDIA <samp class="ph codeph">nvcc</samp> compiler driver converts
                           <samp class="ph codeph">.cu</samp> files into C for the host system and CUDA assembly
                           or binary instructions for the device. It supports a number of
                           command-line parameters, of which the following are especially useful
                           for optimization and related best practices:
                        </p>
                        <ul class="ul">
                           <li class="li"><samp class="ph codeph">-maxrregcount=N</samp> specifies the maximum
                              number of registers kernels can use at a per-file level. See
                              <a class="xref" href="index.html#register-pressure" shape="rect">Register Pressure</a>. (See also the<samp class="ph codeph">
                                 __launch_bounds__</samp> qualifier discussed in
                              <cite class="cite">Execution Configuration</cite> of the <cite class="cite">CUDA C
                                 Programming Guide</cite> to control the number of registers
                              used on a per-kernel basis.)
                           </li>
                           <li class="li"><samp class="ph codeph">--ptxas-options=-v</samp> or
                              <samp class="ph codeph">-Xptxas=-v</samp> lists per-kernel register, shared,
                              and constant memory usage.
                           </li>
                           <li class="li"><samp class="ph codeph">-ftz=true</samp> (denormalized numbers are flushed to
                              zero)
                           </li>
                           <li class="li"><samp class="ph codeph">-prec-div=false</samp> (less precise division)
                           </li>
                           <li class="li"><samp class="ph codeph">-prec-sqrt=false</samp> (less precise square root)
                           </li>
                           <li class="li"><samp class="ph codeph">-use_fast_math</samp> compiler option of
                              <samp class="ph codeph">nvcc</samp> coerces every
                              <samp class="ph codeph">functionName()</samp> call to the equivalent
                              <samp class="ph codeph">__functionName()</samp> call. This makes the code run
                              faster at the cost of diminished precision and accuracy. See
                              <a class="xref" href="index.html#math-libraries" shape="rect">Math Libraries</a>.
                           </li>
                        </ul>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="notices-header"><a name="notices-header" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#notices-header" name="notices-header" shape="rect">Notices</a></h2>
                  <div class="topic reference nested1" id="notice"><a name="notice" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#notice" name="notice" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Notice</h3>
                           <p class="p">ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS, LISTS, AND OTHER DOCUMENTS (TOGETHER AND
                              SEPARATELY, "MATERIALS") ARE BEING PROVIDED "AS IS." NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE
                              WITH RESPECT TO THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS
                              FOR A PARTICULAR PURPOSE. 
                           </p>
                           <p class="p">Information furnished is believed to be accurate and reliable. However, NVIDIA Corporation assumes no responsibility for the
                              consequences of use of such information or for any infringement of patents or other rights of third parties that may result
                              from its use. No license is granted by implication of otherwise under any patent rights of NVIDIA Corporation. Specifications
                              mentioned in this publication are subject to change without notice. This publication supersedes and replaces all other information
                              previously supplied. NVIDIA Corporation products are not authorized as critical components in life support devices or systems
                              without express written approval of NVIDIA Corporation.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="trademarks"><a name="trademarks" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#trademarks" name="trademarks" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Trademarks</h3>
                           <p class="p">NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation
                              in the U.S. and other countries.  Other company and product names may be trademarks of
                              the respective companies with which they are associated.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="copyright-past-to-present"><a name="copyright-past-to-present" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#copyright-past-to-present" name="copyright-past-to-present" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Copyright</h3>
                           <p class="p">© <span class="ph">2007</span>-<span class="ph">2019</span> NVIDIA
                              Corporation. All rights reserved.
                           </p>
                           <p class="p">This product includes software developed by the Syncro Soft SRL (http://www.sync.ro/).</p>
                        </div>
                     </div>
                  </div>
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script><script type="text/javascript">_satellite.pageBottom();</script></body>
</html>