<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="Tuning CUDA Applications for Maxwell"></meta>
      <meta name="abstract" content="The programming guide to tuning CUDA Applications for GPUs based on the NVIDIA Maxwell Architecture."></meta>
      <meta name="description" content="The programming guide to tuning CUDA Applications for GPUs based on the NVIDIA Maxwell Architecture."></meta>
      <meta name="DC.Coverage" content="Programming Guides"></meta>
      <meta name="DC.subject" content="CUDA Maxwell, CUDA Maxwell tuning, CUDA Maxwell best practices, CUDA Maxwell performance"></meta>
      <meta name="keywords" content="CUDA Maxwell, CUDA Maxwell tuning, CUDA Maxwell best practices, CUDA Maxwell performance"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="abstract"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>Maxwell Tuning Guide :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="../common/scripts/tynt/tynt.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/maxwell-tuning-guide/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit
                  v6.5</a></div>
            <div class="category"><a href="index.html" title="Maxwell Tuning Guide">Maxwell Tuning Guide</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="#tuning-cuda-applications-for-maxwell">1.&nbsp;Maxwell Tuning Guide</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#nvidia-maxwell-compute-architecture">1.1.&nbsp;NVIDIA Maxwell Compute Architecture</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cuda-best-practices">1.2.&nbsp;CUDA Best Practices</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#application-compatibility">1.3.&nbsp;Application Compatibility</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#maxwell-tuning">1.4.&nbsp;Maxwell Tuning</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#smm">1.4.1.&nbsp;SMM</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#smm-occupancy">1.4.1.1.&nbsp;Occupancy</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#smm-scheduling">1.4.1.2.&nbsp;Instruction Scheduling</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#smm-latencies">1.4.1.3.&nbsp;Instruction Latencies</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#smm-instruction-throughput">1.4.1.4.&nbsp;Instruction Throughput</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#memory-throughput">1.4.2.&nbsp;Memory Throughput</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#l1-cache">1.4.2.1.&nbsp;Unified L1/Texture Cache</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#shared-memory">1.4.3.&nbsp;Shared Memory</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#shared-memory-capacity">1.4.3.1.&nbsp;Shared Memory Capacity</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#shared-memory-bandwidth">1.4.3.2.&nbsp;Shared Memory Bandwidth</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#fast-shared-memory-atomics">1.4.3.3.&nbsp;Fast Shared Memory Atomics</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#dynamic-parallelism">1.4.4.&nbsp;Dynamic Parallelism</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#revision-history">A.&nbsp;Revision History</a></div>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="release-info">Maxwell Tuning Guide
                  (<a href="../../pdf/Maxwell_Tuning_Guide.pdf">PDF</a>)
                  -
                  
                  v6.5
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated August 1, 2014
                  -
                  <a href="mailto:cudatools@nvidia.com?subject=CUDA Toolkit Documentation Feedback: Maxwell Tuning Guide">Send Feedback</a>
                   </div>
            </div>
            <article id="contents">
               <div class="topic nested0" id="abstract"><a name="abstract" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#abstract" name="abstract" shape="rect">Tuning CUDA Applications for Maxwell</a></h2>
                  <div class="body conbody"></div>
               </div>
               <div class="topic concept nested0" xml:lang="en-US" id="tuning-cuda-applications-for-maxwell"><a name="tuning-cuda-applications-for-maxwell" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#tuning-cuda-applications-for-maxwell" name="tuning-cuda-applications-for-maxwell" shape="rect">1.&nbsp;Maxwell Tuning Guide</a></h2>
                  <div class="topic concept nested1" xml:lang="en-US" id="nvidia-maxwell-compute-architecture"><a name="nvidia-maxwell-compute-architecture" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvidia-maxwell-compute-architecture" name="nvidia-maxwell-compute-architecture" shape="rect">1.1.&nbsp;NVIDIA Maxwell Compute Architecture</a></h3>
                     <div class="body conbody">
                        <p class="p">Maxwell is NVIDIA's next-generation architecture for CUDA
                           compute applications. Maxwell retains and extends the same CUDA
                           programming model as in previous NVIDIA architectures such as Fermi and
                           Kepler, and applications that follow the best practices for those
                           architectures should typically see speedups on the Maxwell architecture
                           without any code changes. This guide summarizes the ways that an
                           application can be fine-tuned to gain additional speedups by leveraging
                           Maxwell architectural features.<a name="fnsrc_1" href="#fntarg_1" shape="rect"><sup>1</sup></a></p>
                        <p class="p">Maxwell introduces an all-new design for the Streaming
                           Multiprocessor (<dfn class="term">SM</dfn>) that dramatically improves
                            energy efficiency. Although the Kepler SMX design was
                            extremely efficient for its generation, in the course of its
                            development NVIDIA's GPU architects saw an opportunity for another
                            big leap forward in architectural efficiency; the Maxwell SM is the realization of that
                           vision. Improvements to control logic partitioning, workload balancing,
                           clock-gating granularity, compiler-based scheduling,
                           number of instructions issued per clock cycle, and many other
                           enhancements allow the Maxwell SM (also called <dfn class="term">SMM</dfn>) to far
                           exceed Kepler SMX efficiency.
                        </p>
                        <p class="p">The first Maxwell-based GPU is codenamed <dfn class="term">GM107</dfn> and is
                           designed for use in power-limited environments like notebooks and small
                           form factor (SFF) PCs.  GM107 is described in a whitepaper entitled
                           <a class="xref" href="http://international.download.nvidia.com/geforce-com/international/pdfs/GeForce-GTX-750-Ti-Whitepaper.pdf" target="_blank" shape="rect">NVIDIA GeForce GTX 750 Ti: Featuring
                              First-Generation Maxwell GPU Technology, Designed for Extreme
                              Performance per Watt</a>.
                        </p>
                        <p class="p">This guide currently focuses on the first-generation Maxwell GPUs.
                           A higher-performing second generation of Maxwell GPUs will be
                           introduced and described at a later date.
                        </p>
                        <p class="p">For details on the programming features discussed in this guide,
                           please refer to the <a class="xref" href="http://docs.nvidia.com/cuda/cuda-c-programming-guide/" target="_blank" shape="rect">CUDA C Programming Guide</a>.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="cuda-best-practices"><a name="cuda-best-practices" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cuda-best-practices" name="cuda-best-practices" shape="rect">1.2.&nbsp;CUDA Best Practices</a></h3>
                     <div class="body conbody">
                        <p class="p">The performance guidelines and best practices described in the <a class="xref" href="http://docs.nvidia.com/cuda/cuda-c-programming-guide/" target="_blank" shape="rect">CUDA C Programming Guide</a> and the
                           <a class="xref" href="http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/" target="_blank" shape="rect">CUDA C Best Practices Guide</a> apply
                           to all CUDA-capable GPU architectures. Programmers must primarily focus
                           on following those recommendations to achieve the best performance.
                        </p>
                        <div class="p">The high-priority recommendations from those guides are as follows:
                           
                           <ul class="ul">
                              <li class="li">Find ways to parallelize sequential code,</li>
                              <li class="li">Minimize data transfers between the host and the device,</li>
                              <li class="li">Adjust kernel launch configuration to maximize device
                                 utilization,
                              </li>
                              <li class="li">Ensure global memory accesses are coalesced,</li>
                              <li class="li">Minimize redundant accesses to global memory whenever
                                 possible,
                              </li>
                              <li class="li">Avoid long sequences of diverged execution by threads within
                                 the same warp.
                              </li>
                           </ul>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="application-compatibility"><a name="application-compatibility" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#application-compatibility" name="application-compatibility" shape="rect">1.3.&nbsp;Application Compatibility</a></h3>
                     <div class="body conbody">
                        <p class="p">Before addressing specific performance tuning issues
                           covered in this guide, refer to the <a class="xref" href="http://docs.nvidia.com/cuda/maxwell-compatibility-guide/" target="_blank" shape="rect">Maxwell Compatibility Guide for CUDA
                              Applications</a> to ensure that your application is compiled in a
                           way that is compatible with Maxwell.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-US" id="maxwell-tuning"><a name="maxwell-tuning" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#maxwell-tuning" name="maxwell-tuning" shape="rect">1.4.&nbsp;Maxwell Tuning</a></h3>
                     <div class="topic concept nested2" xml:lang="en-US" id="smm"><a name="smm" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#smm" name="smm" shape="rect">1.4.1.&nbsp;SMM</a></h3>
                        <div class="body conbody">
                           <p class="p">The Maxwell Streaming Multiprocessor, SMM, is similar in many
                              respects to the Kepler architecture's SMX.   The key enhancements
                              of SMM over SMX are geared toward improving efficiency without
                              requiring significant increases in available parallelism per SM
                              from the application.
                           </p>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="smm-occupancy"><a name="smm-occupancy" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#smm-occupancy" name="smm-occupancy" shape="rect">1.4.1.1.&nbsp;Occupancy</a></h3>
                           <div class="body conbody">
                              <div class="p">The maximum number of concurrent warps per SMM remains the
                                 same as in SMX (i.e., 64), and <a class="xref" href="http://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls" target="_blank" shape="rect">factors influencing warp
                                    occupancy</a> remain similar or improved over SMX:
                                 
                                 
                                 <ul class="ul">
                                    <li class="li">The register file size (64k 32-bit registers) is the
                                       same as that of SMX.
                                    </li>
                                    <li class="li">The maximum registers per thread, 255, matches that of
                                       Kepler GK110.  As with Kepler, experimentation should be
                                       used to determine the optimum balance of register spilling
                                       vs. occupancy, however.
                                    </li>
                                    <li class="li">The maximum number of thread blocks per SM has been
                                       increased from 16 to 32.  This should result in an
                                       automatic occupancy improvement for kernels with small
                                       thread blocks of 64 or fewer threads (shared memory and
                                       register file resource requirements permitting).  Such
                                       kernels would have tended to under-utilize SMX, but less so
                                       SMM.
                                    </li>
                                    <li class="li">Shared memory capacity is increased (see <a class="xref" href="index.html#shared-memory-capacity" shape="rect">Shared Memory Capacity</a>).
                                    </li>
                                 </ul>
                              </div>
                              <p class="p">As such, developers can expect similar or improved occupancy
                                 on SMM without changes to their application.  At the same time,
                                 warp occupancy requirements (i.e., available parallelism) for
                                 maximum device utilization are similar to or less than those of
                                 SMX (see <a class="xref" href="index.html#smm-latencies" shape="rect">Instruction Latencies</a>).
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="smm-scheduling"><a name="smm-scheduling" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#smm-scheduling" name="smm-scheduling" shape="rect">1.4.1.2.&nbsp;Instruction Scheduling</a></h3>
                           <div class="body conbody">
                              <p class="p">The number of CUDA Cores per SM has been reduced to a power
                                 of two, however with Maxwell's improved execution efficiency,
                                 performance per SM is usually within 10% of Kepler performance,
                                 and the improved area efficiency of SMM means CUDA Cores per
                                 GPU will be substantially higher vs. comparable Fermi or Kepler
                                 chips.  SMM retains the same number of instruction issue slots
                                 per clock and reduces arithmetic latencies compared to the
                                 Kepler design.
                              </p>
                              <p class="p">As with SMX, each SMM has four warp schedulers.  Unlike SMX,
                                 however, all SMM core functional units are assigned to a
                                 particular scheduler, with no shared units.  Along with the
                                 selection of a power-of-two number of CUDA Cores per SM, which
                                 simplifies scheduling and reduces stall cycles, this
                                 partitioning of SM computational resources in SMM is a major
                                 component of the streamlined efficiency of SMM.
                              </p>
                              <p class="p">The power-of-two number of CUDA Cores per partition
                                  simplifies scheduling, as each of SMM's warp schedulers issues
                                 to a dedicated set of CUDA Cores equal to the warp width.  Each
                                 warp scheduler still has the flexibility to dual-issue (such as
                                 issuing a math operation to a CUDA Core in the same cycle as a
                                 memory operation to a load/store unit), but single-issue is now
                                 sufficient to fully utilize all CUDA Cores.
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="smm-latencies"><a name="smm-latencies" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#smm-latencies" name="smm-latencies" shape="rect">1.4.1.3.&nbsp;Instruction Latencies</a></h3>
                           <div class="body conbody">
                              <p class="p">Another major improvement of SMM is that dependent math
                                 latencies have been significantly reduced; a consequence of
                                 this is a further reduction of stall cycles, as the available
                                 warp-level parallelism (i.e., occupancy) on SMM should be equal
                                 to or greater than that of SMX (see <a class="xref" href="index.html#smm-occupancy" shape="rect">Occupancy</a>), while at the same time each math
                                 operation takes <em class="ph i">less</em> time to complete, improving
                                 utilization and throughput.
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="smm-instruction-throughput"><a name="smm-instruction-throughput" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#smm-instruction-throughput" name="smm-instruction-throughput" shape="rect">1.4.1.4.&nbsp;Instruction Throughput</a></h3>
                           <div class="body conbody">
                              <div class="p">The most significant changes to peak instruction throughputs
                                 in SMM are as follows:
                                 
                                 <ul class="ul">
                                    <li class="li">The change in <a class="xref" href="index.html#smm-scheduling" shape="rect">number of
                                          CUDA Cores per SM</a> brings with it a corresponding
                                       change in peak single-precision floating point operations
                                       per clock per SM.  However, since the number of SMs is
                                       typically increased, the result is an increase in aggregate
                                       peak throughput; furthermore, the scheduling and latency
                                       improvements also discussed above make this peak easier to
                                       approach.
                                    </li>
                                    <li class="li">The throughput of many integer operations including
                                       multiply, logical operations and shift is improved.  In
                                       addition, there are now specialized integer instructions
                                       that can accelerate pointer arithmetic.  These instructions
                                       are most efficient when data structures are a power of two
                                       in size.
                                    </li>
                                 </ul>
                              </div>
                              <div class="note note"><span class="notetitle">Note:</span> As was already the recommended best practice, signed
                                 arithmetic should be preferred over unsigned arithmetic
                                 wherever possible for best throughput on SMM.  The C language
                                 standard places more restrictions on overflow behavior for
                                 unsigned math, limiting compiler optimization
                                 opportunities.
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="memory-throughput"><a name="memory-throughput" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#memory-throughput" name="memory-throughput" shape="rect">1.4.2.&nbsp;Memory Throughput</a></h3>
                        <div class="topic concept nested3" xml:lang="en-US" id="l1-cache"><a name="l1-cache" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#l1-cache" name="l1-cache" shape="rect">1.4.2.1.&nbsp;Unified L1/Texture Cache</a></h3>
                           <div class="body conbody">
                              <p class="p">Maxwell combines the functionality of the L1 and texture
                                 caches into a single unit.
                              </p>
                              <p class="p">As with Kepler, global loads in first-generation Maxwell are
                                 cached in L2 only, unless using the <dfn class="term">LDG</dfn> read-only
                                 data cache mechanism introduced in Kepler.
                              </p>
                              <p class="p">Local loads also are cached in L2 only, which could increase
                                 the cost of register spilling if L1 local load hit rates were
                                 high with Kepler. The balance of occupancy versus spilling
                                 should therefore be reevaluated to ensure best performance.
                                 Especially given the improvements to arithmetic latencies, code
                                 built for Maxwell may benefit from somewhat lower occupancy
                                 (due to increased registers per thread) in exchange for lower
                                 spilling.
                              </p>
                              <p class="p">The unified L1/texture cache acts as a coalescing buffer for
                                 memory accesses, gathering up the data requested by the threads
                                 of a warp prior to delivery of that data to the warp.  This
                                 function previously was served by the separate L1 cache in
                                 Fermi and Kepler.
                              </p>
                              <p class="p">Two new device attributes have been added in CUDA Toolkit
                                 6.0: <samp class="ph codeph">globalL1CacheSupported</samp> and
                                 <samp class="ph codeph">localL1CacheSupported</samp>.  Developers who wish to
                                 have separately-tuned paths for various architecture
                                 generations can use these fields to simplify the path selection
                                 process.
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="shared-memory"><a name="shared-memory" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory" name="shared-memory" shape="rect">1.4.3.&nbsp;Shared Memory</a></h3>
                        <div class="topic concept nested3" xml:lang="en-US" id="shared-memory-capacity"><a name="shared-memory-capacity" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory-capacity" name="shared-memory-capacity" shape="rect">1.4.3.1.&nbsp;Shared Memory Capacity</a></h3>
                           <div class="body conbody">
                              <p class="p">With Fermi and Kepler, shared memory and the L1 cache shared
                                 the same on-chip storage.  Maxwell, by contrast, devotes the
                                 full 64 KB per SMM to shared memory, since the functionality
                                  of the L1 and texture caches has been merged in SMM.
                              </p>
                              <div class="p">This presents several benefits to application developers:
                                 
                                 <ul class="ul">
                                    <li class="li">Algorithms with significant shared memory capacity
                                       requirements (e.g., radix sort) see an automatic 33% boost
                                       in capacity per SM on top of the aggregate boost from
                                       higher SM count.
                                    </li>
                                    <li class="li">Applications no longer need to select a preference
                                       of the L1/shared split for optimal performance.
                                       For purposes of backward compatibility with Fermi and
                                       Kepler, applications may optionally continue to specify
                                       such a preference, but the preference will be ignored
                                       on Maxwell, with the full 64 KB per SMM always going to
                                       shared memory.
                                    </li>
                                 </ul>
                              </div>
                              <div class="note note"><span class="notetitle">Note:</span> While the per-SM shared memory capacity is increased to
                                 64 KB in SMM, the per-thread-block limit remains 48 KB.  For
                                 maximum flexibility on possible future GPUs, NVIDIA recommends
                                 that applications use at most 32 KB of shared memory in any one
                                  thread block, which would, for example, allow at least two such
                                 thread blocks to fit per SMM.
                              </div>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="shared-memory-bandwidth"><a name="shared-memory-bandwidth" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shared-memory-bandwidth" name="shared-memory-bandwidth" shape="rect">1.4.3.2.&nbsp;Shared Memory Bandwidth</a></h3>
                           <div class="body conbody">
                              <p class="p">Kepler SMX introduced an optional 8-byte shared memory
                                 banking mode, which had the potential to increase shared memory
                                 bandwidth per SM over Fermi for shared memory accesses of 8 or
                                 16 bytes.  However, applications could only benefit from
                                 this when storing these larger elements in shared memory (i.e.,
                                 integers and fp32 values saw no benefit), and only when the
                                 developer explicitly opted into the 8-byte bank mode via the
                                 API.
                              </p>
                              <p class="p">To simplify this, Maxwell returns to the Fermi style of
                                 shared memory banking, where banks are always four bytes wide.
                                 Aggregate shared memory bandwidth across the chip remains
                                 comparable to that of corresponding Kepler chips, given
                                 increased SM count.  In this way, all applications using shared
                                 memory can now benefit from the higher bandwidth, even when
                                 storing only four-byte items into shared memory and without
                                 specifying any particular preference via the API.
                              </p>
                           </div>
                        </div>
                        <div class="topic concept nested3" xml:lang="en-US" id="fast-shared-memory-atomics"><a name="fast-shared-memory-atomics" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#fast-shared-memory-atomics" name="fast-shared-memory-atomics" shape="rect">1.4.3.3.&nbsp;Fast Shared Memory Atomics</a></h3>
                           <div class="body conbody">
                              <p class="p">Kepler introduced a dramatically higher throughput for
                                 atomic operations to <em class="ph i">global</em> memory as compared to Fermi.
                                 However, atomic operations to <em class="ph i">shared</em> memory remained
                                 essentially unchanged: both architectures implemented shared
                                 memory atomics using a lock/update/unlock pattern that could be
                                 expensive in the case of high contention for updates to
                                 particular locations in shared memory.
                              </p>
                              <p class="p">Maxwell improves upon this by implementing native shared
                                 memory atomic operations for 32-bit integers and native
                                 shared memory 32-bit and 64-bit compare-and-swap (CAS), which
                                 can be used to implement other atomic functions with reduced
                                 overhead compared to the Fermi and Kepler methods.
                              </p>
                              <div class="note note"><span class="notetitle">Note:</span> Refer to the <a class="xref" href="http://docs.nvidia.com/cuda/cuda-c-programming-guide/" target="_blank" shape="rect">CUDA C Programming Guide</a>
                                 for an example implementation of an fp64
                                 <samp class="ph codeph">atomicAdd()</samp> using
                                 <samp class="ph codeph">atomicCAS()</samp>.
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-US" id="dynamic-parallelism"><a name="dynamic-parallelism" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#dynamic-parallelism" name="dynamic-parallelism" shape="rect">1.4.4.&nbsp;Dynamic Parallelism</a></h3>
                        <div class="body conbody">
                           <p class="p">GK110 introduced a new architectural feature called
                              Dynamic Parallelism, which allows the GPU to create additional
                              work for itself. A programming model enhancement leveraging
                              this feature was introduced in CUDA 5.0 to enable kernels
                              running on GK110 to launch additional kernels onto the
                              same GPU.
                           </p>
                           <p class="p">SMM brings Dynamic Parallelism into the mainstream by
                              supporting it across the product line, even in lower-power
                              chips such as GM107.  This will benefit developers, as it
                              means that applications will no longer need special-case
                              algorithm implementations for high-end GPUs that differ
                              from those usable in more power-constrained environments.
                           </p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic reference nested0" id="revision-history"><a name="revision-history" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#revision-history" name="revision-history" shape="rect">A.&nbsp;Revision History</a></h2>
                  <div class="body refbody">
                     <div class="section">
                        <h2 class="title sectiontitle">Version 1.0</h2>
                        <ul class="ul">
                           <li class="li">Initial Public Release</li>
                        </ul>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="notices-header"><a name="notices-header" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#notices-header" name="notices-header" shape="rect">Notices</a></h2>
                  <div class="topic reference nested1" id="notice"><a name="notice" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#notice" name="notice" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Notice</h3>
                           <p class="p">ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS, LISTS, AND OTHER DOCUMENTS (TOGETHER AND
                              SEPARATELY, "MATERIALS") ARE BEING PROVIDED "AS IS." NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE
                              WITH RESPECT TO THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS
                              FOR A PARTICULAR PURPOSE. 
                           </p>
                           <p class="p">Information furnished is believed to be accurate and reliable. However, NVIDIA Corporation assumes no responsibility for the
                              consequences of use of such information or for any infringement of patents or other rights of third parties that may result
                               from its use. No license is granted by implication or otherwise under any patent rights of NVIDIA Corporation. Specifications
                              mentioned in this publication are subject to change without notice. This publication supersedes and replaces all other information
                              previously supplied. NVIDIA Corporation products are not authorized as critical components in life support devices or systems
                              without express written approval of NVIDIA Corporation.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="trademarks"><a name="trademarks" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#trademarks" name="trademarks" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Trademarks</h3>
                           <p class="p">NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation
                              in the U.S. and other countries.  Other company and product names may be trademarks of
                              the respective companies with which they are associated.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="copyright"><a name="copyright" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#copyright" name="copyright" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Copyright</h3>
                           <p class="p">© 2012-<span class="ph">2014</span> NVIDIA Corporation. All rights reserved.
                           </p>
                           <p class="p">This product includes software developed by the Syncro Soft SRL (http://www.sync.ro/).</p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="fn"><a name="fntarg_1" href="#fnsrc_1" shape="rect"><sup>1</sup></a>  Throughout this guide,
                  <dfn class="term">Fermi</dfn> refers to devices of compute capability 2.x,
                  <dfn class="term">Kepler</dfn> refers to devices of compute capability 3.x, and
                  <dfn class="term">Maxwell</dfn> refers to devices of compute capability
                  5.0.
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script></body>
</html>