Sophie

Sophie

distrib > Mageia > 5 > x86_64 > media > nonfree-updates > by-pkgid > fd8445e7e4d58b8cfe6e0150bd441ee1 > files > 1053

nvidia-cuda-toolkit-devel-6.5.14-6.1.mga5.nonfree.x86_64.rpm

<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="NVIDIA CUDA Compiler Driver NVCC"></meta>
      <meta name="abstract" content="The user manual for nvcc, the CUDA compiler driver."></meta>
      <meta name="description" content="The user manual for nvcc, the CUDA compiler driver."></meta>
      <meta name="DC.Coverage" content="Tools"></meta>
      <meta name="DC.subject" content="CUDA NVCC, CUDA NVCC compilers, CUDA NVCC environments, CUDA NVCC identification macro, CUDA NVCC phases, CUDA NVCC command options, CUDA NVCC profile, CUDA NVCC variables, CUDA NVCC usage, CUDA NVCC nvlink"></meta>
      <meta name="keywords" content="CUDA NVCC, CUDA NVCC compilers, CUDA NVCC environments, CUDA NVCC identification macro, CUDA NVCC phases, CUDA NVCC command options, CUDA NVCC profile, CUDA NVCC variables, CUDA NVCC usage, CUDA NVCC nvlink"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="abstract"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>NVCC :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="../common/scripts/tynt/tynt.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit
                  v6.5</a></div>
            <div class="category"><a href="index.html" title="NVCC">NVCC</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="#introduction">1.&nbsp;Introduction</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#overview">1.1.&nbsp;Overview</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#cuda-programming-model">1.1.1.&nbsp;CUDA Programming Model</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#cuda-sources">1.1.2.&nbsp;CUDA Sources</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#purpose-nvcc">1.1.3.&nbsp;Purpose of NVCC</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#supported-host-compilers">1.2.&nbsp;Supported Host Compilers</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#supported-build-environments">1.3.&nbsp;Supported Build Environments</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#compilation-phases">2.&nbsp;Compilation Phases</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#nvcc-identification-macro">2.1.&nbsp;NVCC Identification Macro</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#nvcc-phases">2.2.&nbsp;NVCC Phases</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#supported-input-file-suffixes">2.3.&nbsp;Supported Input File Suffixes</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#supported-phases">2.4.&nbsp;Supported Phases</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#supported-phase-combinations">2.5.&nbsp;Supported Phase Combinations</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#keeping-intermediate-phase-files">2.6.&nbsp;Keeping Intermediate Phase Files</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cleaning-generated-files">2.7.&nbsp;Cleaning Up Generated Files</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#use-of-platform-compiler">2.8.&nbsp;Use of Platform Compiler</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#proper-compiler-install">2.8.1.&nbsp;Proper Compiler Installations</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#non-proper-compiler-install">2.8.2.&nbsp;Non Proper Compiler Installations</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#cross-compiling-to-arm">2.9.&nbsp;cross compiling from x86 to ARMv7</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#nvcc-profile">2.10.&nbsp;nvcc.profile</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#syntax">2.10.1.&nbsp;Syntax</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#environment-variable-expansion">2.10.2.&nbsp;Environment Variable Expansion</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#here-space">2.10.3.&nbsp;_HERE_, _SPACE_</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#variables-interpreted-by-nvcc-itself">2.10.4.&nbsp;Variables Interpreted by NVCC Itself</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#example-of-profile">2.10.5.&nbsp;Example of profile</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#nvcc-command-options">3.&nbsp;NVCC Command Options</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#command-option-types-and-notation">3.1.&nbsp;Command Option Types and Notation</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#command-option-description">3.2.&nbsp;Command Option Description</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#options-for-specifying-compilation-phase">3.2.1.&nbsp;Options for Specifying the Compilation Phase</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#file-and-path-specifications">3.2.2.&nbsp;File and Path Specifications</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#options-for-altering-compiler-linker-behavior">3.2.3.&nbsp;Options for Altering Compiler/Linker Behavior</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#options-for-passing-specific-phase-options">3.2.4.&nbsp;Options for Passing Specific Phase Options</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#options-for-guiding-compiler-driver">3.2.5.&nbsp;Options for Guiding the Compiler Driver</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#options-for-steering-cuda-compilation">3.2.6.&nbsp;Options for Steering CUDA Compilation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#options-for-steering-gpu-code-generation">3.2.7.&nbsp;Options for Steering GPU Code Generation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#generic-tool-options">3.2.8.&nbsp;Generic Tool Options</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#phase-options">3.2.9.&nbsp;Phase Options</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#ptxas-options">3.2.9.1.&nbsp;Ptxas Options</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#nvlink-options">3.2.9.2.&nbsp;Nvlink Options</a></div>
                                 </li>
                              </ul>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#cuda-compilation-trajectory">4.&nbsp;The CUDA Compilation Trajectory</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#listing-and-rerunning-nvcc-steps">4.1.&nbsp;Listing and Rerunning NVCC Steps</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#full-cuda-compilation-trajectory">4.2.&nbsp;Full CUDA Compilation Trajectory</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#compilation-flow">4.2.1.&nbsp;Compilation Flow</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#cuda-frontend">4.2.2.&nbsp;CUDA Frontend</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#preprocessing">4.2.3.&nbsp;Preprocessing</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#sample-nvcc-usage">5.&nbsp;Sample NVCC Usage</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="#gpu-compilation">6.&nbsp;GPU Compilation</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#gpu-generations">6.1.&nbsp;GPU Generations</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#gpu-feature-list">6.2.&nbsp;GPU Feature List</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#application-compatibility">6.3.&nbsp;Application Compatibility</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#virtual-architectures">6.4.&nbsp;Virtual Architectures</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#virtual-architecture-feature-list">6.5.&nbsp;Virtual Architecture Feature List</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#further-mechanisms">6.6.&nbsp;Further Mechanisms</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#just-in-time-compilation">6.6.1.&nbsp;Just in Time Compilation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#fatbinaries">6.6.2.&nbsp;Fatbinaries</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#nvcc-examples">6.7.&nbsp;NVCC Examples</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#base-notation">6.7.1.&nbsp;Base Notation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#shorthand">6.7.2.&nbsp;Shorthand</a></div>
                              <ul>
                                 <li>
                                    <div class="section-link"><a href="#shorthand-1">6.7.2.1.&nbsp;Shorthand 1</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#shorthand-2">6.7.2.2.&nbsp;Shorthand 2</a></div>
                                 </li>
                                 <li>
                                    <div class="section-link"><a href="#shorthand-3">6.7.2.3.&nbsp;Shorthand 3</a></div>
                                 </li>
                              </ul>
                           </li>
                           <li>
                              <div class="section-link"><a href="#extended-notation">6.7.3.&nbsp;Extended Notation</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#virtual-architecture-identification-macro">6.7.4.&nbsp;Virtual Architecture Identification Macro</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#using-separate-compilation-in-cuda">7.&nbsp;Using Separate Compilation in CUDA</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#code-changes-for-separate-compilation">7.1.&nbsp;Code Changes for Separate Compilation</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#nvcc-options-for-separate-compilation">7.2.&nbsp;NVCC Options for Separate Compilation</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#libraries">7.3.&nbsp;Libraries</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#examples">7.4.&nbsp;Examples</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#potential-separate-compilation-issues">7.5.&nbsp;Potential Separate Compilation Issues</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#object-compatibility">7.5.1.&nbsp;Object Compatibility</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#jit-linking-not-supported">7.5.2.&nbsp;JIT Linking Support</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#implicit-cuda-host-code">7.5.3.&nbsp;Implicit CUDA Host Code</a></div>
                           </li>
                        </ul>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#miscellaneous-nvcc-usage">8.&nbsp;Miscellaneous NVCC Usage</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#printing-code-generation-statistics">8.1.&nbsp;Printing Code Generation Statistics</a></div>
                     </li>
                  </ul>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="release-info">NVCC
                  (<a href="../../pdf/CUDA_Compiler_Driver_NVCC.pdf">PDF</a>)
                  -
                  
                  v6.5
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated August 1, 2014
                  -
                  <a href="mailto:cudatools@nvidia.com?subject=CUDA%20Toolkit%20Documentation%20Feedback:%20NVCC">Send Feedback</a>
                  -
                  <span class="st_facebook"></span><span class="st_twitter"></span><span class="st_linkedin"></span><span class="st_reddit"></span><span class="st_slashdot"></span><span class="st_tumblr"></span><span class="st_sharethis"></span></div>
            </div>
            <article id="contents">
               <div class="topic nested0" id="abstract"><a name="abstract" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#abstract" name="abstract" shape="rect">NVIDIA CUDA Compiler Driver NVCC</a></h2>
                  <div class="body conbody"></div>
               </div>
               <div class="topic concept nested0" id="changes-from-previous-version"><a name="changes-from-previous-version" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#changes-from-previous-version" name="changes-from-previous-version" shape="rect">Changes from Previous Version</a></h2>
                  <div class="body conbody">
                     <ul class="ul">
                        <li class="li">New support for separate compilation.</li>
                        <li class="li">Replaced Device Code Repositories with Using Separate Compilation in CUDA.</li>
                     </ul>
                  </div>
               </div>
               <div class="topic concept nested0" id="introduction"><a name="introduction" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#introduction" name="introduction" shape="rect">1.&nbsp;Introduction</a></h2>
                  <div class="topic concept nested1" id="overview"><a name="overview" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#overview" name="overview" shape="rect">1.1.&nbsp;Overview</a></h3>
                     <div class="topic concept nested2" id="cuda-programming-model"><a name="cuda-programming-model" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#cuda-programming-model" name="cuda-programming-model" shape="rect">1.1.1.&nbsp;CUDA Programming Model</a></h3>
                        <div class="body conbody">
                           <p class="p">
                              The CUDA Toolkit targets a class of applications whose control part runs as a process on a general purpose computer (Linux,
                              Windows), 
                              and which use one or more NVIDIA GPUs as coprocessors for accelerating SIMD parallel jobs. 
                              Such jobs are self-contained, in the sense that they can be executed and 
                              completed by a batch of GPU threads entirely without intervention by the host process, 
                              thereby gaining optimal benefit from the parallel graphics hardware.
                              
                           </p>
                           <p class="p">Dispatching GPU jobs by the host process is supported by the CUDA Toolkit in the form of remote procedure calling. 
                              The GPU code is implemented as a collection of functions in a language that is essentially C, 
                              but with some annotations for distinguishing them from the host code, plus annotations for distinguishing different types
                              of data memory 
                              that exists on the GPU. Such functions may have parameters, and they can be called using a syntax that is very similar to
                              regular 
                              C function calling, but slightly extended for being able to specify the matrix of GPU threads that must execute the called
                              function. 
                              During its lifetime, the host process may dispatch many parallel GPU tasks. See <a class="xref" href="index.html#supported-build-environments__example-of-cuda-source-file" shape="rect">Figure 1</a>.  
                              
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="cuda-sources"><a name="cuda-sources" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#cuda-sources" name="cuda-sources" shape="rect">1.1.2.&nbsp;CUDA Sources</a></h3>
                        <div class="body conbody">
                           <p class="p">Hence, source files for CUDA applications consist of a mixture of conventional C++ host code, plus GPU device functions
                              (i.e., functions that execute on the GPU). The CUDA compilation trajectory separates the device functions from the host code, compiles the device functions
                              using proprietary NVIDIA compilers/assemblers, compiles the host code using a general purpose C/C++ compiler that is available
                              on the host platform, and afterwards embeds the compiled GPU functions as load images in the host object file. In the linking
                              stage, specific CUDA runtime libraries are added for supporting remote SIMD procedure calling and for providing explicit GPU
                              manipulation such as allocation of GPU memory buffers and host-GPU data transfer.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="purpose-nvcc"><a name="purpose-nvcc" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#purpose-nvcc" name="purpose-nvcc" shape="rect">1.1.3.&nbsp;Purpose of NVCC</a></h3>
                        <div class="body conbody">
                           <p class="p">This compilation trajectory involves several splitting, compilation, preprocessing, and merging steps for each CUDA source
                              file, and several of these steps are subtly different for different modes of CUDA compilation (such as compilation for device
                              emulation, or the generation of device code repositories). 
                              It is the purpose of the CUDA compiler driver <samp class="ph codeph">nvcc</samp> to hide the intricate details of CUDA compilation from developers. 
                              	Additionally, instead of being a specific CUDA compilation driver,
                              <samp class="ph codeph">nvcc</samp> mimics the behavior of the GNU compiler <samp class="ph codeph">gcc</samp>: it accepts a range of conventional compiler options, 
                              such as for defining macros and include/library paths, and for steering the compilation process. 
                              All non-CUDA compilation steps are forwarded to a general purpose C compiler that is supported by <samp class="ph codeph">nvcc</samp>,
                              and on Windows platforms, where this compiler is an instance of the Microsoft Visual Studio compiler, 
                              <samp class="ph codeph">nvcc</samp> will translate its options into appropriate <samp class="ph codeph">cl</samp> command syntax. This extended behavior plus <samp class="ph codeph">cl</samp> option 
                              translation is intended for support of portable application build and make scripts across Linux and Windows platforms.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="supported-host-compilers"><a name="supported-host-compilers" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#supported-host-compilers" name="supported-host-compilers" shape="rect">1.2.&nbsp;Supported Host Compilers</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <div class="p"><samp class="ph codeph">nvcc</samp> uses the following compilers for host code compilation:
                              
                              <dl class="dl">
                                 <dt class="dt dlterm">On Linux platforms</dt>
                                 <dd class="dd">The GNU compiler, <samp class="ph codeph">gcc</samp>, and <samp class="ph codeph">arm-linux-gnueabihf-g++</samp> for
                                    		      cross compilation to the ARMv7 architecture
                                 </dd>
                                 <dt class="dt dlterm">On Windows platforms</dt>
                                 <dd class="dd">The Microsoft Visual Studio compiler, <samp class="ph codeph">cl</samp></dd>
                              </dl>
                           </div>
                           <p class="p">On both platforms, the compiler found on the current execution search path will be used, unless <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">-compiler-bindir</samp> is specified (see <a class="xref" href="index.html#file-and-path-specifications" shape="rect">File and Path Specifications</a>).
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="supported-build-environments"><a name="supported-build-environments" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#supported-build-environments" name="supported-build-environments" shape="rect">1.3.&nbsp;Supported Build Environments</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <p class="p"><samp class="ph codeph">nvcc</samp> can be used in the following build environments: 
                           </p>
                           <dl class="dl">
                              <dt class="dt dlterm">Linux</dt>
                              <dd class="dd">Any shell</dd>
                              <dt class="dt dlterm">Windows</dt>
                              <dd class="dd">DOS shell</dd>
                              <dt class="dt dlterm">Windows</dt>
                              <dd class="dd">CygWin shells, use <samp class="ph codeph">nvcc</samp>'s drive prefix options (see <a class="xref" href="index.html#options-for-guiding-compiler-driver" shape="rect">Options for Guiding the Compiler Driver</a>).
                              </dd>
                              <dt class="dt dlterm">Windows</dt>
                              <dd class="dd">MinGW shells, use <samp class="ph codeph">nvcc</samp>'s drive prefix options (see <a class="xref" href="index.html#options-for-guiding-compiler-driver" shape="rect">Options for Guiding the Compiler Driver</a>).
                              </dd>
                           </dl>
                           <p class="p">Although a variety of POSIX style shells is supported on Windows, <samp class="ph codeph">nvcc</samp> will still assume the Microsoft Visual Studio compiler for host compilation. Use of <samp class="ph codeph">gcc</samp> is not supported on Windows.
                           </p>
                           <div class="fig fignone" id="supported-build-environments__example-of-cuda-source-file"><a name="supported-build-environments__example-of-cuda-source-file" shape="rect">
                                 <!-- --></a><span class="figcap">Figure 1. Example of CUDA Source File</span><pre xml:space="preserve">#define ACOS_TESTS      (5)
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#define ACOS_THREAD_CNT (128)</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#define ACOS_CTA_CNT    (96)</span>

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> acosParams {
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *arg;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *res;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> n;
};

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> acos_main (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> acosParams parms)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> totalThreads = gridDim.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> ctaStart = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockIdx</span>.x;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (i = ctaStart + <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x; i &lt; parms.n; i += totalThreads) {
        parms.res[i] = acosf(parms.arg[i]);
    }
}

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> main (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> argc, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">char</span> *argv[])
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">volatile</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> acosRef;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* acosRes = 0;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* acosArg = 0;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* arg = 0;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>* res = 0;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> t;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> acosParams funcParams;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> errors;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i;

    cudaMalloc ((<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> **)&amp;acosArg, ACOS_TESTS * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>));
    cudaMalloc ((<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> **)&amp;acosRes, ACOS_TESTS * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span>));
    
    arg = (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *) malloc (ACOS_TESTS * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(arg[0]));
    res = (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">float</span> *) malloc (ACOS_TESTS * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(res[0]));

    cudaMemcpy (acosArg, arg, ACOS_TESTS * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(arg[0]), 
                cudaMemcpyHostToDevice);
    
    funcParams.res = acosRes;
    funcParams.arg = acosArg;
    funcParams.n = opts.n;

    acos_main<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>ACOS_CTA_CNT,ACOS_THREAD_CNT<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>(funcParams);

    cudaMemcpy (res, acosRes, ACOS_TESTS * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(res[0]), 
                cudaMemcpyDeviceToHost);</pre></div>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="compilation-phases"><a name="compilation-phases" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#compilation-phases" name="compilation-phases" shape="rect">2.&nbsp;Compilation Phases</a></h2>
                  <div class="topic concept nested1" id="nvcc-identification-macro"><a name="nvcc-identification-macro" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvcc-identification-macro" name="nvcc-identification-macro" shape="rect">2.1.&nbsp;NVCC Identification Macro</a></h3>
                     <div class="body conbody">
                        <div class="p"><samp class="ph codeph">nvcc</samp> predefines the following macros:
                           
                           <ul class="ul">
                              <li class="li"><samp class="ph codeph">__NVCC__</samp> : Defined when compiling C/C++/CUDA source files 
                              </li>
                              <li class="li"><samp class="ph codeph">__CUDACC__</samp> :  Defined when compiling CUDA source files 
                              </li>
                              <li class="li"><samp class="ph codeph">__CUDACC_RDC__</samp> : Defined when compiling CUDA source files in relocatable device code mode (see <a class="xref" href="index.html#nvcc-options-for-separate-compilation" shape="rect">NVCC Options for Separate Compilation</a>).  
                              </li>
                           </ul>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="nvcc-phases"><a name="nvcc-phases" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvcc-phases" name="nvcc-phases" shape="rect">2.2.&nbsp;NVCC Phases</a></h3>
                     <div class="body conbody">
                        <p class="p">A compilation phase is a logical translation step that can be selected by command line
                           options to <samp class="ph codeph">nvcc</samp>. A single compilation phase can still be broken up by <samp class="ph codeph">nvcc</samp> into smaller steps,
                           but these smaller steps are <em class="ph i">just</em> implementations of the phase: they depend on seemingly
                           arbitrary capabilities of the internal tools that <samp class="ph codeph">nvcc</samp> uses, and all of these internals may
                           change with a new release of the CUDA Toolkit. Hence, only compilation phases are stable across
                           releases, and although <samp class="ph codeph">nvcc</samp> provides options to display the compilation steps that it
                           executes, these are for debugging purposes only and must not be copied into or used in build
                           scripts.
                        </p>
                        <p class="p"><samp class="ph codeph">nvcc</samp> phases are <em class="ph i">selected</em> by a combination of command line options and input file name
                           suffixes, and the execution of these phases may be <dfn class="term">modified</dfn> by other command line
                           options. In phase selection, the <em class="ph i">input</em> file suffix defines the phase input, while the
                           command line option defines the required <em class="ph i">output</em> of the phase.
                        </p>
                        <p class="p">The following paragraphs will list the recognized file name suffixes and the supported
                           compilation phases. A full explanation of the <samp class="ph codeph">nvcc</samp> command line options can be found in the
                           next chapter.
                        </p>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="supported-input-file-suffixes"><a name="supported-input-file-suffixes" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#supported-input-file-suffixes" name="supported-input-file-suffixes" shape="rect">2.3.&nbsp;Supported Input File Suffixes</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <p class="p">The following table defines how <samp class="ph codeph">nvcc</samp> interprets its input files:
                           </p>
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.cu</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> CUDA source file, containing host code and device functions </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.cup</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"><em class="ph i">Preprocessed</em> CUDA source file, containing host code and device functions
                                          
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.c</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1">
                                          C source file 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.cc</samp>, <samp class="ph codeph">.cxx</samp>, <samp class="ph codeph">.cpp</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> C++ source file </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.gpu</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> GPU intermediate file (see <a class="xref" href="index.html#full-cuda-compilation-trajectory__cuda-compilation-from-cu-to-cu-cpp-ii" shape="rect">Figure 2</a>) 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.ptx</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> PTX intermediate assembly file (see <a class="xref" href="index.html#full-cuda-compilation-trajectory__cuda-compilation-from-cu-to-cu-cpp-ii" shape="rect">Figure 2</a>) 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.o</samp>, <samp class="ph codeph">.obj</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> Object file </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.a</samp>, <samp class="ph codeph">.lib</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> Library file </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.res</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> Resource file </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">.so</samp></td>
                                       <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1"> Shared object file </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                           <div class="p"><strong class="ph b">Notes:</strong><ul class="ul">
                                 <li class="li"><samp class="ph codeph">nvcc</samp> does not make any distinction between object, library or resource files. It just passes files of these types to the linker
                                    when the linking phase is executed.
                                 </li>
                                 <li class="li"><samp class="ph codeph">nvcc</samp> deviates from gcc behavior with respect to files whose suffixes are <em class="ph i">unknown</em> (i.e., that do not occur in the above table): instead of assuming that these files must be linker input, <samp class="ph codeph">nvcc</samp> will generate an error.
                                 </li>
                              </ul>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="supported-phases"><a name="supported-phases" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#supported-phases" name="supported-phases" shape="rect">2.4.&nbsp;Supported Phases</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <p class="p">The following table specifies the supported compilation phases, plus the option to <samp class="ph codeph">nvcc</samp> that enables execution of this phase. It also lists the default name of the output file generated by this phase, which will
                              take effect when no explicit output file name is specified using option <samp class="ph codeph">-o</samp>:
                           </p>
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> CUDA compilation to C/C++ source file </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cuda</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"><samp class="ph codeph">.cpp.ii</samp> appended to source file name, as in <samp class="ph codeph">x.cu.cpp.ii</samp>. This output file can be compiled by the host compiler that was used by <samp class="ph codeph">nvcc</samp> to preprocess the <samp class="ph codeph">.cu</samp> file 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> C/C++ preprocessing</td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-E</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> &lt; result on standard output &gt; </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> C/C++ compilation to object file </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-c</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">o</samp> on Linux,&nbsp; or <samp class="ph codeph">obj</samp> on Windows
                                          
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Cubin generation from CUDA source files </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cubin</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">cubin</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Cubin generation from <samp class="ph codeph">.gpu</samp> intermediate files 
                                       </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cubin</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">cubin</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Cubin generation from <samp class="ph codeph">ptx</samp> intermediate files. 
                                       </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cubin</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">cubin</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> PTX generation from CUDA source files </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ptx</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">ptx</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> PTX generation from <samp class="ph codeph">.gpu</samp> intermediate files 
                                       </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ptx</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">ptx</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Fatbin generation from source, <samp class="ph codeph">ptx</samp> or <samp class="ph codeph">cubin</samp> files 
                                       </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-fatbin</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">fatbin</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> GPU generation from CUDA source files </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-gpu</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Source file name with suffix replaced by <samp class="ph codeph">gpu</samp></td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Linking an executable, or <samp class="ph codeph">dll</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"> &lt; no phase option &gt; </td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"><samp class="ph codeph">a.out</samp> on Linux, or <samp class="ph codeph">a.exe</samp> on Windows 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Constructing an object file archive, or library </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-lib</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"><samp class="ph codeph">a.a</samp> on Linux, or <samp class="ph codeph">a.lib</samp> on Windows 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">make</samp> dependency generation 
                                       </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-M</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> &lt; result on standard output &gt; </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"> Running an executable </td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-run</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"><samp class="ph codeph">-</samp></td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                           <div class="p"><strong class="ph b">Notes:</strong><ul class="ul">
                                 <li class="li">The last phase in this list is more of a convenience phase. It allows running the compiled and linked executable without having
                                    to explicitly set the library path to the CUDA dynamic libraries. Running using <samp class="ph codeph">nvcc</samp> will automatically set the environment variables that are specified in <samp class="ph codeph">nvcc.profile</samp> (see <a class="xref" href="index.html#environment-variable-expansion" shape="rect">Environment Variable Expansion</a>) prior to starting the executable.
                                 </li>
                                 <li class="li">Files with extension <samp class="ph codeph">.cup</samp> are assumed to be the result of preprocessing CUDA source files, by <samp class="ph codeph">nvcc</samp> commands such as <samp class="ph codeph">nvcc -E x.cu -o x.cup</samp>, or <samp class="ph codeph">nvcc -E x.cu &gt; x.cup</samp>. Similar to regular compiler distributions, such as Microsoft Visual Studio or <samp class="ph codeph">gcc</samp>, preprocessed source files are the best format to include in compiler bug reports. They are most likely to contain all information
                                    necessary for reproducing the bug.
                                 </li>
                              </ul>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="supported-phase-combinations"><a name="supported-phase-combinations" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#supported-phase-combinations" name="supported-phase-combinations" shape="rect">2.5.&nbsp;Supported Phase Combinations</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <div class="p">The following phase combinations are supported by <samp class="ph codeph">nvcc</samp>:
                              <ul class="ul">
                                 <li class="li">CUDA compilation to object file. This is a combination of CUDA Compilation and C compilation, and is invoked by option <samp class="ph codeph">-c</samp>.
                                 </li>
                                 <li class="li">Preprocessing is usually implicitly performed as first step in compilation phases</li>
                                 <li class="li">Unless a phase option is specified, <samp class="ph codeph">nvcc</samp> will compile and link all its input files
                                 </li>
                                 <li class="li">When <samp class="ph codeph">-lib</samp> is specified, <samp class="ph codeph">nvcc</samp> will compile all its input files, and store the resulting object files into the specified <samp class="ph codeph">archive/library</samp>.
                                 </li>
                              </ul>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="keeping-intermediate-phase-files"><a name="keeping-intermediate-phase-files" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#keeping-intermediate-phase-files" name="keeping-intermediate-phase-files" shape="rect">2.6.&nbsp;Keeping Intermediate Phase Files</a></h3>
                     <div class="body conbody">
                        <p class="p"><samp class="ph codeph">nvcc</samp> will store intermediate results by default into temporary files that are deleted immediately before <samp class="ph codeph">nvcc</samp> completes. The locations of the temporary file directories that are used, depending on the current platform, are as follows:
                        </p>
                        <dl class="dl">
                           <dt class="dt dlterm">Windows temp directory</dt>
                           <dd class="dd">Value of environment variable <samp class="ph codeph">TEMP</samp>, or <samp class="ph codeph">c:/Windows/temp</samp></dd>
                           <dt class="dt dlterm">Linux temp directory</dt>
                           <dd class="dd"><samp class="ph codeph">/tmp</samp></dd>
                        </dl>
                        <p class="p">Options <samp class="ph codeph">-keep</samp> or <samp class="ph codeph">-save-temps</samp> (these options are equivalent) will instead store these intermediate files in the current directory, with names as described
                           in <a class="xref" href="index.html#supported-phases" shape="rect">Supported Phases</a>.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="cleaning-generated-files"><a name="cleaning-generated-files" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cleaning-generated-files" name="cleaning-generated-files" shape="rect">2.7.&nbsp;Cleaning Up Generated Files</a></h3>
                     <div class="body conbody">
                        <p class="p">All files generated by a particular nvcc command can be cleaned up by repeating the command, but with additional option <samp class="ph codeph">-clean</samp>. This option is particularly useful after using <samp class="ph codeph">-keep</samp>, because the keep option usually leaves quite a number of intermediate files around.
                        </p>
                        <p class="p">Because using <samp class="ph codeph">-clean</samp> will remove exactly what the original nvcc command created, it is important to exactly repeat all of the options in the original
                           command. For instance, in the following example, omitting <samp class="ph codeph">-keep</samp>, or adding <samp class="ph codeph">-c</samp> will have different cleanup effects.
                        </p><pre xml:space="preserve">nvcc acos.cu -keep

nvcc acos.cu -keep -clean</pre></div>
                  </div>
                  <div class="topic concept nested1" id="use-of-platform-compiler"><a name="use-of-platform-compiler" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#use-of-platform-compiler" name="use-of-platform-compiler" shape="rect">2.8.&nbsp;Use of Platform Compiler</a></h3>
                     <div class="body conbody">
                        <div class="p">A general purpose C compiler is needed by <samp class="ph codeph">nvcc</samp> in the following situations:
                           
                           <ul class="ul">
                              <li class="li">During non-CUDA phases (except the run phase), because these phases will be forwarded by <samp class="ph codeph">nvcc</samp> to this compiler
                              </li>
                              <li class="li">During CUDA phases, for several preprocessing stages (see also <a class="xref" href="index.html#cuda-compilation-trajectory" shape="rect">The CUDA Compilation Trajectory</a>).
                              </li>
                           </ul>
                        </div>
                        <p class="p">On Linux platforms, the compiler is assumed to be <samp class="ph codeph">gcc</samp>, or <samp class="ph codeph">g++</samp> for linking.
                           On Windows platforms, the compiler is assumed to be <samp class="ph codeph">cl</samp>. The compiler executables are expected
                           to be in the current executable search path, unless option <samp class="ph codeph">--compiler-bindir</samp> is specified, in
                           which case the value of this option must be the name of the directory in which these compiler executables reside.
                           This option is used for cross compilation to the ARMv7 architecture as well, where the underlying host compiler
                           is required to be a gcc compiler, capable of generating ARMv7 code.
                        </p>
                     </div>
                     <div class="topic concept nested2" id="proper-compiler-install"><a name="proper-compiler-install" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#proper-compiler-install" name="proper-compiler-install" shape="rect">2.8.1.&nbsp;Proper Compiler Installations</a></h3>
                        <div class="body conbody">
                           <p class="p">On both Linux and Windows, <em class="ph i">properly</em> installed compilers have some form of <em class="ph i">internal knowledge</em> that enables them to locate system include files, system libraries and dlls, include files and libraries related to the compiler
                              installation itself, and include files and libraries that implement <samp class="ph codeph">libc</samp> and <samp class="ph codeph">libc++</samp>. 
                           </p>
                           <p class="p">A properly installed <samp class="ph codeph">gcc</samp> compiler has this knowledge built in, while a properly installed Microsoft Visual Studio compiler has this knowledge available
                              in a batch script <samp class="ph codeph">vsvars.bat</samp>, at a known place in its installation tree. This script must be executed prior to running the <samp class="ph codeph">cl</samp> compiler, in order to place the correct settings into specific environment variables that the <samp class="ph codeph">cl</samp> compiler recognizes.
                           </p>
                           <p class="p">On Windows platforms, <samp class="ph codeph">nvcc</samp> will locate <samp class="ph codeph">vsvars.bat</samp> via the specified <samp class="ph codeph">--compiler-bindir</samp> and execute it so that these environment variables become available.
                           </p>
                           <p class="p">On Linux platforms, <samp class="ph codeph">nvcc</samp> will always assume that the compiler is properly installed.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="non-proper-compiler-install"><a name="non-proper-compiler-install" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#non-proper-compiler-install" name="non-proper-compiler-install" shape="rect">2.8.2.&nbsp;Non Proper Compiler Installations</a></h3>
                        <div class="body conbody">
                           <p class="p">The platform compiler can still be <em class="ph i">improperly</em> used, but in this case the user of <samp class="ph codeph">nvcc</samp> is responsible for explicitly providing the correct include and library paths on the <samp class="ph codeph">nvcc</samp> command line. Especially when using <samp class="ph codeph">gcc</samp> compilers, this requires intimate knowledge of <samp class="ph codeph">gcc</samp> and Linux system issues, and these may vary over different <samp class="ph codeph">gcc</samp> distributions. Therefore, this practice is not recommended.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="cross-compiling-to-arm"><a name="cross-compiling-to-arm" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#cross-compiling-to-arm" name="cross-compiling-to-arm" shape="rect">2.9.&nbsp;Cross Compiling from x86 to ARMv7</a></h3>
                     <div class="body conbody">
                        <div class="p">Cross compiling to the ARMv7 architecture is controlled
                           by using the following nvcc command line options:
                           
                           <ul class="ul">
                              <li class="li"><samp class="ph codeph">-target-cpu-arch ARM</samp>. This option signals cross compilation to ARM.
                              </li>
                              <li class="li"><samp class="ph codeph">-ccbin &lt;arm-cross-compiler&gt;</samp>. This sets the host compiler
                                 	           with which nvcc cross compiles the host.
                              </li>
                              <li class="li"><samp class="ph codeph">-m32</samp>. This option signals that the target platform is a 32-bit platform.
                                 	          Use this when the host platform is a 64-bit x86 platform.
                              </li>
                           </ul>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="nvcc-profile"><a name="nvcc-profile" shape="rect">
                        <!-- --></a><h3 class="title topictitle2 preserve-case"><a href="#nvcc-profile" name="nvcc-profile" shape="rect">2.10.&nbsp;nvcc.profile</a></h3>
                     <div class="body conbody">
                        <p class="p"><samp class="ph codeph">nvcc</samp> expects a configuration file <samp class="ph codeph">nvcc.profile</samp> in the directory where the <samp class="ph codeph">nvcc</samp> executable itself resides. 
                           	This profile contains a sequence of assignments to environment variables which are necessary for correct execution of executables
                           that <samp class="ph codeph">nvcc</samp> invokes. 
                           	Typically, this means extending the variables <samp class="ph codeph">PATH</samp> and <samp class="ph codeph">LD_LIBRARY_PATH</samp> with the bin and lib directories in the CUDA Toolkit installation.
                           		  
                        </p>
                        <p class="p">The single purpose of <samp class="ph codeph">nvcc.profile</samp> is to define the directory structure of the CUDA release tree to <samp class="ph codeph">nvcc</samp>. It is not intended as a configuration file for <samp class="ph codeph">nvcc</samp> users.
                        </p>
                     </div>
                     <div class="topic concept nested2" id="syntax"><a name="syntax" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#syntax" name="syntax" shape="rect">2.10.1.&nbsp;Syntax</a></h3>
                        <div class="body conbody">
                           <p class="p">Lines containing all spaces, or lines that start with zero or more spaces followed by a <samp class="ph codeph">#</samp> character are considered comment lines. All other lines in <samp class="ph codeph">nvcc.profile</samp> must have settings of one of the following forms:
                           </p><pre xml:space="preserve"><em class="ph i">name</em> = &lt;text&gt;
<em class="ph i">name</em> ?= &lt;text&gt;
<em class="ph i">name</em> += &lt;text&gt;
<em class="ph i">name</em> =+ &lt;text&gt;</pre><p class="p">Each of these four forms will cause an assignment to environment variable <samp class="ph codeph"><em class="ph i">name</em></samp>: the specified text string will be macro-expanded (see <a class="xref" href="index.html#environment-variable-expansion" shape="rect">Environment Variable Expansion</a>) and assigned (<samp class="ph codeph">=</samp>), or conditionally assigned (<samp class="ph codeph">?=</samp>), or prepended (<samp class="ph codeph">+=</samp>), or appended (<samp class="ph codeph">=+</samp>).
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="environment-variable-expansion"><a name="environment-variable-expansion" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#environment-variable-expansion" name="environment-variable-expansion" shape="rect">2.10.2.&nbsp;Environment Variable Expansion</a></h3>
                        <div class="body conbody">
                           <p class="p">The assigned text strings may refer to the current value of environment variables using either of the following syntaxes:</p>
                           <dl class="dl">
                              <dt class="dt dlterm"><samp class="ph codeph">%name%</samp></dt>
                              <dd class="dd">DOS style</dd>
                              <dt class="dt dlterm"><samp class="ph codeph">$(name)</samp></dt>
                              <dd class="dd"><samp class="ph codeph">make</samp> style
                              </dd>
                           </dl>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="here-space"><a name="here-space" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#here-space" name="here-space" shape="rect">2.10.3.&nbsp;_HERE_, _SPACE_</a></h3>
                        <div class="body conbody">
                           <p class="p">Prior to evaluating <samp class="ph codeph">nvcc.profile</samp>, <samp class="ph codeph">nvcc</samp> defines <samp class="ph codeph">_HERE_</samp> to be the directory path in which the profile file was found. Depending on how nvcc was invoked, this may be an absolute path
                              or a relative path. 
                              	  
                           </p>
                           <p class="p">Similarly, <samp class="ph codeph">nvcc</samp> will assign a single space string to <samp class="ph codeph">_SPACE_</samp>. This variable can be used to enforce separation in profile lines such as:
                           </p><pre xml:space="preserve">INCLUDES += -I../common $(_SPACE_)</pre><p class="p">Omitting the <samp class="ph codeph">_SPACE_</samp> could cause <em class="ph i">glueing</em> effects such as <samp class="ph codeph">-I../common-Iapps</samp> with previous values of <samp class="ph codeph">INCLUDES</samp>.
                           </p>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="variables-interpreted-by-nvcc-itself"><a name="variables-interpreted-by-nvcc-itself" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#variables-interpreted-by-nvcc-itself" name="variables-interpreted-by-nvcc-itself" shape="rect">2.10.4.&nbsp;Variables Interpreted by NVCC Itself</a></h3>
                        <div class="body refbody">
                           <div class="section refsyn">
                              <p class="p">The following variables are used by <samp class="ph codeph">nvcc</samp> itself:
                              </p>
                              <div class="tablenoborder">
                                 <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                    <tbody class="tbody">
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">compiler-bindir</samp></td>
                                          <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1">The default value of the directory in which the host compiler resides (see <a class="xref" href="index.html#supported-host-compilers" shape="rect">Supported Host Compilers</a>). This value can still be overridden by command line option <samp class="ph codeph">--compiler-bindir</samp>.</td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">INCLUDES</samp></td>
                                          <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1">This string extends the value of <samp class="ph codeph">nvcc</samp> command option <samp class="ph codeph">-Xcompiler</samp>. It is intended for defining additional include paths. It is in actual compiler option syntax, i.e., <samp class="ph codeph">gcc</samp> syntax on Linux and <samp class="ph codeph">cl</samp> syntax on Windows. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">LIBRARIES</samp></td>
                                          <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1">This string extends the value of <samp class="ph codeph">nvcc</samp> command option <samp class="ph codeph">-Xlinker</samp>. It is intended for defining additional libraries and library search paths. It is in actual compiler option syntax, i.e.,
                                             <samp class="ph codeph">gcc</samp> syntax on Linux and <samp class="ph codeph">cl</samp> syntax on Windows.
                                             
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">PTXAS_FLAGS</samp></td>
                                          <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1">This string extends the value of <samp class="ph codeph">nvcc</samp> command option <samp class="ph codeph">-Xptxas</samp>. It is intended for passing optimization options to the CUDA internal tool <samp class="ph codeph">ptxas</samp>.
                                             
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">OPENCC_FLAGS</samp></td>
                                          <td class="entry" valign="top" width="66.66666666666666%" rowspan="1" colspan="1">This string extends the value of <samp class="ph codeph">nvcc</samp> command line option <samp class="ph codeph">-Xopencc</samp>. It is intended to pass optimization options to the CUDA internal tool <samp class="ph codeph">nvopencc</samp>. 
                                          </td>
                                       </tr>
                                    </tbody>
                                 </table>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="example-of-profile"><a name="example-of-profile" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#example-of-profile" name="example-of-profile" shape="rect">2.10.5.&nbsp;Example of profile</a></h3>
                        <div class="body conbody"><pre xml:space="preserve">#
# nvcc and nvcc.profile are in the bin directory of the
# cuda installation tree. Hence, this installation tree
# is ‘one up’:
#
TOP           = $(_HERE_)/..

#
# Define the cuda include directories: 
#
INCLUDES	 +=  -I$(TOP)/include -I$(TOP)/include/cudart $(_SPACE_)

#
# Extend dll search path to find cudart.dll and cuda.dll
# and add these two libraries to the link line
#
PATH         += $(TOP)/lib;
LIBRARIES    =+ $(_SPACE_) -L$(TOP)/lib -lcuda -lcudart
#
# Extend the executable search path to find the
# cuda internal tools:
#
PATH         += $(TOP)/open64/bin:$(TOP)/bin:

#
# Location of Microsoft Visual Studio compiler
#
compiler-bindir  = c:/mvs/bin

#
# No special optimization flags for device code compilation:
#
PTXAS_FLAGS    += </pre></div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="nvcc-command-options"><a name="nvcc-command-options" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#nvcc-command-options" name="nvcc-command-options" shape="rect">3.&nbsp;NVCC Command Options</a></h2>
                  <div class="topic concept nested1" id="command-option-types-and-notation"><a name="command-option-types-and-notation" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#command-option-types-and-notation" name="command-option-types-and-notation" shape="rect">3.1.&nbsp;Command Option Types and Notation</a></h3>
                     <div class="body conbody">
                        <p class="p"><samp class="ph codeph">nvcc</samp> recognizes three types of command options: boolean (flag-) options, single value options, and list (multivalued-) options.
                        </p>
                        <p class="p">Boolean options do not have an argument: they are either specified on a command line or not. Single value options must be
                           specified at most once, and list (multivalued-) options may be repeated. Examples of each of these option types are, respectively:
                           <samp class="ph codeph">-v</samp> (switch to verbose mode), <samp class="ph codeph">-o</samp> (specify output file), and <samp class="ph codeph">-I</samp> (specify include path).
                        </p>
                        <p class="p">Single value options and list options must have arguments, which must follow the name of the option itself by either one or
                           more spaces or an equals character. In some cases of compatibility with gcc (such as <samp class="ph codeph">-I</samp>, <samp class="ph codeph">-l</samp>, and <samp class="ph codeph">-L</samp>), the value of the option may also immediately follow the option itself, without being separated by spaces. The individual
                           values of multivalued options may be separated by commas in a single instance of the option, or the option may be repeated,
                           or any combination of these two cases. 
                        </p>
                        <p class="p">Hence, for the two sample options mentioned above that may take values, the following notations are legal:</p><pre xml:space="preserve">-o file
-o=file
-Idir1,dir2 -I=dir3 -I dir4,dir5</pre><p class="p">The option type in the tables in the remainder of this section can be recognized as follows: boolean options do not have arguments
                           specified in the first column, while the other two types do. List options can be recognized by the repeat indicator <samp class="ph codeph">,...</samp> at the end of the argument.
                        </p>
                        <p class="p">Each option has a long name and a short name, which are interchangeable with each other. These two variants are distinguished
                           by the number of hyphens that must precede the option name: long names must be preceded by two hyphens, while short names
                           must be preceded by a single hyphen.  An example of this is the long alias of <samp class="ph codeph">-I</samp>, which is <samp class="ph codeph">--include-path</samp>.
                        </p>
                        <p class="p">Long options are intended for use in build scripts, where size of the option is less important than descriptive value. In
                           contrast, short options are intended for interactive use.  For <samp class="ph codeph">nvcc</samp>, this distinction may be of dubious value, because many of its options are well known compiler driver options, and the names
                           of many other single-hyphen options were already chosen before <samp class="ph codeph">nvcc</samp> was developed (and not especially short). However, the distinction is a useful convention, and the <em class="ph i">short</em> option names may be shortened in future releases of the CUDA Toolkit.
                        </p>
                        <p class="p">Long options are described in the first columns of the options tables, and short options occupy the second columns.</p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="command-option-description"><a name="command-option-description" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#command-option-description" name="command-option-description" shape="rect">3.2.&nbsp;Command Option Description</a></h3>
                     <div class="topic reference nested2" id="options-for-specifying-compilation-phase"><a name="options-for-specifying-compilation-phase" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#options-for-specifying-compilation-phase" name="options-for-specifying-compilation-phase" shape="rect">3.2.1.&nbsp;Options for Specifying the Compilation Phase</a></h3>
                        <div class="body refbody">
                           <div class="section refsyn">
                              <p class="p">Options of this category specify up to which stage the input files must be compiled.</p>
                              <div class="tablenoborder">
                                 <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                    <tbody class="tbody">
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--cuda</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cuda</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile all <samp class="ph codeph">.cu</samp> input files to <samp class="ph codeph">.cu.cpp.ii</samp> output. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--cubin</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cubin</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile all <samp class="ph codeph">.cu</samp>/<samp class="ph codeph">.gpu</samp>/<samp class="ph codeph">.ptx</samp> input files to device-only <samp class="ph codeph">.cubin</samp> files. This step discards the host code for each <samp class="ph codeph">.cu</samp> input file. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--ptx</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ptx</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile all <samp class="ph codeph">.cu</samp>/<samp class="ph codeph">.gpu</samp> input files to device-only <samp class="ph codeph">.ptx</samp> files. This step discards the host code for each <samp class="ph codeph">.cu</samp> input file. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--gpu</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-gpu</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile all <samp class="ph codeph">.cu</samp> input files to device-only <samp class="ph codeph">.gpu</samp> files. This step discards the host code for each <samp class="ph codeph">.cu</samp> input file. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--fatbin</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-fatbin</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile all <samp class="ph codeph">.cu</samp>/<samp class="ph codeph">.gpu</samp>/<samp class="ph codeph">.ptx</samp>/<samp class="ph codeph">.cubin</samp> input files to device-only <samp class="ph codeph">.fatbin</samp> files. This step discards the host code for each <samp class="ph codeph">.cu</samp> input file. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--preprocess</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-E</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Preprocess all <samp class="ph codeph">.c</samp>/<samp class="ph codeph">.cc</samp>/<samp class="ph codeph">.cpp</samp>/<samp class="ph codeph">.cxx</samp>/<samp class="ph codeph">.cu</samp> input files. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--generate-dependencies</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-M</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Generate a dependency file that can be included in a <samp class="ph codeph">make</samp> file for the one <samp class="ph codeph">.c</samp>/<samp class="ph codeph">.cc</samp>/<samp class="ph codeph">.cpp</samp>/<samp class="ph codeph">.cxx</samp>/<samp class="ph codeph">.cu</samp> input file (more than one input file is not allowed in this step). 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--compile</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-c</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile each <samp class="ph codeph">.c</samp>/<samp class="ph codeph">.cc</samp>/<samp class="ph codeph">.cpp</samp>/<samp class="ph codeph">.cxx</samp>/<samp class="ph codeph">.cu</samp> input file into an object file. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--link</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-link</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">This option specifies the default behavior: compile and link all inputs. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--lib</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-lib</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Compile all input files into object files (if necessary), and add the results to the specified library output file. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--run</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-run</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">This option compiles and links all inputs into an executable, and executes it.  Or, when the input is a single executable,
                                             it is executed without any compilation.  This step is intended for developers who do not want to be bothered with setting
                                             the necessary CUDA dll search paths (these will be set temporarily by nvcc according to the definitions in <samp class="ph codeph">nvcc.profile</samp>). 
                                          </td>
                                       </tr>
                                    </tbody>
                                 </table>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="file-and-path-specifications"><a name="file-and-path-specifications" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#file-and-path-specifications" name="file-and-path-specifications" shape="rect">3.2.2.&nbsp;File and Path Specifications</a></h3>
                        <div class="body refbody">
                           <div class="section refsyn">
                              <div class="tablenoborder">
                                 <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                    <tbody class="tbody">
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--x <em class="ph i">language</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-x</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                             <p class="p">Explicitly specify the language for the input files, rather
                                                than letting the compiler choose a default based on the file
                                                name suffix.
                                             </p>
                                             <p class="p">Allowed values for this option:
                                                <samp class="ph codeph">c</samp>, <samp class="ph codeph">c++</samp>, <samp class="ph codeph">cu</samp>.
                                             </p>
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--output-file <em class="ph i">file</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-o</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify name and location of the output file. Only a single input file is allowed when this option is present in <samp class="ph codeph">nvcc</samp> non-linking/archiving mode. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--pre-include <em class="ph i">include-file</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-include</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify header files that must be preincluded during preprocessing or compilation. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--library <em class="ph i">library-file</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-l</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify libraries to be used in the linking stage without the library file extension. The libraries are searched for on the
                                             library search paths that have been specified using option <samp class="ph codeph">-L</samp> (see <a class="xref" href="index.html#libraries" shape="rect">Libraries</a>). 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--define-macro <em class="ph i">macrodef</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-D</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify macro definitions for use during preprocessing or compilation.</td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--undefine-macro <em class="ph i">macrodef</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-U</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Undefine a macro definition.</td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--include-path <em class="ph i">include-path</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-I</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify include search paths. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--system-include <em class="ph i">include-path</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-isystem</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify system include search paths. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--library-path <em class="ph i">library-path</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-L</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify library search paths (see <a class="xref" href="index.html#libraries" shape="rect">Libraries</a>). 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--output-directory <em class="ph i">directory</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-odir</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify the directory of the output file. This option is intended for letting the dependency generation step (<samp class="ph codeph">--generate-dependencies</samp>) generate a rule that defines the target object file in the proper directory. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--compiler-bindir <em class="ph i">directory</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ccbin</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify the directory in which the host compiler executable (Microsoft Visual Studio <samp class="ph codeph">cl</samp>, or a <samp class="ph codeph">gcc</samp> derivative) resides. By default, this executable is expected in the current executable search path. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--cudart <em class="ph i">CUDA-runtime-library</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-cudart</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                             <p class="p">Specify the type of CUDA runtime library to be used: static CUDA runtime library, shared/dynamic CUDA runtime library, or
                                                no CUDA runtime library. By default, the static CUDA runtime library is used.
                                             </p>
                                             <p class="p">Allowed values for this option: <samp class="ph codeph">static</samp>, <samp class="ph codeph">shared</samp>, <samp class="ph codeph">none</samp>.</p>
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--libdevice-directory <em class="ph i">directory</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ldir</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify the directory that contains the libdevice library files when option <samp class="ph codeph">--dont-use-profile</samp> is used. Libdevice library files are located in the <samp class="ph codeph">nvvm/libdevice</samp> directory in the CUDA toolkit. 
                                          </td>
                                       </tr>
                                    </tbody>
                                 </table>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="options-for-altering-compiler-linker-behavior"><a name="options-for-altering-compiler-linker-behavior" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#options-for-altering-compiler-linker-behavior" name="options-for-altering-compiler-linker-behavior" shape="rect">3.2.3.&nbsp;Options for Altering Compiler/Linker Behavior</a></h3>
                        <div class="body refbody">
                           <div class="tablenoborder"><a name="options-for-altering-compiler-linker-behavior__table_g1x_mml_1g" shape="rect">
                                 <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="options-for-altering-compiler-linker-behavior__table_g1x_mml_1g" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--profile</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-pg</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Instrument generated code/executable for use by <samp class="ph codeph">gprof</samp> (Linux only). 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--debug <em class="ph i">level</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-g</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Generate debuggable code. </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--device-debug</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-G</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Generate debuggable device code. </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--generate-line-info</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-lineinfo</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Generate line-number information for device code. </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--optimize <em class="ph i">level</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-O</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Generate optimized code. </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--shared</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-shared</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Generate a shared library during linking. <strong class="ph b">Note:</strong> when other linker options are required for controlling <samp class="ph codeph">dll</samp> generation, use option <samp class="ph codeph">-Xlinker</samp>. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--machine</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-m</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Specify 32-bit vs. 64-bit architecture. </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="options-for-passing-specific-phase-options"><a name="options-for-passing-specific-phase-options" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#options-for-passing-specific-phase-options" name="options-for-passing-specific-phase-options" shape="rect">3.2.4.&nbsp;Options for Passing Specific Phase Options</a></h3>
                        <div class="body refbody">
                           <div class="section refsyn">
                              <p class="p">These allow for passing specific options directly to the internal compilation tools that
                                 <samp class="ph codeph">nvcc</samp> encapsulates, without burdening <samp class="ph codeph">nvcc</samp> with overly detailed
                                 knowledge of these tools. A table of useful sub-tool options can be found at the end of this
                                 chapter.
                                 
                              </p>
                              <div class="tablenoborder">
                                 <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                    <tbody class="tbody">
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--compiler-options <em class="ph i">options</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Xcompiler</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Specify options directly to the compiler/preprocessor. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--linker-options <em class="ph i">options</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Xlinker</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Specify options directly to the host linker. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--opencc-options <em class="ph i">options</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Xopencc</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Specify options directly to <samp class="ph codeph">nvopencc</samp>, typically for steering <samp class="ph codeph">nvopencc</samp> optimization. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--ptxas-options <em class="ph i">options</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Xptxas</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Specify options directly to the <samp class="ph codeph">ptx</samp> optimizing assembler. 
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--nvlink-options <em class="ph i">options</em>,...</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Xnvlink</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Specify options directly to <samp class="ph codeph">nvlink</samp>. 
                                          </td>
                                       </tr>
                                    </tbody>
                                 </table>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="options-for-guiding-compiler-driver"><a name="options-for-guiding-compiler-driver" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#options-for-guiding-compiler-driver" name="options-for-guiding-compiler-driver" shape="rect">3.2.5.&nbsp;Options for Guiding the Compiler Driver</a></h3>
                        <div class="body refbody">
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--dryrun</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-dryrun</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Do not execute the compilation commands generated by <samp class="ph codeph">nvcc</samp>. Instead, list them.
                                          
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--verbose</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-v</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">List the compilation commands generated by this compiler driver, but do not suppress their execution. </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--keep</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-keep</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Keep all intermediate files that are generated during internal compilation steps. </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--save-temps</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-save-temps</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">This option is an alias of <samp class="ph codeph">--keep</samp>. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--dont-use-profile</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-noprof</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Do not use the <samp class="ph codeph">nvcc.profile</samp> file to guide the compilation. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--clean-targets</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-clean</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">This option reverses the behavior of <samp class="ph codeph">nvcc</samp>. When specified, none of the compilation phases will be executed. Instead, all of the non-temporary files that <samp class="ph codeph">nvcc</samp> would otherwise create will be deleted. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--run-args <em class="ph i">arguments</em>,...</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-run-args</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Used in combination with option <samp class="ph codeph">-run</samp>, to specify command line arguments for the executable. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--input-drive-prefix <em class="ph i">prefix</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-idp</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">On Windows platforms, all command line arguments that refer to file names must be converted to Windows native format before
                                          they are passed to pure Windows executables. This option specifies how the <em class="ph i">current</em> development environment represents absolute paths. Use <samp class="ph codeph">-idp /cygwin/</samp> for Cygwin build environments, and <samp class="ph codeph">-idp /</samp> for MinGW. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--dependency-drive-prefix <em class="ph i">prefix</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ddp</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">On Windows platforms, when generating dependency files (option <samp class="ph codeph">-M</samp>), all file names must be converted to whatever the used instance of <samp class="ph codeph">make</samp> will recognize. Some instances of <samp class="ph codeph">make</samp> have trouble with the colon in absolute paths in native Windows format, which depends on the environment in which this <samp class="ph codeph">make</samp> instance has been compiled. Use <samp class="ph codeph">-ddp /cygwin/</samp> for a Cygwin <samp class="ph codeph">make</samp>, and <samp class="ph codeph">-ddp /</samp> for MinGW. Or leave these file names in native Windows format by specifying nothing. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--drive-prefix <em class="ph i">prefix</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-dp</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specifies <samp class="ph codeph"><em class="ph i">prefix</em></samp> as both <samp class="ph codeph">input-drive-prefix</samp> and <samp class="ph codeph">dependency-drive-prefix</samp>.
                                          
                                       </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="options-for-steering-cuda-compilation"><a name="options-for-steering-cuda-compilation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#options-for-steering-cuda-compilation" name="options-for-steering-cuda-compilation" shape="rect">3.2.6.&nbsp;Options for Steering CUDA Compilation</a></h3>
                        <div class="body refbody">
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--target-cpu-architecture</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-target-cpu-arch</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify the name of the class of CPU architecture for which
                                          	             the input files must be compiled.
                                          
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--use_fast_math</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-use_fast_math</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Make use of fast math library.&nbsp;<samp class="ph codeph">-use_fast_math</samp> implies <samp class="ph codeph">-ftz=true -prec-div=false -prec-sqrt=false -fmad=true</samp>.
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--ftz</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-ftz</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">The <samp class="ph codeph">-ftz</samp> option controls single precision denormals support. When <samp class="ph codeph">-ftz=false</samp>, denormals are supported and with <samp class="ph codeph">-ftz=true</samp>, denormals are flushed to 0. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--prec-div</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-prec-div</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">The <samp class="ph codeph">-prec-div</samp> option controls single precision division. With <samp class="ph codeph">-prec-div=true</samp>, the division is IEEE compliant; with <samp class="ph codeph">-prec-div=false</samp>, the division is approximate.
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--prec-sqrt</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-prec-sqrt</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">The <samp class="ph codeph">-prec-sqrt</samp> option controls single precision square root. With <samp class="ph codeph">-prec-sqrt=true</samp>, the square root is IEEE compliant; with <samp class="ph codeph">-prec-sqrt=false</samp>, the square root is approximate.
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--entries <em class="ph i">entry,...</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-e</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">In case of compilation of <samp class="ph codeph">ptx</samp> or <samp class="ph codeph">gpu</samp> files to cubin: specify the global entry functions for which code must be generated. By default, code will be generated for
                                          all entries. 
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--fmad</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-fmad</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations
                                          (FMAD, FFMA, or DFMA). The default is <samp class="ph codeph">-fmad=true</samp>. 
                                       </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="options-for-steering-gpu-code-generation"><a name="options-for-steering-gpu-code-generation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#options-for-steering-gpu-code-generation" name="options-for-steering-gpu-code-generation" shape="rect">3.2.7.&nbsp;Options for Steering GPU Code Generation</a></h3>
                        <div class="body refbody">
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--gpu-architecture <em class="ph i">gpuarch</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-arch</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                          <p class="p">Specify the name of the NVIDIA GPU to compile for. This can either be a <em class="ph i">real</em> GPU, or a <em class="ph i">virtual</em> PTX architecture. PTX code represents an intermediate format that can still be further compiled and optimized for, depending
                                             on the PTX version, a specific class of actual GPUs.
                                          </p>
                                          <p class="p">The architecture specified by this option is the architecture that is assumed by the compilation chain up to the PTX stage,
                                             while the architecture(s) specified with the <samp class="ph codeph">-code</samp> option are assumed by the last, potentially runtime, compilation stage.
                                          </p>
                                          <p class="p">Currently supported compilation architectures are:
                                             virtual architectures <samp class="ph codeph">compute_10</samp>,
                                             <samp class="ph codeph">compute_11</samp>,
                                             <samp class="ph codeph">compute_12</samp>,
                                             <samp class="ph codeph">compute_13</samp>,
                                             <samp class="ph codeph">compute_20</samp>,
                                             <samp class="ph codeph">compute_30</samp>,
                                             <samp class="ph codeph">compute_32</samp>,
                                             <span class="ph"><samp class="ph codeph">compute_35</samp>,
                                                <samp class="ph codeph">compute_50</samp>; and
                                                </span>
                                             
                                             GPU architectures <samp class="ph codeph">sm_10</samp>,
                                             <samp class="ph codeph">sm_11</samp>,
                                             <samp class="ph codeph">sm_12</samp>,
                                             <samp class="ph codeph">sm_13</samp>,
                                             <samp class="ph codeph">sm_20</samp>,
                                             <samp class="ph codeph">sm_21</samp>,
                                             <samp class="ph codeph">sm_30</samp>,
                                             <samp class="ph codeph">sm_32</samp>,
                                             <span class="ph"><samp class="ph codeph">sm_35</samp>,
                                                <samp class="ph codeph">sm_50</samp>.
                                                </span></p>
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--gpu-code <em class="ph i">gpuarch,...</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-code</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                          <p class="p">Specify&nbsp;the name of the NVIDIA GPU to generate code for.</p>
                                          <p class="p"><samp class="ph codeph">nvcc</samp> embeds a compiled code image in the executable for each specified <em class="ph i">code</em> architecture, which is a true binary load image for each <em class="ph i">real</em> architecture, and PTX code for each virtual architecture.
                                          </p>
                                          <p class="p">During runtime, such embedded PTX code will be dynamically compiled by the CUDA runtime system if no binary load image is
                                             found for the <em class="ph i">current</em> GPU.
                                          </p>
                                          <p class="p">Architectures specified for options <samp class="ph codeph">-arch</samp> and <samp class="ph codeph">-code</samp> may be virtual as well as real, but the <em class="ph i">code</em> architectures must be compatible with the <em class="ph i">arch</em> architecture. When the <samp class="ph codeph">-code</samp> option is used, the value for the <samp class="ph codeph">-arch</samp> option must be a virtual PTX architecture.
                                          </p>
                                          <p class="p">For instance,&nbsp;<samp class="ph codeph"><em class="ph i">arch</em>=compute_13</samp> is not compatible with <samp class="ph codeph"><em class="ph i">code</em>=sm_10</samp>, because the earlier compilation stages will assume the availability of <samp class="ph codeph">compute_13</samp> features that are not present on <samp class="ph codeph">sm_10</samp>.
                                          </p>
                                          <p class="p">This option defaults to the value of option <samp class="ph codeph">-arch</samp>. Currently supported GPU architectures:
                                             <samp class="ph codeph">sm_10</samp>,
                                             <samp class="ph codeph">sm_11</samp>,
                                             <samp class="ph codeph">sm_12</samp>,
                                             <samp class="ph codeph">sm_13</samp>,
                                             <samp class="ph codeph">sm_20</samp>,
                                             <samp class="ph codeph">sm_21</samp>,
                                             <samp class="ph codeph">sm_30</samp>,
                                             <span class="ph"><samp class="ph codeph">sm_32</samp>,
                                                <samp class="ph codeph">sm_35</samp> and
                                                <samp class="ph codeph">sm_50</samp>.
                                                </span></p>
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--generate-code</samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-gencode</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                          <p class="p">This option provides a generalization of the <samp class="ph codeph">-arch=&lt;arch&gt; -code=&lt;code&gt;,...</samp> option combination for specifying <samp class="ph codeph">nvcc</samp> behavior with respect to code generation. Where use of the previous options generates different code for a fixed virtual
                                             architecture, option <samp class="ph codeph">--generate-code</samp> allows multiple <samp class="ph codeph">nvopencc</samp> invocations, iterating over different virtual architectures. In fact,&nbsp;<samp class="ph codeph">-arch=&lt;arch&gt; -code=&lt;code&gt;,...</samp> is equivalent to <samp class="ph codeph">--generate-code arch=&lt;arch&gt;,code=&lt;code&gt;,...</samp>.
                                          </p>
                                          <p class="p"><samp class="ph codeph">--generate-code</samp> options may be repeated for different virtual architectures.
                                          </p>
                                          <p class="p">Allowed keywords for this option:&nbsp;<samp class="ph codeph"><em class="ph i">arch</em></samp>, <samp class="ph codeph"><em class="ph i">code</em></samp>.
                                          </p>
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--maxrregcount <em class="ph i">amount</em></samp></td>
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-maxrregcount</samp></td>
                                       <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                          <p class="p">Specify the maximum number of registers that GPU functions can use. </p>
                                          <p class="p">Up to a function-specific limit, a higher value will generally increase the performance of individual GPU threads that execute
                                             this function. However, because thread registers are allocated from a global register pool on each GPU, a higher value of
                                             this option will also reduce the maximum thread block size, thereby reducing the amount of thread parallelism. Hence, a good
                                             maxrregcount value is the result of a trade-off.
                                          </p>
                                          <p class="p">If this option is not specified, then no maximum is assumed. </p>
                                          <p class="p">A value less than the minimum number of registers required by the ABI will be bumped up by the compiler to the ABI minimum limit.</p>
                                       </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference nested2" id="generic-tool-options"><a name="generic-tool-options" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#generic-tool-options" name="generic-tool-options" shape="rect">3.2.8.&nbsp;Generic Tool Options</a></h3>
                        <div class="body refbody">
                           <div class="section refsyn">
                              <div class="tablenoborder">
                                 <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                    <tbody class="tbody">
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--source-in-ptx</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-src-in-ptx</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Interleave source in PTX. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--Werror <em class="ph i">kind,...</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Werror</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Make warnings of the specified kinds into errors. The following is the list of warning kinds accepted by this option:
                                             
                                             
                                             <ul class="ul">
                                                <li class="li"><samp class="ph codeph">cross-execution-space-call</samp>
                                                   Be more strict about unsupported cross execution space calls.
                                                   The compiler will generate an error instead of a warning for a call from a <samp class="ph codeph">__host__</samp> <samp class="ph codeph">__device__</samp> function to a <samp class="ph codeph">__host__</samp> function.
                                                   
                                                </li>
                                             </ul>
                                          </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--help</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-h</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Print help information on this tool. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--version</samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-V</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Print version information on this tool. </td>
                                       </tr>
                                       <tr class="row">
                                          <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--options-file <em class="ph i">file,...</em></samp></td>
                                          <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-optf</samp></td>
                                          <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1"> Include command line options from specified file. </td>
                                       </tr>
                                    </tbody>
                                 </table>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="phase-options"><a name="phase-options" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#phase-options" name="phase-options" shape="rect">3.2.9.&nbsp;Phase Options</a></h3>
                        <div class="body conbody">
                           <p class="p">The following sections list some useful options to lower level compilation tools.</p>
                        </div>
                        <div class="topic reference nested3" id="ptxas-options"><a name="ptxas-options" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#ptxas-options" name="ptxas-options" shape="rect">3.2.9.1.&nbsp;Ptxas Options</a></h3>
                           <div class="body refbody">
                              <div class="section refsyn">
                                 <p class="p">The following table lists some useful <samp class="ph codeph">ptxas</samp> options which can be specified with
                                    <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">-Xptxas</samp>.
                                    
                                 </p>
                                 <div class="tablenoborder">
                                    <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                       <tbody class="tbody">
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--allow-expensive-optimizations</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-allow-expensive-optimizations</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                                <p class="p">Enable (disable) expensive compiler optimizations that use the maximum
                                                   available resources (memory and compile-time).
                                                   
                                                </p>
                                                <p class="p">If unspecified, default behavior is to enable this feature for optimization level &gt;=
                                                   <samp class="ph codeph">O2</samp>.
                                                   
                                                </p>
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--compile-only</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-c</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Generate relocatable object.</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--def-load-cache</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-dlcm</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Default cache modifier on global/generic load. Default value: <samp class="ph codeph">ca</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--def-store-cache</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-dscm</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Default cache modifier on global/generic store.</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--gpu-name <em class="ph i">gpuname</em></samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-arch</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                                <p class="p">Specify name of NVIDIA GPU to generate code for. This option also takes virtual compute
                                                   architectures, in which case code generation is suppressed. This can be used for parsing
                                                   only.
                                                   
                                                </p>
                                                <p class="p">Allowed values for this option:
                                                   <samp class="ph codeph">compute_10</samp>,
                                                   <samp class="ph codeph">compute_11</samp>,
                                                   <samp class="ph codeph">compute_12</samp>,
                                                   <samp class="ph codeph">compute_13</samp>,
                                                   <samp class="ph codeph">compute_20</samp>,
                                                   <samp class="ph codeph">compute_30</samp>,
                                                   <span class="ph"><samp class="ph codeph">compute_35</samp>,
                                                      <samp class="ph codeph">compute_50</samp>; and
                                                      </span><samp class="ph codeph">sm_10</samp>,
                                                   <samp class="ph codeph">sm_11</samp>,
                                                   <samp class="ph codeph">sm_12</samp>,
                                                   <samp class="ph codeph">sm_13</samp>,
                                                   <samp class="ph codeph">sm_20</samp>,
                                                   <samp class="ph codeph">sm_21</samp>,
                                                   <samp class="ph codeph">sm_30</samp>,
                                                   <span class="ph"><samp class="ph codeph">sm_32</samp>,
                                                      <samp class="ph codeph">sm_35</samp> and
                                                      <samp class="ph codeph">sm_50</samp>.
                                                      </span></p>
                                                <p class="p">Default value: <samp class="ph codeph">sm_10</samp>.
                                                </p>
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--opt-level <em class="ph i">N</em></samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-O</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify optimization level. Default value: <samp class="ph codeph">3</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--output-file <em class="ph i">file</em></samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-o</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Specify name of output file. Default value: <samp class="ph codeph">elf.o</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--preserve-relocs</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-preserve-relocs</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                                <p class="p">This option will make <samp class="ph codeph">ptxas</samp> generate relocatable references for
                                                   variables and preserve relocations generated for them in the linked executable.
                                                   
                                                </p>
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--sp-bound-check</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-sp-bound-check</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">
                                                <p class="p">Generate stack-pointer bounds-checking code sequence. This option is turned on
                                                   automatically when <samp class="ph codeph">device-debug(-g)</samp> or <samp class="ph codeph">opt-level(-O) 0</samp>
                                                   is specified.
                                                   
                                                </p>
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--disable-optimizer-constants</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-disable-optimizer-consts</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Disable use of optimizer constant bank.</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--verbose</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-v</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Enable verbose mode which prints code generation statistics.</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--warning-as-error</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-Werror</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Make all warnings into errors.</td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--device-debug</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-g</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--device-debug</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--entry <em class="ph i">entry</em>,...</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-e</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--entries</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--fmad</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-fmad</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--fmad</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--generate-line-info</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-lineinfo</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--generate-line-info</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--machine</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-m</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--machine</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--maxrregcount <em class="ph i">amount</em></samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-maxrregcount</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--maxrregcount</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--help</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-h</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--help</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--options-file <em class="ph i">file</em>,...</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-optf</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--options-file</samp>.
                                             </td>
                                          </tr>
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--version</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-V</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Semantics same as <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">--version</samp>.
                                             </td>
                                          </tr>
                                       </tbody>
                                    </table>
                                 </div>
                              </div>
                           </div>
                        </div>
                        <div class="topic reference nested3" id="nvlink-options"><a name="nvlink-options" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#nvlink-options" name="nvlink-options" shape="rect">3.2.9.2.&nbsp;Nvlink Options</a></h3>
                           <div class="body refbody">
                              <div class="section refsyn">
                                 <p class="p">The following table lists some useful <samp class="ph codeph">nvlink</samp> options which can be specified with
                                    <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">-Xnvlink</samp>.
                                    
                                 </p>
                                 <div class="tablenoborder">
                                    <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                                       <tbody class="tbody">
                                          <tr class="row">
                                             <td class="entry" valign="top" width="33.33333333333333%" rowspan="1" colspan="1"><samp class="ph codeph">--preserve-relocs</samp></td>
                                             <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">-preserve-relocs</samp></td>
                                             <td class="entry" valign="top" width="41.66666666666667%" rowspan="1" colspan="1">Preserve resolved relocations in linked executable.</td>
                                          </tr>
                                       </tbody>
                                    </table>
                                 </div>
                              </div>
                           </div>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="cuda-compilation-trajectory"><a name="cuda-compilation-trajectory" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#cuda-compilation-trajectory" name="cuda-compilation-trajectory" shape="rect">4.&nbsp;The CUDA Compilation Trajectory</a></h2>
                  <div class="body conbody">
                     <p class="p">This chapter explains the internal structure of the various CUDA compilation phases. These internals can usually be ignored
                        unless one wants to understand, or <em class="ph i">manually</em> rerun, the compilation steps corresponding to phases. Such command replay is useful during debugging of CUDA applications,
                        when intermediate files need to be inspected or modified. It is important to note that this structure reflects the current way
                        in which <samp class="ph codeph">nvcc</samp> implements its phases; it may significantly change with new releases of the CUDA Toolkit.
                     </p>
                     <p class="p">The following section illustrates how internal steps can be made visible by <samp class="ph codeph">nvcc</samp>, and rerun. After that, a translation diagram of the <samp class="ph codeph">.cu</samp> to <samp class="ph codeph">.cu.cpp.ii</samp> phase is listed. All other CUDA compilations are variants in some form or another of the <samp class="ph codeph">.cu</samp> to C++ transformation.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="listing-and-rerunning-nvcc-steps"><a name="listing-and-rerunning-nvcc-steps" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#listing-and-rerunning-nvcc-steps" name="listing-and-rerunning-nvcc-steps" shape="rect">4.1.&nbsp;Listing and Rerunning NVCC Steps</a></h3>
                     <div class="body conbody">
                        <p class="p">Intermediate steps can be made visible by options <samp class="ph codeph">-v</samp> and <samp class="ph codeph">-dryrun</samp>. In addition, option <samp class="ph codeph">-keep</samp> might be specified to retain temporary files, and also to give them slightly more meaningful names. The following sample
                           command lists the intermediate steps for a CUDA compilation:
                        </p><pre class="pre screen" xml:space="preserve">nvcc -cuda x.cu --compiler-bindir=c:/mvs/vc/bin -keep -dryrun</pre><p class="p">This command results in a listing like the one shown at the end of this section.</p>
                        <p class="p">Depending on the actual command shell that is used, the displayed commands are <em class="ph i">almost</em> executable: the DOS command shell, and the Linux shells <samp class="ph codeph">sh</samp> and <samp class="ph codeph">csh</samp> each have slightly different notations for assigning values to environment variables.
                        </p>
                        <p class="p">The command list contains the following:</p>
                        <ul class="ul">
                           <li class="li">Definition of standard variables <samp class="ph codeph">_HERE_</samp> and <samp class="ph codeph">_SPACE_</samp> (see <a class="xref" href="index.html#here-space" shape="rect">_HERE_, _SPACE_</a>)
                           </li>
                           <li class="li">Environment assignments resulting from executing <samp class="ph codeph">nvcc.profile</samp> (see <a class="xref" href="index.html#nvcc-profile" shape="rect">nvcc.profile</a>)
                           </li>
                           <li class="li">Definition of Visual Studio installation macros, derived from <samp class="ph codeph">-compiler-bindir</samp> (see <a class="xref" href="index.html#variables-interpreted-by-nvcc-itself" shape="rect">Variables Interpreted by NVCC Itself</a>)
                           </li>
                           <li class="li">Environment assignments resulting from executing <samp class="ph codeph">vsvars32.bat</samp></li>
                           <li class="li">Commands generated by <samp class="ph codeph">nvcc</samp></li>
                        </ul><pre class="pre screen" xml:space="preserve">#$ _SPACE_=
#$ _HERE_=c:\sw\gpgpu\bin\win32_debug
#$ TOP=c:\sw\gpgpu\bin\win32_debug/../..
#$ BINDIR=c:\sw\gpgpu\bin\win32_debug
#$ 
COMPILER_EXPORT=c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug
#$ 
PATH=c:\sw\gpgpu\bin\win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug;C:\cygwin\usr\local\bin;C:\cygwin\bin;C:\cygwin\bin;C:\cygwin\usr\X11R6\bin;c:\WINDOWS\system32;c:\WINDOWS;c:\WINDOWS\System32\Wbem;c:\Program Files\Microsoft SQL Server\90\Tools\binn\;c:\Program Files\Perforce;C:\cygwin\lib\lapack
#$ 
PATH=c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/bin;c:\sw\gpgpu\bin\win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug;C:\cygwin\usr\local\bin;C:\cygwin\bin;C:\cygwin\bin;C:\cygwin\usr\X11R6\bin;c:\WINDOWS\system32;c:\WINDOWS;c:\WINDOWS\System32\Wbem;c:\Program Files\Microsoft SQL Server\90\Tools\binn\;c:\Program Files\Perforce;C:\cygwin\lib\lapack
#$ INCLUDES="-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart"
#$ INCLUDES="-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart"
#$ LIBRARIES= "c:\sw\gpgpu\bin\win32_debug/cuda.lib" "c:\sw\gpgpu\bin\win32_debug/cudart.lib"
#$ PTXAS_FLAGS=
#$ OPENCC_FLAGS=-Werror
#$ VSINSTALLDIR=c:/mvs/vc/bin/..
#$ VCINSTALLDIR=c:/mvs/vc/bin/..
#$ FrameworkDir=c:\WINDOWS\Microsoft.NET\Framework
#$ FrameworkVersion=v2.0.50727
#$ FrameworkSDKDir=c:\MVS\SDK\v2.0
#$ DevEnvDir=c:\MVS\Common7\IDE
#$ 
PATH=c:\MVS\Common7\IDE;c:\MVS\VC\BIN;c:\MVS\Common7\Tools;c:\MVS\Common7\Tools\bin;c:\MVS\VC\PlatformSDK\bin;c:\MVS\SDK\v2.0\bin;c:\WINDOWS\Microsoft.NET\Framework\v2.0.50727;c:\MVS\VC\VCPackages;c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/bin;c:\sw\gpgpu\bin\win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug;C:\cygwin\usr\local\bin;C:\cygwin\bin;C:\cygwin\bin;C:\cygwin\usr\X11R6\bin;c:\WINDOWS\system32;c:\WINDOWS;c:\WINDOWS\System32\Wbem;c:\Program Files\Microsoft SQL Server\90\Tools\binn\;c:\Program Files\Perforce;C:\cygwin\lib\lapack
#$ 
INCLUDE=c:\MVS\VC\ATLMFC\INCLUDE;c:\MVS\VC\INCLUDE;c:\MVS\VC\PlatformSDK\include;c:\MVS\SDK\v2.0\include;
#$ 
LIB=c:\MVS\VC\ATLMFC\LIB;c:\MVS\VC\LIB;c:\MVS\VC\PlatformSDK\lib;c:\MVS\SDK\v2.0\lib;
#$ 
LIBPATH=c:\WINDOWS\Microsoft.NET\Framework\v2.0.50727;c:\MVS\VC\ATLMFC\LIB
#$ 
PATH=c:/mvs/vc/bin;c:\MVS\Common7\IDE;c:\MVS\VC\BIN;c:\MVS\Common7\Tools;c:\MVS\Common7\Tools\bin;c:\MVS\VC\PlatformSDK\bin;c:\MVS\SDK\v2.0\bin;c:\WINDOWS\Microsoft.NET\Framework\v2.0.50727;c:\MVS\VC\VCPackages;c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/bin;c:\sw\gpgpu\bin\win32_debug/open64/bin;c:\sw\gpgpu\bin\win32_debug;C:\cygwin\usr\local\bin;C:\cygwin\bin;C:\cygwin\bin;C:\cygwin\usr\X11R6\bin;c:\WINDOWS\system32;c:\WINDOWS;c:\WINDOWS\System32\Wbem;c:\Program Files\Microsoft SQL Server\90\Tools\binn\;c:\Program Files\Perforce;C:\cygwin\lib\lapack
#$ cudafe -E -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart" -I. "-Ic:\MVS\VC\ATLMFC\INCLUDE" "-Ic:\MVS\VC\INCLUDE" "-Ic:\MVS\VC\PlatformSDK\include" "-Ic:\MVS\SDK\v2.0\include" -D__CUDACC__ -C --preinclude "cuda_runtime.h" -o "x.cpp1.ii" "x.cu"
#$ cudafe "-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart" -I. --gen_c_file_name "x.cudafe1.c" --gen_device_file_name "x.cudafe1.gpu" --include_file_name x.fatbin.c --no_exceptions -tused "x.cpp1.ii"
#$ cudafe -E --c -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart" -I. "-Ic:\MVS\VC\ATLMFC\INCLUDE" "-Ic:\MVS\VC\INCLUDE" "-Ic:\MVS\VC\PlatformSDK\include" "-Ic:\MVS\SDK\v2.0\include" -D__CUDACC__ -C -o "x.cpp2.i" "x.cudafe1.gpu"
#$ cudafe --c "-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart" -I. --gen_c_file_name "x.cudafe2.c" --gen_device_file_name "x.cudafe2.gpu" --include_file_name x.fatbin.c "x.cpp2.i"
#$ cudafe -E --c -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart" -I. "-Ic:\MVS\VC\ATLMFC\INCLUDE" "-Ic:\MVS\VC\INCLUDE" "-Ic:\MVS\VC\PlatformSDK\include" "-Ic:\MVS\SDK\v2.0\include" -D__GNUC__ -D__CUDABE__ -o "x.cpp3.i" "x.cudafe2.gpu"
#$ nvopencc -Werror "x.cpp3.i" -o "x.ptx"
#$ ptxas -arch=sm_10 "x.ptx" -o "x.cubin"
#$ filehash --skip-cpp-directives -s "" "x.cpp3.i" &gt; "x.cpp3.i.hash"
#$ fatbin --key="x@xxxxxxxxxx" --source-name="x.cu" --usage-mode="" --embedded-fatbin="x.fatbin.c" --image=profile=sm_10,file=x.cubin
#$ cudafe -E --c -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-Ic:\sw\gpgpu\bin\win32_debug/../../../compiler/gpgpu/export/win32_debug/include" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/inc" "-Ic:\sw\gpgpu\bin\win32_debug/../../cuda/tools/cudart" -I. "-Ic:\MVS\VC\ATLMFC\INCLUDE" "-Ic:\MVS\VC\INCLUDE" "-Ic:\MVS\VC\PlatformSDK\include" "-Ic:\MVS\SDK\v2.0\include" -o "x.cu.c" "x.cudafe1.c"</pre></div>
                  </div>
                  <div class="topic concept nested1" id="full-cuda-compilation-trajectory"><a name="full-cuda-compilation-trajectory" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#full-cuda-compilation-trajectory" name="full-cuda-compilation-trajectory" shape="rect">4.2.&nbsp;Full CUDA Compilation Trajectory</a></h3>
                     <div class="body conbody">
                        <div class="fig fignone" id="full-cuda-compilation-trajectory__cuda-compilation-from-cu-to-cu-cpp-ii"><a name="full-cuda-compilation-trajectory__cuda-compilation-from-cu-to-cu-cpp-ii" shape="rect">
                              <!-- --></a><span class="figcap">Figure 2. CUDA Compilation from .cu to .cu.cpp.ii</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/cuda-compilation-from-cu-to-cu-cpp-ii.png" alt="CUDA Compilation from .cu to .cu.cpp.ii."></img></div><br clear="none"></br></div>
                        <p class="p">The CUDA phase converts a source file coded in the extended CUDA language, into a regular ANSI C++ source file that can be
                           handed over to a general purpose C++ compiler for further compilation and linking. The exact steps that are followed to achieve
                           this are displayed in <a class="xref" href="index.html#full-cuda-compilation-trajectory__cuda-compilation-from-cu-to-cu-cpp-ii" shape="rect">Figure 2</a>.
                        </p>
                     </div>
                     <div class="topic concept nested2" id="compilation-flow"><a name="compilation-flow" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#compilation-flow" name="compilation-flow" shape="rect">4.2.1.&nbsp;Compilation Flow</a></h3>
                        <div class="body conbody">
                           <p class="p">In short, CUDA compilation works as follows: the input program is separated by the CUDA front end (<samp class="ph codeph">cudafe</samp>), into C/C++ host code and the <samp class="ph codeph">.gpu</samp> device code. Depending on the value(s) of the <samp class="ph codeph">-code</samp> option to <samp class="ph codeph">nvcc</samp>, this device code is further translated by the CUDA compilers/assemblers into CUDA binary (<samp class="ph codeph">cubin</samp>) and/or into intermediate PTX code. This code is merged into a device code descriptor which is included by the previously
                              separated host code. This descriptor will be inspected by the CUDA runtime system whenever the device code is invoked (<em class="ph i">called</em>) by the host program, in order to obtain an appropriate load image for the current GPU.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="cuda-frontend"><a name="cuda-frontend" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#cuda-frontend" name="cuda-frontend" shape="rect">4.2.2.&nbsp;CUDA Frontend</a></h3>
                        <div class="body conbody">
                           <p class="p">In the current CUDA compilation scheme, the CUDA front end is invoked twice. The first step is for the actual splitup of the
                              <samp class="ph codeph">.cu</samp> input into host and device code. The second step is a technical detail (it performs dead code analysis on the <samp class="ph codeph">.gpu</samp> generated by the first step), and it might disappear in future releases.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="preprocessing"><a name="preprocessing" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#preprocessing" name="preprocessing" shape="rect">4.2.3.&nbsp;Preprocessing</a></h3>
                        <div class="body conbody">
                           <p class="p">The trajectory contains a number of preprocessing steps. The first of these, on the <samp class="ph codeph">.cu</samp> input, has the usual purpose of expanding include files and macro invocations that are present in the source file. The remaining
                              preprocessing steps expand CUDA system macros in (<dfn class="term">C</dfn>-) code that has been generated by preceding CUDA compilation steps. The last preprocessing step also merges the results of
                              the previously diverged compilation flow.
                           </p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic reference nested0" id="sample-nvcc-usage"><a name="sample-nvcc-usage" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#sample-nvcc-usage" name="sample-nvcc-usage" shape="rect">5.&nbsp;Sample NVCC Usage</a></h2>
                  <div class="body refbody">
                     <div class="section refsyn">
                        <p class="p">The following lists a sample <samp class="ph codeph">makefile</samp> that uses <samp class="ph codeph">nvcc</samp> for portability across Windows and Linux.
                        </p><pre xml:space="preserve">#
# On windows, store location of Visual Studio compiler
# into the environment. This will be picked up by nvcc,
# even without explicitly being passed.
# On Linux, use whatever gcc is in the current path
# (so leave compiler-bindir undefined):
#
ifdef ON_WINDOWS
    export compiler-bindir := c:/mvs/bin
endif


#
# Similar for OPENCC_FLAGS and PTXAS_FLAGS.
# These are simply passed via the environment:
#
export OPENCC_FLAGS := 
export PTXAS_FLAGS  := -fastimul


#
# cuda and C/C++ compilation rules, with
# dependency generation:
#
%.o : %.cpp
	$(NVCC) -c $< $(CFLAGS) -o $@
	$(NVCC) -M $< $(CFLAGS)  &gt; $@.dep

%.o : %.c
	$(NVCC) -c $< $(CFLAGS) -o $@
	$(NVCC) -M $< $(CFLAGS)  &gt; $@.dep

%.o : %.cu
	$(NVCC) -c $< $(CFLAGS) -o $@
	$(NVCC) -M $< $(CFLAGS)  &gt; $@.dep


#
# Pick up generated dependency files, and 
# add /dev/null because gmake does not consider
# an empty list to be a list:
#
include  $(wildcard *.dep) /dev/null


#
# Define the application; 
# for each object file, there must be a
# corresponding .c or .cpp or .cu file:
#
OBJECTS = a.o    b.o    c.o
APP     = app

$(APP) : $(OBJECTS)
	$(NVCC) $(OBJECTS) $(LDFLAGS) -o $@


#
# Cleanup:
#
clean : 
	$(RM) $(OBJECTS) *.dep</pre></div>
                  </div>
               </div>
               <div class="topic concept nested0" id="gpu-compilation"><a name="gpu-compilation" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#gpu-compilation" name="gpu-compilation" shape="rect">6.&nbsp;GPU Compilation</a></h2>
                  <div class="body conbody">
                     <p class="p">This chapter describes the GPU compilation model that is maintained by <samp class="ph codeph">nvcc</samp>, in cooperation with the CUDA driver. It goes through some technical sections, with concrete examples at the end.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="gpu-generations"><a name="gpu-generations" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#gpu-generations" name="gpu-generations" shape="rect">6.1.&nbsp;GPU Generations</a></h3>
                     <div class="body conbody">
                        <p class="p">In order to allow for architectural evolution, NVIDIA GPUs are released in different generations. New generations introduce
                           major improvements in functionality and/or chip architecture, while GPU models within the same generation show minor configuration
                           differences that <em class="ph i">moderately</em> affect functionality, performance, or both.
                        </p>
                        <p class="p">Binary compatibility of GPU applications is not guaranteed across different generations. For example, a CUDA application that
                           has been compiled for a Fermi GPU will very likely not run on a next generation graphics card (and vice versa). This is because
                           the Fermi instruction set and instruction encodings are different from those of Kepler, which in turn will probably be substantially
                           different from those of the next generation GPU.
                        </p>
                        <dl class="dl">
                           <dt class="dt dlterm">Tesla</dt>
                           <dd class="dd">
                              <ul class="ul">
                                 <li class="li">sm_10</li>
                                 <li class="li">sm_11</li>
                                 <li class="li">sm_12</li>
                                 <li class="li">sm_13</li>
                              </ul>
                           </dd>
                           <dt class="dt dlterm">Fermi</dt>
                           <dd class="dd">
                              <ul class="ul">
                                 <li class="li">sm_20</li>
                                 <li class="li">sm_21</li>
                              </ul>
                           </dd>
                           <dt class="dt dlterm">Kepler</dt>
                           <dd class="dd">
                              <ul class="ul">
                                 <li class="li">sm_30</li>
                                 <li class="li">sm_35</li>
                                 <li class="li">..??..</li>
                              </ul>
                           </dd>
                           <dt class="dt dlterm">Next generation</dt>
                           <dd class="dd">
                              <ul class="ul">
                                 <li class="li">..??..</li>
                              </ul>
                           </dd>
                        </dl>
                        <p class="p">Because they share the basic instruction set, binary compatibility within one GPU generation, however, can under certain conditions
                           be guaranteed. This is the case between two GPU versions that do not show functional differences at all (for instance when one
                           version is a scaled down version of the other), or when one version is functionally included in the other. An example of the
                           latter is the <em class="ph i">base</em> Tesla version <samp class="ph codeph">sm_10</samp> whose functionality is a subset of all other Tesla versions: any code compiled for <samp class="ph codeph">sm_10</samp> will run on all other Tesla GPUs.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="gpu-feature-list"><a name="gpu-feature-list" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#gpu-feature-list" name="gpu-feature-list" shape="rect">6.2.&nbsp;GPU Feature List</a></h3>
                     <div class="body conbody">
                        <p class="p">The following table lists the names of the current GPU architectures, annotated with the functional capabilities that they
                           provide. There are other differences, such as amounts of register and processor clusters, that only affect execution performance.
                        </p>
                        <p class="p">In the CUDA naming scheme, GPUs are named <samp class="ph codeph">sm_xy</samp>, where <samp class="ph codeph">x</samp> denotes the GPU generation number, and <samp class="ph codeph">y</samp> the version in that generation. Additionally, to facilitate comparing GPU capabilities, CUDA attempts to choose its GPU names
                           such that if <samp class="ph codeph">x<sub class="ph sub">1</sub>y<sub class="ph sub">1</sub></samp> &lt;= <samp class="ph codeph">x<sub class="ph sub">2</sub>y<sub class="ph sub">2</sub></samp> then all non-ISA related capabilities of <samp class="ph codeph">sm_x<sub class="ph sub">1</sub>y<sub class="ph sub">1</sub></samp> are included in those of <samp class="ph codeph">sm_x<sub class="ph sub">2</sub>y<sub class="ph sub">2</sub></samp>. From this it indeed follows that <samp class="ph codeph">sm_10</samp> is the <em class="ph i">base</em> Tesla model, and it also explains why higher entries in the tables are always functional extensions to the lower entries.
                           This is denoted by the plus sign in the table. Moreover, if we abstract from the instruction encoding, it implies that <samp class="ph codeph">sm_10</samp>'s functionality will continue to be included in all later GPU generations. As we will see next, this property will be the
                           foundation for application compatibility support by <samp class="ph codeph">nvcc</samp>.
                        </p>
                        <div class="tablenoborder">
                           <table cellpadding="4" cellspacing="0" summary="" class="table" frame="border" border="1" rules="all">
                              <tbody class="tbody">
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_10</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1">
                                       <p class="p">ISA_1</p>
                                       <p class="p">Basic features</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_11</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + atomic memory operations on global memory </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_12</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1">
                                       <p class="p">+ atomic memory operations on shared memory</p>
                                       <p class="p">+ vote instructions</p>
                                    </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_13</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + double precision floating point support </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_20</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + Fermi support </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_30</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + Kepler support </td>
                                 </tr>
                                 <tr class="row">
                                    <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">sm_35</samp></td>
                                    <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + dynamic parallelism support </td>
                                 </tr>
                              </tbody>
                           </table>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="application-compatibility"><a name="application-compatibility" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#application-compatibility" name="application-compatibility" shape="rect">6.3.&nbsp;Application Compatibility</a></h3>
                     <div class="body conbody">
                        <p class="p">Binary code compatibility over CPU generations, together with a published instruction set architecture, is the usual mechanism
                           for ensuring that distributed applications <em class="ph i">out there in the field</em> will continue to run on newer versions of the CPU when these become mainstream.
                        </p>
                        <p class="p">This situation is different for GPUs, because NVIDIA cannot guarantee binary compatibility without sacrificing regular opportunities
                           for GPU improvements. Rather, as is already conventional in the graphics programming domain, <samp class="ph codeph">nvcc</samp> relies on a two stage compilation model for ensuring application compatibility with future GPU generations.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="virtual-architectures"><a name="virtual-architectures" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#virtual-architectures" name="virtual-architectures" shape="rect">6.4.&nbsp;Virtual Architectures</a></h3>
                     <div class="body conbody">
                        <p class="p">GPU compilation is performed via an intermediate representation, PTX ([...]), which can be considered as assembly for a virtual
                           GPU architecture. Contrary to an actual graphics processor, such a virtual GPU is defined entirely by the set of capabilities,
                           or features, that it provides to the application. In particular, a virtual GPU architecture provides a (largely) generic instruction
                           set, and binary instruction encoding is a non-issue because PTX programs are always represented in text format.
                        </p>
                        <p class="p">Hence, an <samp class="ph codeph">nvcc</samp> compilation command always uses two architectures: a <dfn class="term">compute</dfn> architecture to specify the virtual intermediate architecture, plus a <em class="ph i">real</em> GPU architecture to specify the intended processor to execute on. For such an <samp class="ph codeph">nvcc</samp> command to be valid, the <em class="ph i">real</em> architecture must be an implementation (in some way or another) of the virtual architecture. This is further explained below.
                        </p>
                        <p class="p">The chosen virtual architecture is more of a statement on the GPU capabilities that the application requires: using the <em class="ph i">smallest</em> virtual architecture still allows the <em class="ph i">widest</em> range of actual architectures for the second <samp class="ph codeph">nvcc</samp> stage. Conversely, specifying a virtual architecture that provides features unused by the application unnecessarily restricts
                           the set of possible GPUs that can be specified in the second <samp class="ph codeph">nvcc</samp> stage.
                        </p>
                        <p class="p">From this it follows that the virtual <dfn class="term">compute</dfn> architecture should always be chosen as <em class="ph i">low</em> as possible, thereby maximizing the number of actual GPUs to run on. The <em class="ph i">real</em> sm architecture should be chosen as <em class="ph i">high</em> as possible (assuming that this always generates better code), but this is only possible with knowledge of the actual GPUs
                           on which the application is expected to run. As we will see later, this is the situation in just in time compilation, where the
                           driver has this exact knowledge: the runtime GPU is the one on which the program is about to be launched/executed.
                        </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/virtual-architectures.png" alt="Virtual compute architecture and Real sm architecture."></img></div><br clear="none"></br></div>
                  </div>
                  <div class="topic reference nested1" id="virtual-architecture-feature-list"><a name="virtual-architecture-feature-list" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#virtual-architecture-feature-list" name="virtual-architecture-feature-list" shape="rect">6.5.&nbsp;Virtual Architecture Feature List</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <div class="tablenoborder"><a name="virtual-architecture-feature-list__virtual-architecture-features" shape="rect">
                                 <!-- --></a><table cellpadding="4" cellspacing="0" summary="" id="virtual-architecture-feature-list__virtual-architecture-features" class="table" frame="border" border="1" rules="all">
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">compute_10</samp></td>
                                       <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> Basic features </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">compute_11</samp></td>
                                       <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + atomic memory operations on global memory </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">compute_12</samp></td>
                                       <td class="entry" valign="top" width="75%" rowspan="1" colspan="1">
                                          <p class="p">+ atomic memory operations on shared memory</p>
                                          <p class="p">+ vote instructions</p>
                                       </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">compute_13</samp></td>
                                       <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + double precision floating point support </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">compute_20</samp></td>
                                       <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + Fermi support </td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="25%" rowspan="1" colspan="1"><samp class="ph codeph">compute_30</samp></td>
                                       <td class="entry" valign="top" width="75%" rowspan="1" colspan="1"> + Kepler support </td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                           <p class="p">The above table lists the currently defined virtual architectures. As it appears, this table shows a 1-1 correspondence to
                              the table of actual GPUs listed earlier in this chapter. The only difference except for the architecture names is that the
                              ISA specification is missing for the compute architectures.
                           </p>
                           <p class="p">However, this correspondence is misleading, and might degrade when new GPU architectures are introduced and also due to development
                              of the CUDA compiler.
                           </p>
                           <p class="p">First, a next generation architecture might not provide any functional improvements, in which case the list of <em class="ph i">real</em> architectures will be extended (because we must be able to generate code for this architecture), but no new compute architecture
                              is necessary.
                           </p>
                           <p class="p">Second, it may be decided to let the compiler emulate certain <em class="ph i">higher</em> grade features on <em class="ph i">lower</em> grade GPUs. For example, this might be done for double precision floating point support. In this case double precision based
                              applications will run on all <em class="ph i">real</em> GPU architectures, though with considerably lower performance on the models that do not provide native double support. Such
                              double precision emulation is here used merely as an example (it currently is not actually considered), but the CUDA compiler
                              already does emulation for features that are considered <em class="ph i">basic</em> though not natively supported: integer division and 64-bit integer arithmetic. Because integer division and 64-bit integer
                              support are part of the basic feature set, they will not explicitly show up in the features tables.
                           </p>
                           <p class="p">Feature emulation might have two different consequences for the virtual architecture table: the feature might be silently
                              added to a lower grade virtual architecture (as has happened for integer division and 64-bit arithmetic), or it could be kept
                              in a separate virtual architecture. For instance if we were to emulate double precision floating point on an <samp class="ph codeph">sm_10</samp>, then keeping the virtual architecture <samp class="ph codeph">compute_13</samp> would make sense because of the drastic performance consequences: applications would then have to explicitly <em class="ph i">enable</em> it during <samp class="ph codeph">nvcc</samp> compilation and there would therefore be no danger of unwittingly using it on lower grade GPUs. Either way, the following
                              <samp class="ph codeph">nvcc</samp> command would become valid (which currently is not the case):
                           </p><pre xml:space="preserve">nvcc x.cu -arch=compute_13 -code=sm_10</pre><p class="p">The two cases of feature implementation are further illustrated below: </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/virtual-architecture-feature-list.png" alt="Illustration of  two cases of feature implementation."></img></div><br clear="none"></br></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="further-mechanisms"><a name="further-mechanisms" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#further-mechanisms" name="further-mechanisms" shape="rect">6.6.&nbsp;Further Mechanisms</a></h3>
                     <div class="body conbody">
                        <p class="p">Clearly, compilation staging in itself does not help towards the goal of application
                           compatibility with future GPUs. For this we need two other mechanisms provided by CUDA:
                           just in time compilation (JIT) and fatbinaries.
                        </p>
                     </div>
                     <div class="topic concept nested2" id="just-in-time-compilation"><a name="just-in-time-compilation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#just-in-time-compilation" name="just-in-time-compilation" shape="rect">6.6.1.&nbsp;Just in Time Compilation</a></h3>
                        <div class="body conbody">
                           <p class="p">The compilation step to an actual GPU binds the code to one generation of GPUs. Within that generation, it involves a choice
                              between GPU <em class="ph i">coverage</em> and possible performance. For example, compiling to <samp class="ph codeph">sm_30</samp> allows the code to run on all Kepler-generation GPUs, but compiling to <samp class="ph codeph">sm_35</samp> would probably yield better code if Kepler GK110 and later are the only targets.
                           </p><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/just-in-time-compilation.png" alt="Just in time compilation."></img></div><br clear="none"></br><p class="p">By specifying a virtual code architecture instead of a <em class="ph i">real</em> GPU, <samp class="ph codeph">nvcc</samp> postpones the second compilation stage until application runtime, at which point the target GPU is exactly known. For
                              instance, the command below allows generation of exactly matching GPU binary code, when the application is launched on an
                              <samp class="ph codeph">sm_20</samp> or later architecture:
                           </p><pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=compute_20 -code=compute_20</pre><p class="p">The disadvantage of just in time compilation is increased application startup delay, but this can be alleviated by letting
                              the CUDA driver use a compilation cache (refer to "Section 3.1.1.2. Just-in-Time Compilation" of <em class="ph i">CUDA C Programming Guide</em>) which is persistent over multiple runs of the application.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="fatbinaries"><a name="fatbinaries" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#fatbinaries" name="fatbinaries" shape="rect">6.6.2.&nbsp;Fatbinaries</a></h3>
                        <div class="body conbody">
                           <div class="p">A different solution to overcome startup delay by JIT while still allowing execution on newer GPUs is to specify multiple
                              code instances, as in
                              <pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=compute_10 -code=compute_10,sm_10,sm_13</pre></div>
                           <p class="p">This command generates exact code for two Tesla variants, plus PTX code for use by JIT in case a next-generation GPU is encountered.
                              <samp class="ph codeph">nvcc</samp> organizes its device code in fatbinaries, which are able to hold multiple translations of the same GPU source code. At runtime,
                              the CUDA driver will select the most appropriate translation when the device function is launched. 
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="nvcc-examples"><a name="nvcc-examples" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvcc-examples" name="nvcc-examples" shape="rect">6.7.&nbsp;NVCC Examples</a></h3>
                     <div class="body conbody"></div>
                     <div class="topic reference nested2" id="base-notation"><a name="base-notation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#base-notation" name="base-notation" shape="rect">6.7.1.&nbsp;Base Notation</a></h3>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p"><samp class="ph codeph">nvcc</samp> provides the options <samp class="ph codeph">-arch</samp> and <samp class="ph codeph">-code</samp> for specifying the target architectures for both translation stages. Except for the allowed shorthands described below, the
                                 <samp class="ph codeph">-arch</samp> option takes a single value, which must be the name of a virtual compute architecture, while option <samp class="ph codeph">-code</samp> takes a list of values which must all be the names of actual GPUs. <samp class="ph codeph">nvcc</samp> performs a stage 2 translation for each of these GPUs, and will embed the result in the result of compilation (which usually
                                 is a host object file or executable).
                              </p>
                           </div>
                           <div class="section">
                              <h4 class="title sectiontitle">Example</h4><pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=compute_10 -code=sm_10,sm_13</pre></div>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="shorthand"><a name="shorthand" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#shorthand" name="shorthand" shape="rect">6.7.2.&nbsp;Shorthand</a></h3>
                        <div class="body conbody">
                           <p class="p"><samp class="ph codeph">nvcc</samp> allows a number of shorthands for simple cases.
                           </p>
                        </div>
                        <div class="topic reference nested3" id="shorthand-1"><a name="shorthand-1" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shorthand-1" name="shorthand-1" shape="rect">6.7.2.1.&nbsp;Shorthand 1</a></h3>
                           <div class="body refbody">
                              <div class="section">
                                 <p class="p"><samp class="ph codeph">-code</samp> arguments can be virtual architectures. In this case the stage 2 translation will be omitted for such virtual architecture,
                                    and the stage 1 PTX result will be embedded instead. At application launch, and in case the driver does not find a better
                                    alternative, the stage 2 compilation will be invoked by the driver with the PTX as input.
                                 </p>
                              </div>
                              <div class="section">
                                 <h5 class="title sectiontitle">Example</h5><pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=compute_10 <strong class="ph b">-code=compute_10</strong>,sm_10,sm_13</pre></div>
                           </div>
                        </div>
                        <div class="topic reference nested3" id="shorthand-2"><a name="shorthand-2" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shorthand-2" name="shorthand-2" shape="rect">6.7.2.2.&nbsp;Shorthand 2</a></h3>
                           <div class="body refbody">
                              <div class="section">
                                 <p class="p">The <samp class="ph codeph">-code</samp> option can be omitted. Only in this case, the <samp class="ph codeph">-arch</samp> value can be a non-virtual architecture. The <samp class="ph codeph">-code</samp> values default to the closest virtual architecture that is implemented by the GPU specified with <samp class="ph codeph">-arch</samp>, plus the <samp class="ph codeph">-arch</samp> value itself (in case the <samp class="ph codeph">-arch</samp> value is a virtual architecture then these two are the same, resulting in a single <samp class="ph codeph">-code</samp> default). After that, the effective <samp class="ph codeph">-arch</samp> value will be the <em class="ph i">closest</em> virtual architecture:
                                 </p>
                              </div>
                              <div class="section">
                                 <h5 class="title sectiontitle">Example</h5>
                                 <div class="p"><pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=sm_13
nvcc x.cu -arch=compute_10</pre>
                                    
                                    are shorthands for
                                    <pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=<strong class="ph b">compute_13</strong> -code=<strong class="ph b">sm_13,compute_13</strong>
nvcc x.cu -arch=compute_10 <strong class="ph b">-code=compute_10</strong></pre></div>
                              </div>
                           </div>
                        </div>
                        <div class="topic reference nested3" id="shorthand-3"><a name="shorthand-3" shape="rect">
                              <!-- --></a><h3 class="title topictitle2"><a href="#shorthand-3" name="shorthand-3" shape="rect">6.7.2.3.&nbsp;Shorthand 3</a></h3>
                           <div class="body refbody">
                              <div class="section">
                                 <p class="p">Both <samp class="ph codeph">-arch</samp> and <samp class="ph codeph">-code</samp> options can be omitted.
                                 </p>
                              </div>
                              <div class="section">
                                 <h5 class="title sectiontitle">Example</h5>
                                 <div class="p"><pre class="pre screen" xml:space="preserve">nvcc x.cu</pre>
                                    
                                    is shorthand for
                                    <pre class="pre screen" xml:space="preserve">nvcc x.cu <strong class="ph b">-arch=compute_20 -code=sm_20,compute_20</strong></pre></div>
                              </div>
                           </div>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="extended-notation"><a name="extended-notation" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#extended-notation" name="extended-notation" shape="rect">6.7.3.&nbsp;Extended Notation</a></h3>
                        <div class="body conbody">
                           <p class="p">The options <samp class="ph codeph">-arch</samp> and <samp class="ph codeph">-code</samp> can be used in all cases where code is to be generated for one or more GPUs using a common virtual architecture. This will
                              cause a single invocation of <samp class="ph codeph">nvcc</samp> stage&nbsp;1 (that is, preprocessing and generation of virtual PTX assembly code), followed by a compilation stage 2 (binary code
                              generation) repeated for each specified GPU.
                           </p>
                           <p class="p">Using a common virtual architecture means that all assumed GPU features are fixed for the entire <samp class="ph codeph">nvcc</samp> compilation. For instance, the following <samp class="ph codeph">nvcc</samp> command assumes no warp shuffle support for both the <samp class="ph codeph">sm_20</samp> code and the <samp class="ph codeph">sm_30</samp> code:
                           </p><pre class="pre screen" xml:space="preserve">nvcc x.cu -arch=compute_20 -code=compute_20,sm_20,sm_30</pre><p class="p">Sometimes it is necessary to perform different GPU code generation steps, partitioned over different architectures. This is
                              possible using <samp class="ph codeph">nvcc</samp> option <samp class="ph codeph">-gencode</samp>, which then must be used instead of a <samp class="ph codeph">-arch</samp>/<samp class="ph codeph">-code</samp> combination.
                           </p>
                           <p class="p">Unlike option <samp class="ph codeph">-arch</samp>, option <samp class="ph codeph">-gencode</samp> may be repeated on the <samp class="ph codeph">nvcc</samp> command line. It takes sub-options <samp class="ph codeph">arch</samp> and <samp class="ph codeph">code</samp>, which must not be confused with their main option equivalents, but behave similarly. If repeated architecture compilation
                              is used, then the device code must use conditional compilation based on the value of the architecture identification macro
                              <samp class="ph codeph">__CUDA_ARCH__</samp>, which is described in the next section.
                           </p>
                           <p class="p">For example, the following assumes absence of warp shuffle support for the <samp class="ph codeph">sm_20</samp> and <samp class="ph codeph">sm_21</samp> code, but full support on <samp class="ph codeph">sm_3x</samp>:
                           </p><pre class="pre screen" xml:space="preserve">nvcc x.cu \
    -gencode arch=compute_20,code=sm_20 \
    -gencode arch=compute_20,code=sm_21 \
    -gencode arch=compute_30,code=sm_30</pre><p class="p">Or, leaving actual GPU code generation to the JIT compiler in the CUDA driver:</p><pre class="pre screen" xml:space="preserve">nvcc x.cu \
    -gencode arch=compute_20,code=compute_20 \
    -gencode arch=compute_30,code=compute_30</pre><p class="p">The code sub-options can be combined, but for technical reasons must then be quoted, which causes a slightly more complex
                              syntax:
                           </p><pre class="pre screen" xml:space="preserve">nvcc x.cu \
    -gencode arch=compute_20,code=\'sm_20,sm_21\' \
    -gencode arch=compute_30,code=\'sm_30,sm_35\'</pre></div>
                     </div>
                     <div class="topic reference nested2" id="virtual-architecture-identification-macro"><a name="virtual-architecture-identification-macro" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#virtual-architecture-identification-macro" name="virtual-architecture-identification-macro" shape="rect">6.7.4.&nbsp;Virtual Architecture Identification Macro</a></h3>
                        <div class="body refbody">
                           <div class="section refsyn">
                              <p class="p">The architecture identification macro <samp class="ph codeph">__CUDA_ARCH__</samp> is assigned a three-digit value string <samp class="ph codeph">xy0</samp> (ending in a literal <samp class="ph codeph">0</samp>) during each <samp class="ph codeph">nvcc</samp> compilation stage 1 that compiles for <samp class="ph codeph">compute_xy</samp>.
                              </p>
                              <p class="p">This macro can be used in the implementation of GPU functions for determining the virtual architecture for which it is currently
                                 being compiled. The host code (the non-GPU code) must <em class="ph i">not</em> depend on it.
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="using-separate-compilation-in-cuda"><a name="using-separate-compilation-in-cuda" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#using-separate-compilation-in-cuda" name="using-separate-compilation-in-cuda" shape="rect">7.&nbsp;Using Separate Compilation in CUDA</a></h2>
                  <div class="body conbody">
                     <p class="p">Prior to the 5.0 release, CUDA did not support separate compilation, so CUDA code could not call device functions or access
                        variables across files. Such compilation is referred to as <dfn class="term">whole program compilation</dfn>. We have always supported the separate compilation of host code; it was just the device CUDA code that had to reside entirely within
                        one file. Starting with CUDA 5.0, separate compilation of device code is supported, but the old whole program mode is still
                        the default, so there are new options to invoke separate compilation.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="code-changes-for-separate-compilation"><a name="code-changes-for-separate-compilation" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#code-changes-for-separate-compilation" name="code-changes-for-separate-compilation" shape="rect">7.1.&nbsp;Code Changes for Separate Compilation</a></h3>
                     <div class="body conbody">
                        <p class="p">The code changes required for separate compilation of device code are the same as what you already do for host code, namely
                           using <samp class="ph codeph">extern</samp> and <samp class="ph codeph">static</samp> to control the visibility of symbols. Note that previously <samp class="ph codeph">extern</samp> was ignored in CUDA code; now it will be honored. With the use of <samp class="ph codeph">static</samp> it is possible to have multiple device symbols with the same name in different files. For this reason, the CUDA API calls
                           that referred to symbols by their string name are deprecated; instead the symbol should be referenced by its address.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="nvcc-options-for-separate-compilation"><a name="nvcc-options-for-separate-compilation" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#nvcc-options-for-separate-compilation" name="nvcc-options-for-separate-compilation" shape="rect">7.2.&nbsp;NVCC Options for Separate Compilation</a></h3>
                     <div class="body conbody">
                        <p class="p">CUDA works by embedding device code into host objects. In whole program compilation, it embeds executable device code into
                           the host object.&nbsp; In separate compilation, we embed relocatable device code into the host object, and run the device linker
                           (<samp class="ph codeph">nvlink</samp>) to link all the device code together.&nbsp; The output of nvlink is then linked together with all the host objects by the host
                           linker to form the final executable.
                        </p>
                        <p class="p">The generation of relocatable vs executable device code is controlled by the <samp class="ph codeph">--relocatable-device-code={true,false}</samp> option, which can be shortened to <samp class="ph codeph">-rdc={true,false}</samp>.
                        </p>
                        <p class="p">The <samp class="ph codeph">-c</samp> option is already used to control stopping a compile at a host object, so a new option <samp class="ph codeph">--device-c</samp> (or <samp class="ph codeph">-dc</samp>) is added that simply does <samp class="ph codeph">-c --relocatable-device-code=true</samp>.
                        </p>
                        <div class="p">To invoke just the device linker, the <samp class="ph codeph">--device-link</samp> (<samp class="ph codeph">-dlink</samp>) option can be used, which emits a host object containing the embedded executable device code.&nbsp; The output of that must then
                           be passed to the host linker.&nbsp;Or:
                           <pre xml:space="preserve">nvcc &lt;objects&gt;</pre>
                           can be used to implicitly call both the device and host linkers as long as the architecture is <samp class="ph codeph">sm_20</samp> or higher. This works because if the device linker does not see any relocatable code it does not do anything.
                        </div>
                        <p class="p">A diagram of the flow is as follows:</p>
                        <div class="fig fignone" id="nvcc-options-for-separate-compilation__nvcc-options-for-separate-compilation"><a name="nvcc-options-for-separate-compilation__nvcc-options-for-separate-compilation" shape="rect">
                              <!-- --></a><img class="image" src="graphics/nvcc-options-for-separate-compilation.png" alt="Flow diagram of nvcc options for separate compilation"></img></div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="libraries"><a name="libraries" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#libraries" name="libraries" shape="rect">7.3.&nbsp;Libraries</a></h3>
                     <div class="body conbody">
                        <p class="p">The device linker has the ability to read the static host library formats (<samp class="ph codeph">.a</samp> on Linux and Mac, <samp class="ph codeph">.lib</samp> on Windows). It ignores any dynamic (<samp class="ph codeph">.so</samp> or <samp class="ph codeph">.dll</samp>) libraries. The <samp class="ph codeph">-l</samp> and <samp class="ph codeph">-L</samp> options can be used to pass libraries to both the device and host linker.
                           The library name is specified without the library file extension when
                           the <samp class="ph codeph">-l</samp> option is used.
                        </p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_20 a.o b.o -L&lt;path&gt; -lfoo</pre><p class="p">Alternatively, the library name, including the library file
                           extension, can be used without the <samp class="ph codeph">-l</samp> option on
                           Windows.
                        </p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_20 a.obj b.obj foo.lib -L&lt;path&gt;</pre><p class="p">Note that the device linker ignores any objects that do not have relocatable device code.</p>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="examples"><a name="examples" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#examples" name="examples" shape="rect">7.4.&nbsp;Examples</a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">Suppose we have the following files:</p>
                        </div>
                        <div class="example">
                           <h3 class="title sectiontitle">******* b.h ***********</h3><pre xml:space="preserve">#define N 8

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">extern</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> g[N];

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">extern</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> bar(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span>);</pre></div>
                        <div class="example">
                           <h3 class="title sectiontitle">******* b.cu***********</h3><pre xml:space="preserve">#include <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"b.h"</span>

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> g[N];

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> bar (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span>)
{
  g[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x]++;
}</pre></div>
                        <div class="example">
                           <h3 class="title sectiontitle">******* a.cu ***********</h3><pre xml:space="preserve">#include &lt;stdio.h&gt;
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#include "b.h"</span>

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__global__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> foo (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span>) {

  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__shared__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> a[N];
  a[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x;

  __syncthreads();

  g[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x] = a[<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">blockDim</span>.x - <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">threadIdx</span>.x - 1];

  bar();
}

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> main (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span>) {
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">unsigned</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> i;
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> *dg, hg[N];
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> sum = 0;

  foo<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&lt;&lt;&lt;</span>1, N<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">&gt;&gt;&gt;</span>();

  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span>(cudaGetSymbolAddress((<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span>**)&amp;dg, g)){
      printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"couldn't get the symbol addr\n"</span>);
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> 1;
  }
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span>(cudaMemcpy(hg, dg, N * <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">sizeof</span>(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span>), cudaMemcpyDeviceToHost)){
      printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"couldn't memcpy\n"</span>);
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> 1;
  }

  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">for</span> (i = 0; i &lt; N; i++) {
    sum += hg[i];
  }
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (sum == 36) {
    printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"PASSED\n"</span>);
  } <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">else</span> {
    printf(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"FAILED (%d)\n"</span>, sum);
  }

  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> 0;
}</pre></div>
                        <div class="section">
                           <p class="p">These can be compiled with the following commands (these examples are for Linux):</p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_20 -dc a.cu b.cu
nvcc -arch=sm_20 a.o b.o</pre><p class="p">If you want to invoke the device and host linker separately, you can do:</p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_20 -dc a.cu b.cu
nvcc -arch=sm_20 -dlink a.o b.o -o link.o
g++ a.o b.o link.o -L&lt;path&gt; -lcudart</pre><p class="p">Note that a target architecture must be passed to the device linker. If you invoke the device linker without a target arch,
                              e.g.,
                           </p><pre class="pre screen" xml:space="preserve">nvcc -dlink a.o b.o</pre><p class="p">You will get an error because that defaults to <samp class="ph codeph">sm_10</samp>.
                           </p>
                           <p class="p">The objects could be put into a library and used with:</p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_20 -dc a.cu b.cu
nvcc -lib a.o b.o -o test.a
nvcc -arch=sm_20 test.a</pre><p class="p">Note that only static libraries are supported by the device linker.</p>
                           <p class="p">A PTX file can be compiled to a host object file and then linked by using:</p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_20 -dc a.ptx</pre><p class="p">An example that uses libraries, host linker, and dynamic parallelism would be:</p><pre class="pre screen" xml:space="preserve">nvcc -arch=sm_35 -dc a.cu b.cu
nvcc -arch=sm_35 -dlink a.o b.o -lcudadevrt -o link.o
g++ a.o b.o link.o -lcudadevrt -L&lt;path&gt; -lcudart</pre><p class="p">It is possible to do multiple device links within a single host executable, as long as each device link is independent of
                              the other. 
                              This requirement of independence means that they cannot share code across device executables, nor can they share addresses
                              (e.g. a device function address can be passed from host to device for a callback only if the device link sees both the caller
                              and potential callback callee; you cannot pass an address from one device executable to another, as those are separate address
                              spaces).
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="potential-separate-compilation-issues"><a name="potential-separate-compilation-issues" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#potential-separate-compilation-issues" name="potential-separate-compilation-issues" shape="rect">7.5.&nbsp;Potential Separate Compilation Issues</a></h3>
                     <div class="topic concept nested2" id="object-compatibility"><a name="object-compatibility" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#object-compatibility" name="object-compatibility" shape="rect">7.5.1.&nbsp;Object Compatibility</a></h3>
                        <div class="body conbody">
                           <p class="p">Only relocatable device code with the same ABI version, same SM target architecture, and same pointer size (32 or 64) can
                              be linked together.  Incompatible objects will produce a link error.  An object could have been compiled for a different architecture
                              but also have PTX available, in which case the device linker will JIT the PTX to cubin for the desired architecture and then
                              link.  Relocatable device code requires CUDA 5.0 or later toolkit.
                           </p>
                           <p class="p">If a kernel is limited to a certain number of registers with the <samp class="ph codeph">launch_bounds</samp> attribute or the <samp class="ph codeph">-maxrregcount</samp> option, then all functions that the kernel calls must not use more than that number of registers; if they exceed the limit,
                              then a link error will be given.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="jit-linking-not-supported"><a name="jit-linking-not-supported" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#jit-linking-not-supported" name="jit-linking-not-supported" shape="rect">7.5.2.&nbsp;JIT Linking Support</a></h3>
                        <div class="body conbody">
                           <p class="p">CUDA 5.0 does not support JIT linking, while CUDA 5.5 does.  This means that to use JIT linking you must recompile your code
                              with CUDA 5.5 or later.  JIT linking means doing a relink of the code at startup time.  The device linker (<samp class="ph codeph">nvlink</samp>) links at the cubin level. If the cubin does not match the target architecture at load time, the driver re-invokes the device
                              linker to generate cubin for the target architecture, by first JIT'ing the PTX for each object to the appropriate cubin, and
                              then linking together the new cubin.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="implicit-cuda-host-code"><a name="implicit-cuda-host-code" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#implicit-cuda-host-code" name="implicit-cuda-host-code" shape="rect">7.5.3.&nbsp;Implicit CUDA Host Code</a></h3>
                        <div class="body conbody">
                           <p class="p">A file like <samp class="ph codeph">b.cu</samp> above only contains CUDA device code, so one might think that the b.o object doesn't need to be passed to the host linker.
                              But actually there is implicit host code generated whenever a device symbol can be accessed from the host side, either via
                              a launch or an API call like <samp class="ph codeph">cudaGetSymbolAddress()</samp>. This implicit host code is put into <samp class="ph codeph">b.o</samp>, and needs to be passed to the host linker. Plus, for JIT linking to work all device code must be passed to the host linker,
                              else the host executable will not contain device code needed for the JIT link.  So a general rule is that the device linker
                              and host linker must see the same host object files (if the object files have any device references in them - if a file is
                              pure host then the device linker doesn't need to see it).
                           </p>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="miscellaneous-nvcc-usage"><a name="miscellaneous-nvcc-usage" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#miscellaneous-nvcc-usage" name="miscellaneous-nvcc-usage" shape="rect">8.&nbsp;Miscellaneous NVCC Usage</a></h2>
                  <div class="topic concept nested1" id="printing-code-generation-statistics"><a name="printing-code-generation-statistics" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#printing-code-generation-statistics" name="printing-code-generation-statistics" shape="rect">8.1.&nbsp;Printing Code Generation Statistics</a></h3>
                     <div class="body conbody">
                        <p class="p">A summary of the number of registers used and the amount of memory needed per compiled device function can be printed by passing
                           option <samp class="ph codeph">-v</samp> to <samp class="ph codeph">ptxas</samp>:
                        </p><pre class="pre screen" xml:space="preserve">nvcc -Xptxas -v acos.cu
ptxas info   : Compiling entry function 'acos_main'
ptxas info   : Used 4 registers, 60+56 bytes lmem, 44+40 bytes smem, 
               20 bytes cmem[1], 12 bytes cmem[14]</pre><p class="p">As shown in the above example, the amounts of local and shared memory are listed by two numbers each. The first number represents
                           the total size of all the variables declared in that memory segment and the second number represents the amount of system
                           allocated data. The amount and location of system allocated data as well as the allocation of constant variables to constant
                           banks is profile specific. For constant memory, the total space allocated in that bank is shown.
                        </p>
                        <p class="p">If separate compilation is used, some of this info must come from the device linker, so you should use <samp class="ph codeph">nvcc -Xnvlink -v</samp>.
                        </p>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="notices-header"><a name="notices-header" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#notices-header" name="notices-header" shape="rect">Notices</a></h2>
                  <div class="topic reference nested1" id="notice"><a name="notice" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#notice" name="notice" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Notice</h3>
                           <p class="p">ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS, LISTS, AND OTHER DOCUMENTS (TOGETHER AND
                              SEPARATELY, "MATERIALS") ARE BEING PROVIDED "AS IS." NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE
                              WITH RESPECT TO THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS
                              FOR A PARTICULAR PURPOSE. 
                           </p>
                           <p class="p">Information furnished is believed to be accurate and reliable. However, NVIDIA Corporation assumes no responsibility for the
                              consequences of use of such information or for any infringement of patents or other rights of third parties that may result
                              from its use. No license is granted by implication or otherwise under any patent rights of NVIDIA Corporation. Specifications
                              mentioned in this publication are subject to change without notice. This publication supersedes and replaces all other information
                              previously supplied. NVIDIA Corporation products are not authorized as critical components in life support devices or systems
                              without express written approval of NVIDIA Corporation.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="trademarks"><a name="trademarks" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#trademarks" name="trademarks" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Trademarks</h3>
                           <p class="p">NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation
                              in the U.S. and other countries.  Other company and product names may be trademarks of
                              the respective companies with which they are associated.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="copyright-past-to-present"><a name="copyright-past-to-present" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#copyright-past-to-present" name="copyright-past-to-present" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Copyright</h3>
                           <p class="p">© <span class="ph">2007</span>-<span class="ph">2014</span> NVIDIA
                              Corporation. All rights reserved.
                           </p>
                           <p class="p">This product includes software developed by the Syncro Soft SRL (http://www.sync.ro/).</p>
                        </div>
                     </div>
                  </div>
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script></body>
</html>