Sophie

Sophie

distrib > Mageia > 7 > x86_64 > media > nonfree-updates > by-pkgid > b86a85131cc739c1c53d0b55840a4328 > files > 2439

nvidia-cuda-toolkit-devel-10.1.168-1.2.mga7.nonfree.x86_64.rpm

<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="Inline PTX Assembly in CUDA"></meta>
      <meta name="abstract" content="The reference guide for inlining PTX (parallel thread execution) assembly statements into CUDA."></meta>
      <meta name="description" content="The reference guide for inlining PTX (parallel thread execution) assembly statements into CUDA."></meta>
      <meta name="DC.Coverage" content="Programming Guides"></meta>
      <meta name="DC.subject" content="CUDA PTX, CUDA PTX ASM parameters, CUDA PTX ASM constraints, CUDA PTX namespace conflicts, CUDA PTX memory space conflicts, CUDA PTX incorrect optimization, CUDA PTX ptxas error, CUDA PTX error checks, CUDA PTX multiple instructions, CUDA PTX operands distinction"></meta>
      <meta name="keywords" content="CUDA PTX, CUDA PTX ASM parameters, CUDA PTX ASM constraints, CUDA PTX namespace conflicts, CUDA PTX memory space conflicts, CUDA PTX incorrect optimization, CUDA PTX ptxas error, CUDA PTX error checks, CUDA PTX multiple instructions, CUDA PTX operands distinction"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="abstract"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>Inline PTX Assembly :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="//assets.adobedtm.com/b92787824f2e0e9b68dc2e993f9bd995339fe417/satelliteLib-7ba51e58dc61bcb0e9311aadd02a0108ab24cc6c.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/inline-ptx-assembly/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit 
                  
                  
                  v10.1.168</a></div>
            <div class="category"><a href="index.html" title="Inline PTX Assembly">Inline PTX Assembly</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="#using-inline-ptx-assembly-in-cuda">1.&nbsp;Using Inline PTX Assembly in CUDA</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#assembler-asm-statements">1.1.&nbsp;Assembler (ASM) Statements</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#parameters">1.1.1.&nbsp;Parameters</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#constraints">1.1.2.&nbsp;Constraints</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#pitfalls">1.2.&nbsp;Pitfalls</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="#namespace-conflicts">1.2.1.&nbsp;Namespace Conflicts</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#memory-space-conflicts">1.2.2.&nbsp;Memory Space Conflicts</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#incorrect-optimization">1.2.3.&nbsp;Incorrect Optimization</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="#incorrect-ptx">1.2.4.&nbsp;Incorrect PTX</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="#error-checking">1.3.&nbsp;Error Checking</a></div>
                     </li>
                  </ul>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="release-info">Inline PTX Assembly
                  (<a href="../../pdf/Inline_PTX_Assembly.pdf">PDF</a>)
                  -
                   
                  
                  
                  v10.1.168
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated April 24, 2019
                  -
                  <a href="mailto:CUDAIssues@nvidia.com?subject=CUDA Toolkit Documentation Feedback: Inline PTX Assembly">Send Feedback</a></div>
            </div>
            <article id="contents">
               <div class="topic nested0" id="abstract"><a name="abstract" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#abstract" name="abstract" shape="rect">Inline PTX Assembly in CUDA</a></h2>
                  <div class="body conbody">
                     <p class="shortdesc">The reference guide for inlining PTX (parallel thread execution)
                        assembly statements into CUDA.
                     </p>
                  </div>
               </div>
               <div class="topic concept nested0" id="using-inline-ptx-assembly-in-cuda"><a name="using-inline-ptx-assembly-in-cuda" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#using-inline-ptx-assembly-in-cuda" name="using-inline-ptx-assembly-in-cuda" shape="rect">1.&nbsp;Using Inline PTX Assembly in CUDA</a></h2>
                  <div class="body conbody">
                     <p class="p">The NVIDIA<sup>®</sup>
                        CUDA<sup>®</sup>  programming environment provides a parallel thread execution
                        (<dfn class="term">PTX</dfn>) instruction set architecture (ISA) for using the GPU as a data-parallel
                        computing device. For more information on the PTX ISA, refer to the latest version of the
                        <a class="xref" href="http://docs.nvidia.com/cuda/parallel-thread-execution/index.html" target="_blank" shape="rect"><em class="ph i">PTX ISA reference document</em></a>.
                     </p>
                     <p class="p">This application note describes how to inline PTX assembly language statements into CUDA
                        code.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="assembler-asm-statements"><a name="assembler-asm-statements" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#assembler-asm-statements" name="assembler-asm-statements" shape="rect">1.1.&nbsp;Assembler (ASM) Statements</a></h3>
                     <div class="body conbody">
                        <div class="p">Assembler statements, <samp class="ph codeph">asm()</samp>, provide a way to insert arbitrary
                           <dfn class="term">PTX</dfn> code into your CUDA program.  A simple example is:
                           <pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"membar.gl;"</span>);</pre></div>
                        <p class="p">This inserts a PTX <samp class="ph codeph">membar.gl</samp> into your generated PTX code at the point
                           of the <samp class="ph codeph">asm()</samp> statement.
                        </p>
                     </div>
                     <div class="topic concept nested2" id="parameters"><a name="parameters" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#parameters" name="parameters" shape="rect">1.1.1.&nbsp;Parameters</a></h3>
                        <div class="body conbody">
                           <p class="p">An <samp class="ph codeph">asm()</samp> statement becomes more complicated, and more useful, when we pass
                              values in and out of the <dfn class="term">asm</dfn>.  The basic syntax is as follows:
                           </p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"template-string"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"constraint"</span>(output) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"constraint"</span>(input));</pre><p class="p">where you can have multiple input or output operands separated by commas.  The template
                              string contains <dfn class="term">PTX</dfn> instructions with references to the operands.  Multiple PTX
                              instructions can be given by separating them with semicolons.
                           </p>
                           <p class="p">A simple example is as follows:</p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %1, %2;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(j), <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre><p class="p">Each <samp class="ph codeph">%n</samp> in the template string is an index into the following list of
                              operands, in text order. So <samp class="ph codeph">%0</samp> refers to the first operand,
                              <samp class="ph codeph">%1</samp> to the second operand, and so on. Since the output operands are always
                              listed ahead of the input operands, they are assigned the smallest indices.  This example is
                              conceptually equivalent to the following:
                           </p><pre xml:space="preserve">add.s32 i, j, k;</pre><p class="p">Note that the numbered references in the string can be in arbitrary order.  The following is
                              equivalent to the above example:
                           </p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %2, %1;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k), <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(j));</pre><p class="p">You can also repeat a reference, e.g.:</p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %1, %1;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre><p class="p">is conceptually</p><pre xml:space="preserve">add.s32 i, k, k;</pre><p class="p">If there is no input operand, you can drop the final colon, e.g.:</p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"mov.s32 %0, 2;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i));</pre><p class="p">If there is no output operand, the colon separators are adjacent, e.g.:</p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"mov.s32 r1, %0;"</span> :: <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(i));</pre><p class="p">If you want the <samp class="ph codeph">%</samp> in a ptx instruction, then you should escape it with
                              double <samp class="ph codeph">%%</samp>, e.g.:
                           </p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"mov.u32 %0, %%clock;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(x));</pre><p class="p">The above was simplified to explain the ordering of the string <samp class="ph codeph">%</samp>
                              references.  In reality, the operand values are passed via whatever mechanism the constraint
                              specifies.  The full list of constraints will be explained later, but the "r" constraint
                              refers to a 32bit integer register.  So the earlier example <samp class="ph codeph">asm()</samp>
                              statement:
                           </p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %1, %2;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(j), <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre><p class="p">produces the following code sequence in the output generated by the compiler:</p><pre xml:space="preserve">ld.s32 r1, [j];
ld.s32 r2, [k];
add.s32 r3, r1, r2;
st.s32 [i], r3;</pre><p class="p">This is where the distinction between input and output operands becomes important.  The input
                              operands are loaded into registers before the <samp class="ph codeph">asm()</samp> statement, then the
                              result register is stored to the output operand.  The "=" modifier in "=r" specifies that the
                              register is written to.  There is also available a "+" modifier that specifies the register is
                              both read and written, e.g.:
                           </p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %0, %1;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"+r"</span>(i) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span> (j));</pre><p class="p">Multiple instructions can be combined into a single <samp class="ph codeph">asm()</samp> statement;
                              basically, anything legal can be put into the asm string.  Multiple instructions can be split
                              across multiple lines by making use of C/C++'s implicit string concatenation. Both C++ style
                              line end comments "//" and classical C-style comments "/**/" can be interspersed with these
                              strings. To generate readable output in the PTX intermediate file it is best practice to
                              terminate each instruction string except the last one with "\n\t".
                           </p>
                           <p class="p">For example, a cube routine could be written as:</p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> cube (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> x)
{
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> y;
  asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">".reg .u32 t1;\n\t"</span>              <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// temp reg t1</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" mul.lo.u32 t1, %1, %1;\n\t"</span>    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// t1 = x * x</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" mul.lo.u32 %0, t1, %1;"</span>        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// y = t1 * x</span>
      : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(y) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span> (x));
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> y;
}</pre><p class="p">If an output operand is conditionally updated by the asm instructions, then the "+" modifier
                              should be used. There is an implicit use of the output operand in such a case. For example, 
                           </p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> cond (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> x)
{
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> y = 0;
  asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"{\n\t"</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" .reg .pred %p;\n\t"</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" setp.eq.s32 %p, %1, 34;\n\t"</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// x == 34?</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" @%p mov.s32 %0, 1;\n\t"</span>      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// set y to 1 if true</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"}"</span>                            <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// conceptually y = (x==34)?1:y</span>
      : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"+r"</span>(y) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span> (x));          
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> y;
}</pre></div>
                     </div>
                     <div class="topic concept nested2" id="constraints"><a name="constraints" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#constraints" name="constraints" shape="rect">1.1.2.&nbsp;Constraints</a></h3>
                        <div class="body conbody">
                           <p class="p">There is a separate constraint letter for each PTX register type:</p><pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"h"</span> = .u16 reg
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span> = .u32 reg
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"l"</span> = .u64 reg
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"f"</span> = .f32 reg
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"d"</span> = .f64 reg</pre><p class="p">The constraint "n" may be used for immediate integer operands with a 
                              known value.
                           </p>
                           <p class="p">For example:</p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"cvt.f32.s64 %0, %1;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=f"</span>(x) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"l"</span>(y));</pre><p class="p">will generate:</p><pre xml:space="preserve">ld.s64 rd1, [y];
cvt.f32.s64 f1, rd1;
st.f32 [x], f1;</pre><p class="p">Note that there are some constraints supported in earlier versions of
                              the compiler like "m" and "s" that are not guaranteed to work across all
                              versions and thus should not be used.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="pitfalls"><a name="pitfalls" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#pitfalls" name="pitfalls" shape="rect">1.2.&nbsp;Pitfalls</a></h3>
                     <div class="body conbody">
                        <p class="p">Although <samp class="ph codeph">asm()</samp> statements are very flexible and powerful, you may
                           encounter some pitfalls—these are listed in this section.
                        </p>
                     </div>
                     <div class="topic concept nested2" id="namespace-conflicts"><a name="namespace-conflicts" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#namespace-conflicts" name="namespace-conflicts" shape="rect">1.2.1.&nbsp;Namespace Conflicts</a></h3>
                        <div class="body conbody">
                           <p class="p">If the cube function (described before) is called and inlined multiple
                              times in the code, it generates an error about duplicate definitions of
                              the temp register t1. To avoid this error you need to:
                           </p>
                           <ul class="ul">
                              <li class="li">not inline the cube function, or,</li>
                              <li class="li">nest the t1 use inside <samp class="ph codeph">{}</samp> so that it has a separate
                                 scope for each invocation, e.g.:
                                 <pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-attribute">__device__</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> cube (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> x)
{
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> y;
  asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"{\n\t"</span>                        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// use braces for local scope</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" reg .u32 t1;\n\t"</span>            <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// temp reg t1, </span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" mul.lo.u32 t1, %1, %1;\n\t"</span>  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// t1 = x * x</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">" mul.lo.u32 %0, t1, %1;\n\t"</span>  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// y = t1 * x</span>
      <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"}"</span>
      : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(y) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span> (x));
  <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> y;
}</pre></li>
                           </ul>
                           <p class="p">Note that you can similarly use braces for local labels inside the
                              <samp class="ph codeph">asm()</samp> statement.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="memory-space-conflicts"><a name="memory-space-conflicts" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#memory-space-conflicts" name="memory-space-conflicts" shape="rect">1.2.2.&nbsp;Memory Space Conflicts</a></h3>
                        <div class="body conbody">
                           <p class="p">Since <samp class="ph codeph">asm()</samp> statements have no way of knowing what memory space a
                              register is in, the user must make sure that the appropriate <dfn class="term">PTX</dfn>
                              instruction is used.  For <samp class="ph codeph">sm_20</samp> and greater, any pointer argument to an
                              <samp class="ph codeph">asm()</samp> statement is passed as a generic address.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" id="incorrect-optimization"><a name="incorrect-optimization" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#incorrect-optimization" name="incorrect-optimization" shape="rect">1.2.3.&nbsp;Incorrect Optimization</a></h3>
                        <div class="body conbody">
                           <p class="p">The compiler assumes that an <samp class="ph codeph">asm()</samp> statement has no side effects except
                              to change the output operands.  To ensure that the asm is not deleted or moved during generation of PTX, you
                              should use the volatile keyword, e.g.:
                           </p><pre xml:space="preserve">asm <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">volatile</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"mov.u32 %0, %%clock;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(x));</pre><p class="p">Normally any memory that is written to will be specified as an out operand, but if there
                              is a hidden side effect on user memory (for example, indirect access of a memory
                              location via an operand), or if you want to stop any memory optimizations around the
                              <samp class="ph codeph">asm()</samp> statement performed during generation of PTX, you can add a "memory" clobbers specification
                              after a 3rd colon, e.g.:
                           </p><pre xml:space="preserve">asm <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">volatile</span> (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"mov.u32 %0, %%clock;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(x) :: <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"memory"</span>);
asm (<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"st.u32 [%0], %1;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(p), <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(x) :: <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"memory"</span>);
</pre></div>
                     </div>
                     <div class="topic concept nested2" id="incorrect-ptx"><a name="incorrect-ptx" shape="rect">
                           <!-- --></a><h3 class="title topictitle2"><a href="#incorrect-ptx" name="incorrect-ptx" shape="rect">1.2.4.&nbsp;Incorrect PTX</a></h3>
                        <div class="body conbody">
                           <p class="p">The compiler front end does not parse the <samp class="ph codeph">asm()</samp> statement template
                              string and does not know what it means or even whether it is valid <dfn class="term">PTX</dfn>
                              input. So if there are any errors in the string it will not show up until
                              <samp class="ph codeph">ptxas</samp>. For example, if you pass a value with an “r” constraint but
                              use it in an <samp class="ph codeph">add.f64</samp> you will get a parse error from ptxas. Similarly,
                              operand modifiers are not supported. For example, in
                           </p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"mov.u32 %0, %n1;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(n) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(1));</pre><p class="p">the ‘n’ modifier in “%n1” is not supported and will be passed to <samp class="ph codeph">ptxas</samp>,
                              where it can cause undefined behavior. Refer to the document <cite class="cite">nvcc.pdf </cite>for
                              further compiler related details.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="error-checking"><a name="error-checking" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#error-checking" name="error-checking" shape="rect">1.3.&nbsp;Error Checking</a></h3>
                     <div class="body conbody">
                        <p class="p">The following are some of the error checks that the compiler will do on
                           				inline<dfn class="term">PTX</dfn><dfn class="term">asm</dfn>:
                        </p>
                        <ul class="ul">
                           <li class="li">Multiple constraint letters for a single asm operand are not allowed, e.g.:
                              				<pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %1, %2;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"rf"</span>(j), <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre>
                              				error: an asm operand may specify only one constraint letter in a
                              				__device__/__global__ function </li>
                           <li class="li">Only scalar variables are allowed as asm operands. Specifically aggregates like
                              				‘struct’ type variables are not allowed,
                              				e.g.<pre xml:space="preserve">int4 i4;
asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0, %1, %2;"</span> : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(i4) : <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(j), <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre>error:
                              				an asm operand must have scalar type</li>
                           <li class="li">The type and size implied by a PTX asm constraint must match that of the associated
                              				operand. Example where size does not match:
                              <div class="p">For ‘char’ type variable “ci”,
                                 					<pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0,%1,%2;"</span>:<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(ci):<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(j),<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre>
                                 					error: asm operand type size(1) does not match type/size implied by constraint
                                 					'r'</div>
                              <div class="p">In order to use ‘char’ type variables “ci”, “cj”, and “ck” in the
                                 					above asm statement, code segment similar to the following may be
                                 					used,<pre xml:space="preserve"><span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> temp = ci;
asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0,%1,%2;"</span>:<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(temp):<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>((<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span>)cj),<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>((<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span>)ck));
ci = temp;</pre></div>
                              <p class="p">Another
                                 					example where type does not match:
                              </p>
                              <p class="p">For ‘float’ type variable “fi”,</p><pre xml:space="preserve">asm(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"add.s32 %0,%1,%2;"</span>:<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"=r"</span>(fi):<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(j),<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-string">"r"</span>(k));</pre><p class="p">error: asm operand type size(4) does not match type/size implied by constraint
                                 					'r'
                              </p>
                           </li>
                        </ul>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="notices-header"><a name="notices-header" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#notices-header" name="notices-header" shape="rect">Notices</a></h2>
                  <div class="topic reference nested1" id="notice"><a name="notice" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#notice" name="notice" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Notice</h3>
                           <p class="p">ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS, LISTS, AND OTHER DOCUMENTS (TOGETHER AND
                              SEPARATELY, "MATERIALS") ARE BEING PROVIDED "AS IS." NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE
                              WITH RESPECT TO THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS
                              FOR A PARTICULAR PURPOSE. 
                           </p>
                           <p class="p">Information furnished is believed to be accurate and reliable. However, NVIDIA Corporation assumes no responsibility for the
                              consequences of use of such information or for any infringement of patents or other rights of third parties that may result
                              from its use. No license is granted by implication of otherwise under any patent rights of NVIDIA Corporation. Specifications
                              mentioned in this publication are subject to change without notice. This publication supersedes and replaces all other information
                              previously supplied. NVIDIA Corporation products are not authorized as critical components in life support devices or systems
                              without express written approval of NVIDIA Corporation.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="trademarks"><a name="trademarks" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#trademarks" name="trademarks" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Trademarks</h3>
                           <p class="p">NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation
                              in the U.S. and other countries.  Other company and product names may be trademarks of
                              the respective companies with which they are associated.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="copyright-past-to-present"><a name="copyright-past-to-present" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#copyright-past-to-present" name="copyright-past-to-present" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Copyright</h3>
                           <p class="p">© <span class="ph">2012</span>-<span class="ph">2019</span> NVIDIA
                              Corporation. All rights reserved.
                           </p>
                           <p class="p">This product includes software developed by the Syncro Soft SRL (http://www.sync.ro/).</p>
                        </div>
                     </div>
                  </div>
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script><script type="text/javascript">_satellite.pageBottom();</script></body>
</html>