Sophie: nvidia-cuda-toolkit-devel-6.5.14-6.1.mga5.nonfree x86

nvidia-cuda-toolkit-devel-6.5.14-6.1.mga5.nonfree.x86_64.rpm

<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="cuda_reference"></meta>
      <meta name="DC.Title" content="Usage"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="r_main"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>CUPTI :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="../common/scripts/tynt/tynt.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/cupti/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit
                  v6.5</a></div>
            <div class="category"><a href="index.html" title="CUPTI">CUPTI</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="r_main.html#r_main">1.&nbsp;Usage</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_compatibility_requirements">1.1.&nbsp;CUPTI Compatibility and Requirements</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_initialization">1.2.&nbsp;CUPTI Initialization</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_activity">1.3.&nbsp;CUPTI Activity API</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_context_activity">1.3.1.&nbsp;Context Activity Record</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_legacy_activity">1.3.2.&nbsp;Legacy Activity Records</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_callback_api">1.4.&nbsp;CUPTI Callback API</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_driver_runtime_api_callback">1.4.1.&nbsp;Driver and Runtime API Callbacks</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_resource_callbacks">1.4.2.&nbsp;Resource Callbacks</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_synchronization_callbacks">1.4.3.&nbsp;Synchronization Callbacks</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_nvtx_callbacks">1.4.4.&nbsp;NVIDIA Tools Extension Callbacks</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_event_api">1.5.&nbsp;CUPTI Event API</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_collecting_kernel_execution_events">1.5.1.&nbsp;Collecting Kernel Execution Events</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="r_main.html#r_sampling_events">1.5.2.&nbsp;Sampling Events</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_metric_api">1.6.&nbsp;CUPTI Metric API</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="r_main.html#r_samples">1.7.&nbsp;Samples</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="modules.html#modules">2.&nbsp;Modules</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="group__CUPTI__VERSION__API.html#group__CUPTI__VERSION__API">2.1.&nbsp;CUPTI Version</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="group__CUPTI__RESULT__API.html#group__CUPTI__RESULT__API">2.2.&nbsp;CUPTI Result Codes</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="group__CUPTI__ACTIVITY__API.html#group__CUPTI__ACTIVITY__API">2.3.&nbsp;CUPTI Activity API</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="group__CUPTI__CALLBACK__API.html#group__CUPTI__CALLBACK__API">2.4.&nbsp;CUPTI Callback API</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="group__CUPTI__EVENT__API.html#group__CUPTI__EVENT__API">2.5.&nbsp;CUPTI Event API</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="group__CUPTI__METRIC__API.html#group__CUPTI__METRIC__API">2.6.&nbsp;CUPTI Metric API</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="annotated.html#annotated">3.&nbsp;Data Structures</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="structCUpti__Activity.html#structCUpti__Activity">3.1.&nbsp;CUpti_Activity</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityAPI.html#structCUpti__ActivityAPI">3.2.&nbsp;CUpti_ActivityAPI</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityAutoBoostState.html#structCUpti__ActivityAutoBoostState">3.3.&nbsp;CUpti_ActivityAutoBoostState</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityBranch.html#structCUpti__ActivityBranch">3.4.&nbsp;CUpti_ActivityBranch</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityBranch2.html#structCUpti__ActivityBranch2">3.5.&nbsp;CUpti_ActivityBranch2</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityCdpKernel.html#structCUpti__ActivityCdpKernel">3.6.&nbsp;CUpti_ActivityCdpKernel</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityContext.html#structCUpti__ActivityContext">3.7.&nbsp;CUpti_ActivityContext</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityDevice.html#structCUpti__ActivityDevice">3.8.&nbsp;CUpti_ActivityDevice</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityDeviceAttribute.html#structCUpti__ActivityDeviceAttribute">3.9.&nbsp;CUpti_ActivityDeviceAttribute</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityEnvironment.html#structCUpti__ActivityEnvironment">3.10.&nbsp;CUpti_ActivityEnvironment</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityEvent.html#structCUpti__ActivityEvent">3.11.&nbsp;CUpti_ActivityEvent</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityEventInstance.html#structCUpti__ActivityEventInstance">3.12.&nbsp;CUpti_ActivityEventInstance</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityFunction.html#structCUpti__ActivityFunction">3.13.&nbsp;CUpti_ActivityFunction</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityGlobalAccess.html#structCUpti__ActivityGlobalAccess">3.14.&nbsp;CUpti_ActivityGlobalAccess</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityGlobalAccess2.html#structCUpti__ActivityGlobalAccess2">3.15.&nbsp;CUpti_ActivityGlobalAccess2</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityInstructionExecution.html#structCUpti__ActivityInstructionExecution">3.16.&nbsp;CUpti_ActivityInstructionExecution</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityKernel.html#structCUpti__ActivityKernel">3.17.&nbsp;CUpti_ActivityKernel</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityKernel2.html#structCUpti__ActivityKernel2">3.18.&nbsp;CUpti_ActivityKernel2</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMarker.html#structCUpti__ActivityMarker">3.19.&nbsp;CUpti_ActivityMarker</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMarkerData.html#structCUpti__ActivityMarkerData">3.20.&nbsp;CUpti_ActivityMarkerData</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMemcpy.html#structCUpti__ActivityMemcpy">3.21.&nbsp;CUpti_ActivityMemcpy</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMemcpy2.html#structCUpti__ActivityMemcpy2">3.22.&nbsp;CUpti_ActivityMemcpy2</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMemset.html#structCUpti__ActivityMemset">3.23.&nbsp;CUpti_ActivityMemset</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMetric.html#structCUpti__ActivityMetric">3.24.&nbsp;CUpti_ActivityMetric</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityMetricInstance.html#structCUpti__ActivityMetricInstance">3.25.&nbsp;CUpti_ActivityMetricInstance</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityModule.html#structCUpti__ActivityModule">3.26.&nbsp;CUpti_ActivityModule</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityName.html#structCUpti__ActivityName">3.27.&nbsp;CUpti_ActivityName</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="unionCUpti__ActivityObjectKindId.html#unionCUpti__ActivityObjectKindId">3.28.&nbsp;CUpti_ActivityObjectKindId</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityOverhead.html#structCUpti__ActivityOverhead">3.29.&nbsp;CUpti_ActivityOverhead</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityPreemption.html#structCUpti__ActivityPreemption">3.30.&nbsp;CUpti_ActivityPreemption</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivitySharedAccess.html#structCUpti__ActivitySharedAccess">3.31.&nbsp;CUpti_ActivitySharedAccess</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivitySourceLocator.html#structCUpti__ActivitySourceLocator">3.32.&nbsp;CUpti_ActivitySourceLocator</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityUnifiedMemoryCounter.html#structCUpti__ActivityUnifiedMemoryCounter">3.33.&nbsp;CUpti_ActivityUnifiedMemoryCounter</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ActivityUnifiedMemoryCounterConfig.html#structCUpti__ActivityUnifiedMemoryCounterConfig">3.34.&nbsp;CUpti_ActivityUnifiedMemoryCounterConfig</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__CallbackData.html#structCUpti__CallbackData">3.35.&nbsp;CUpti_CallbackData</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__EventGroupSet.html#structCUpti__EventGroupSet">3.36.&nbsp;CUpti_EventGroupSet</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__EventGroupSets.html#structCUpti__EventGroupSets">3.37.&nbsp;CUpti_EventGroupSets</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="unionCUpti__MetricValue.html#unionCUpti__MetricValue">3.38.&nbsp;CUpti_MetricValue</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ModuleResourceData.html#structCUpti__ModuleResourceData">3.39.&nbsp;CUpti_ModuleResourceData</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__NvtxData.html#structCUpti__NvtxData">3.40.&nbsp;CUpti_NvtxData</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__ResourceData.html#structCUpti__ResourceData">3.41.&nbsp;CUpti_ResourceData</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="structCUpti__SynchronizeData.html#structCUpti__SynchronizeData">3.42.&nbsp;CUpti_SynchronizeData</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="functions.html#functions">4.&nbsp;Data Fields</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="notices-header.html#notices-header">Notices</a></div>
                  <ul></ul>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="breadcrumbs"><a href="index.html" shape="rect">&lt; Previous</a> | <a href="modules.html" shape="rect">Next &gt;</a></div>
               <div id="release-info">CUPTI
                  (<a href="../../pdf/CUPTI_Library.pdf">PDF</a>)
                  -
                  
                  v6.5
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated August 1, 2014
                  -
                  <a href="mailto:cudatools@nvidia.com?subject=CUDA Toolkit Documentation Feedback: CUPTI">Send Feedback</a>
                  -
                  <span class="st_facebook"></span><span class="st_twitter"></span><span class="st_linkedin"></span><span class="st_reddit"></span><span class="st_slashdot"></span><span class="st_tumblr"></span><span class="st_sharethis"></span></div>
            </div>
            <article id="contents">
               <div class="topic nested1" id="r_main"><a name="r_main" shape="rect">
                     <!-- --></a><h2 class="topictitle2">1.&nbsp;Usage</h2>
                  <div class="body refbody">
                     <div class="section">
                        <p class="p">The <em class="ph i">CUDA Profiling Tools Interface</em> (CUPTI) enables the creation
                           of profiling and tracing tools that target CUDA applications. CUPTI
                           provides four APIs: the <em class="ph i">Activity API</em>, the <em class="ph i">Callback API</em>,
                           the <em class="ph i">Event API</em>, and the <em class="ph i">Metric API</em>. Using these APIs, you
                           can develop profiling tools that give insight into the CPU and GPU
                           behavior of CUDA applications. CUPTI is delivered as a dynamic library
                           on all platforms supported by CUDA.
                        </p>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_compatibility_requirements"><a name="r_compatibility_requirements" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.1.&nbsp;CUPTI Compatibility and Requirements</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">New versions of the CUDA driver are backwards compatible with older
                              versions of CUPTI. For example, a developer using a profiling tool
                              based on CUPTI 4.1 can update to a more recently released CUDA
                              driver. However, new versions of CUPTI are not backwards compatible
                              with older versions of the CUDA driver. For example, a developer using
                              a profiling tool based on CUPTI 4.1 must have a version of the CUDA
                              driver released with CUDA Toolkit 4.1 (or later) installed as
                              well. CUPTI calls will fail with
                              <tt class="ph tt">CUPTI_ERROR_NOT_INITIALIZED</tt> if the CUDA driver version is
                              not compatible with the CUPTI version.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_initialization"><a name="r_initialization" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.2.&nbsp;CUPTI Initialization</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">CUPTI initialization occurs lazily the first time you
                              invoke any CUPTI function. For the Activity, Event, Metric,
                              and Callback APIs there are no requirements on when this
                              initialization must occur (i.e. you can invoke the first
                              CUPTI function at any point). See the CUPTI Activity API
                              section for more information on CUPTI initialization
                              requirements for the activity API.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_activity"><a name="r_activity" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.3.&nbsp;CUPTI Activity API</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">The CUPTI Activity API allows you to asynchronously collect a
                              trace of an application's CPU and GPU CUDA activity.  The
                              following terminology is used by the activity API.
                           </p>
                           <dl class="dl">
                              <dt class="dt dlterm">Activity Record</dt>
                              <dd class="dd">CPU and GPU activity is reported in C data structures
                                 called activity records. There is a different C structure
                                 type for each activity kind (e.g.
                                 <tt class="ph tt">CUpti_ActivityMemcpy</tt>). Records are generically
                                 referred to using the <tt class="ph tt">CUpti_Activity</tt> type. This
                                 type contains only a kind field that indicates the kind of
                                 the activity record. Using this kind, the object can be cast
                                 from the generic <tt class="ph tt">CUpti_Activity</tt> type to the
                                 specific type representing the activity. See the
                                 <tt class="ph tt">printActivity</tt> function in the <a class="xref" href="r_main.html#r_samples__activity_trace_async" shape="rect">activity_trace_async</a>
                                 sample for an example.
                              </dd>
                              <dt class="dt dlterm">Activity Buffer</dt>
                              <dd class="dd">An activity buffer is used to transfer one or more
                                 activity records from CUPTI to the client.  CUPTI fills
                                 activity buffers with activity records as the corresponding
                                 activities occur on the CPU and GPU. The CUPTI client is
                                 responsible for providing empty activity buffers as
                                 necessary to ensure that no records are dropped.
                              </dd>
                           </dl>
                           <p class="p">An <em class="ph i">asynchronous</em> buffering API is implemented by
                              <tt class="ph tt">cuptiActivityRegisterCallbacks</tt> and
                              <tt class="ph tt">cuptiActivityFlushAll</tt>.
                           </p>
                           <p class="p">It is not required that the activity API be initialized before
                              CUDA, but if the activity API is not initialized before CUDA
                              some activity records may not be collected. You can force
                              initialization of the activity API by enabling one or more
                              activity kinds using <tt class="ph tt">cuptiActivityEnable</tt> or
                              <tt class="ph tt">cuptiActivityEnableContext</tt>, as shown in the
                              <tt class="ph tt">initTrace</tt> function of the <a class="xref" href="r_main.html#r_samples__activity_trace_async" shape="rect">activity_trace_async</a>
                              sample. Some activity kinds cannot be directly enabled; see the
                              API documentation for <tt class="ph tt">CUpti_ActivityKind</tt> for
                              details. Functions <tt class="ph tt">cuptiActivityEnable</tt> and
                              <tt class="ph tt">cuptiActivityEnableContext</tt> will return
                              <tt class="ph tt">CUPTI_ERROR_NOT_COMPATIBLE</tt> if the requested activity
                              kind cannot be enabled.
                           </p>
                        </div>
                        <div class="section">
                           The activity buffer API uses callbacks to request and return
                           buffers of activity records. To use the asynchronous buffering
                           API you must first register two callbacks using
                           <tt class="ph tt">cuptiActivityRegisterCallbacks</tt>. One of these callbacks
                           will be invoked whenever CUPTI needs an empty activity
                           buffer. The other callback is used to deliver a buffer
                           containing one or more activity records to the client. To
                           minimize profiling overhead the client should return as quickly
                           as possible from these callbacks. Function
                           <tt class="ph tt">cuptiActivityFlushAll</tt>
                           can be used to force CUPTI to deliver any activity buffers that
                           contain completed activity records. Functions
                           <tt class="ph tt">cuptiActivityGetAttribute</tt> and
                           <tt class="ph tt">cuptiActivitySetAttribute</tt> can be used to read and write
                           attributes that control how the buffering API behaves. See the
                           API documentation for more information.
                           
                           <p class="p">
                              The <a class="xref" href="r_main.html#r_samples__activity_trace_async" shape="rect">activity_trace_async</a>
                              sample shows how to use the activity buffer API to collect a
                              trace of CPU and GPU activity for a simple application.
                              
                           </p>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_context_activity"><a name="r_context_activity" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.3.1.&nbsp;Context Activity Record</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">In 6.0 the context activity record,
                                 <tt class="ph tt">CUpti_ActivityContext</tt>, was changed in a manner that
                                 introduced a new field into the structure. This new field was
                                 introduced in a way that preserves backward compatibility with
                                 any persisted versions of this structure.
                              </p>
                              <p class="p">The 32-bit <tt class="ph tt">computeApiKind</tt> field was replaced with
                                 two 16-bit fields, <tt class="ph tt">computeApiKind</tt> and
                                 <tt class="ph tt">defaultStreamId</tt>. Because all valid
                                 <tt class="ph tt">computeApiKind</tt> values fit within 16 bits, and because
                                 all supported CUDA platforms are little-endian, persisted
                                 context record data read with the new structure will have the
                                 correct value for <tt class="ph tt">computeApiKind</tt> and have a value of
                                 zero for <tt class="ph tt">defaultStreamId</tt>. The CUPTI client is
                                 responsible for versioning the persisted context data to
                                 recognize when the <tt class="ph tt">defaultStreamId</tt> field is valid.
                                 
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_legacy_activity"><a name="r_legacy_activity" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.3.2.&nbsp;Legacy Activity Records</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">In CUPTI 5.5 the <tt class="ph tt">CUpti_ActivityKernel2</tt> structure
                                 replaced <tt class="ph tt">CUpti_ActivityKernel</tt> as the activity record
                                 used for the CUPTI_ACTIVITY_KIND_KERNEL and
                                 CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL activity kinds. The
                                 <tt class="ph tt">CUpti_ActivityKernel</tt> definition is retained in CUPTI to
                                 enable newer versions of CUPTI to work with persisted activity
                                 record data.
                              </p>
                              <p class="p">The CUPTI client is responsible for
                                 versioning the persisted activity record data to recognize when
                                 the persisted data is stored using <tt class="ph tt">CUpti_ActivityKernel</tt>
                                 or <tt class="ph tt">CUpti_ActivityKernel2</tt>.
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_callback_api"><a name="r_callback_api" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.4.&nbsp;CUPTI Callback API</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">The CUPTI Callback API allows you to register a callback into your own
                              code. Your callback will be invoked when the application being
                              profiled calls a CUDA runtime or driver function, or when certain
                              events occur in the CUDA driver.  The following terminology is used by
                              the callback API.
                           </p>
                           <dl class="dl">
                              <dt class="dt dlterm">Callback Domain</dt>
                              <dd class="dd">Callbacks are grouped into domains to make it
                                 easier to associate your callback functions with groups of related
                                 CUDA functions or events. There are currently four callback domains,
                                 as defined by <tt class="ph tt">CUpti_CallbackDomain</tt>: a domain for CUDA
                                 runtime functions, a domain for CUDA driver functions, a domain for
                                 CUDA resource tracking, and a domain for CUDA synchronization
                                 notification.
                              </dd>
                              <dt class="dt dlterm">Callback ID</dt>
                              <dd class="dd">Each callback is given a unique ID within the
                                 corresponding callback domain so that you can identify it within
                                 your callback function. The CUDA driver API IDs are defined in
                                 <tt class="ph tt">cupti_driver_cbid.h</tt> and the CUDA runtime API IDs are
                                 defined in <tt class="ph tt">cupti_runtime_cbid.h</tt>. Both of these headers
                                 are included for you when you include <tt class="ph tt">cupti.h</tt>. The CUDA
                                 resource callback IDs are defined by
                                 <tt class="ph tt">CUpti_CallbackIdResource</tt> and the CUDA synchronization
                                 callback IDs are defined by <tt class="ph tt">CUpti_CallbackIdSync</tt>.
                              </dd>
                              <dt class="dt dlterm">Callback Function</dt>
                              <dd class="dd">Your callback function must be of type
                                 <tt class="ph tt">CUpti_CallbackFunc</tt>. This function type has two arguments
                                 that specify the callback domain and ID so that you know why the
                                 callback is occurring. The type also has a <tt class="ph tt">cbdata</tt> argument
                                 that is used to pass data specific to the callback.
                              </dd>
                              <dt class="dt dlterm">Subscriber</dt>
                              <dd class="dd">A subscriber is used to associate each of your
                                 callback functions with one or more CUDA API functions. There can be
                                 at most one subscriber initialized with <tt class="ph tt">cuptiSubscribe()</tt> at any
                                 time. Before initializing a new subscriber, the existing subscriber
                                 must be finalized with <tt class="ph tt">cuptiUnsubscribe()</tt>.
                              </dd>
                           </dl>
                           <p class="p">
                              Each callback domain is described in detail below. Unless explicitly
                              stated otherwise, calling any CUDA runtime or driver API from within
                              a callback function is not supported. Doing so may cause the
                              application to hang.
                              
                           </p>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_driver_runtime_api_callback"><a name="r_driver_runtime_api_callback" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.4.1.&nbsp;Driver and Runtime API Callbacks</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">Using the callback API with the
                                 <tt class="ph tt">CUPTI_CB_DOMAIN_DRIVER_API</tt> or
                                 <tt class="ph tt">CUPTI_CB_DOMAIN_RUNTIME_API</tt> domains, you can associate a
                                 callback function with one or more CUDA API functions. When those CUDA
                                 functions are invoked in the application, your callback function is
                                 invoked as well. For these domains, the <tt class="ph tt">cbdata</tt> argument to
                                 your callback function will be of the type
                                 <tt class="ph tt">CUpti_CallbackData</tt>.
                              </p>
                              <p class="p">It is legal to call <tt class="ph tt">cudaThreadSynchronize()</tt>,
                                 <tt class="ph tt">cudaDeviceSynchronize()</tt>, <tt class="ph tt">cudaStreamSynchronize()</tt>,
                                 <tt class="ph tt">cuCtxSynchronize()</tt>, and <tt class="ph tt">cuStreamSynchronize()</tt> from
                                 within a driver or runtime API callback function.
                                 
                              </p>
                              <p class="p">The following code shows a typical sequence used to associate a
                                 callback function with one or more CUDA API functions. To simplify the
                                 presentation, error checking code has been removed.
                              </p><pre xml:space="preserve">  CUpti_SubscriberHandle subscriber;
  MyDataStruct *my_data = ...;
  ...
  cuptiSubscribe(&amp;subscriber, 
                 (CUpti_CallbackFunc)my_callback , my_data);
  cuptiEnableDomain(1, subscriber, 
                    CUPTI_CB_DOMAIN_RUNTIME_API);</pre><p class="p">
                                 First, <tt class="ph tt">cuptiSubscribe</tt> is used to initialize a subscriber with
                                 the <tt class="ph tt">my_callback</tt> callback function. Next,
                                 <tt class="ph tt">cuptiEnableDomain</tt> is used to associate that callback with all
                                 the CUDA runtime API functions. Using this code sequence will cause
                                 <tt class="ph tt">my_callback</tt> to be called twice each time any of the CUDA
                                 runtime API functions are invoked, once on entry to the CUDA function
                                 and once just before exit from the CUDA function. CUPTI callback API
                                 functions <tt class="ph tt">cuptiEnableCallback</tt> and
                                 <tt class="ph tt">cuptiEnableAllDomains</tt> can also be used to associate CUDA API
                                 functions with a callback (see reference below for more information).
                                 
                              </p>
                              <p class="p">The following code shows a typical callback function.</p><pre xml:space="preserve">void CUPTIAPI
my_callback(void *userdata, CUpti_CallbackDomain domain,
            CUpti_CallbackId cbid, const void *cbdata)
{
  const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;
  MyDataStruct *my_data = (MyDataStruct *)userdata;
      
  if ((domain == CUPTI_CB_DOMAIN_RUNTIME_API) &amp;&amp;
      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020))  { 
    if (cbInfo-&gt;callbackSite == CUPTI_API_ENTER) {
        cudaMemcpy_v3020_params *funcParams = 
             (cudaMemcpy_v3020_params *)(cbInfo-&gt;
                 functionParams);

        size_t count = funcParams-&gt;count;
        enum cudaMemcpyKind kind = funcParams-&gt;kind;
        ...
      }
  ...</pre><p class="p">
                                 In your callback function, you use the <tt class="ph tt">CUpti_CallbackDomain</tt>
                                 and <tt class="ph tt">CUpti_CallbackId</tt> parameters to determine which CUDA API
                                 function invocation is causing this callback. In the example above, we
                                 are checking for the CUDA runtime <tt class="ph tt">cudaMemcpy</tt> function. The
                                 <tt class="ph tt">cbdata</tt> parameter holds a structure of useful
                                 information that can be used within the callback. In this case we use
                                 the <tt class="ph tt">callbackSite</tt> member of the structure to detect that the
                                 callback is occurring on entry to <tt class="ph tt">cudaMemcpy</tt>, and we use the
                                 <tt class="ph tt">functionParams</tt> member to access the parameters that were
                                 passed to <tt class="ph tt">cudaMemcpy</tt>. To access the parameters we first cast
                                 <tt class="ph tt">functionParams</tt> to a structure type corresponding to the
                                 <tt class="ph tt">cudaMemcpy</tt> function. These parameter structures are contained
                                 in <tt class="ph tt">generated_cuda_runtime_api_meta.h</tt>,
                                 <tt class="ph tt">generated_cuda_meta.h</tt>, and a number of other files. When
                                 possible these files are included for you by <tt class="ph tt">cupti.h</tt>.
                                 
                              </p>
                              <p class="p">
                                 The <strong class="ph b">callback_event</strong> and <strong class="ph b">callback_timestamp</strong> samples
                                 described on the <a class="xref" href="r_main.html#r_samples" shape="rect">samples page</a> both show how to use the callback
                                 API for the driver and runtime API domains.
                                 
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_resource_callbacks"><a name="r_resource_callbacks" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.4.2.&nbsp;Resource Callbacks</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">Using the callback API with the <tt class="ph tt">CUPTI_CB_DOMAIN_RESOURCE</tt>
                                 domain, you can associate a callback function with some CUDA resource
                                 creation and destruction events. For example, when a CUDA context is
                                 created, your callback function will be invoked with a callback ID
                                 equal to <tt class="ph tt">CUPTI_CBID_RESOURCE_CONTEXT_CREATED</tt>.  For this
                                 domain, the <tt class="ph tt">cbdata</tt> argument to your callback function will be
                                 of the type <tt class="ph tt">CUpti_ResourceData</tt>.
                              </p>
                              <p class="p">Note that the APIs <tt class="ph tt">cuptiActivityFlush</tt> and <tt class="ph tt">cuptiActivityFlushAll</tt>
                                 will result in deadlock when called from the stream destroy starting callback,
                                 identified by callback ID <tt class="ph tt">CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING</tt>.
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_synchronization_callbacks"><a name="r_synchronization_callbacks" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.4.3.&nbsp;Synchronization Callbacks</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">Using the callback API with the
                                 <tt class="ph tt">CUPTI_CB_DOMAIN_SYNCHRONIZE</tt> domain, you can associate a
                                 callback function with CUDA context and stream synchronizations. For
                                 example, when a CUDA context is synchronized, your callback function
                                 will be invoked with a callback ID equal to
                                 <tt class="ph tt">CUPTI_CBID_SYNCHRONIZE_CONTEXT_SYNCHRONIZED</tt>.  For this
                                 domain, the <tt class="ph tt">cbdata</tt> argument to your callback function will be
                                 of the type <tt class="ph tt">CUpti_SynchronizeData</tt>.
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_nvtx_callbacks"><a name="r_nvtx_callbacks" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.4.4.&nbsp;NVIDIA Tools Extension Callbacks</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">Using the callback API with the
                                 <tt class="ph tt">CUPTI_CB_DOMAIN_NVTX</tt> domain, you can associate a
                                 callback function with NVIDIA Tools Extension (NVTX) API
                                 functions. When an NVTX function is invoked in the
                                 application, your callback function is invoked as
                                 well. For this domain, the <tt class="ph tt">cbdata</tt> argument to
                                 your callback function will be of the type
                                 <tt class="ph tt">CUpti_NvtxData</tt>.
                                 
                              </p>
                              <div class="p">
                                 The NVTX library has its own convention for discovering the
                                 profiling library that will provide the implementation of the
                                 NVTX callbacks. To receive callbacks you must set the NVTX
                                 environment variables appropriately so that when the
                                 application calls an NVTX function, your profiling library
                                 receives the callbacks. The following code sequence shows a
                                 typical initialization sequence to enable NVTX callbacks and
                                 activity records.
                                 <pre xml:space="preserve">/* Set env so CUPTI-based profiling library loads on first nvtx call. */
char *inj32_path = "/path/to/32-bit/version/of/cupti/based/profiling/library";
char *inj64_path = "/path/to/64-bit/version/of/cupti/based/profiling/library";
setenv("NVTX_INJECTION32_PATH", inj32_path, 1);
setenv("NVTX_INJECTION64_PATH", inj64_path, 1);</pre></div>
                              <p class="p">The following code shows a typical sequence used to associate a
                                 callback function with one or more NVTX functions. To simplify the
                                 presentation, error checking code has been removed.
                              </p><pre xml:space="preserve">CUpti_SubscriberHandle subscriber;
MyDataStruct *my_data = ...;
...
cuptiSubscribe(&amp;subscriber, 
               (CUpti_CallbackFunc)my_callback , my_data);
cuptiEnableDomain(1, subscriber, 
                  CUPTI_CB_DOMAIN_NVTX);</pre><p class="p">
                                 First, <tt class="ph tt">cuptiSubscribe</tt> is used to initialize a subscriber with
                                 the <tt class="ph tt">my_callback</tt> callback function. Next,
                                 <tt class="ph tt">cuptiEnableDomain</tt> is used to associate that callback
                                 with all the NVTX functions. Using this code sequence will
                                 cause <tt class="ph tt">my_callback</tt> to be called once each time any of
                                 the NVTX functions are invoked. CUPTI callback API
                                 functions <tt class="ph tt">cuptiEnableCallback</tt> and
                                 <tt class="ph tt">cuptiEnableAllDomains</tt> can also be used to associate NVTX API
                                 functions with a callback (see reference below for more information).
                                 
                              </p>
                              <p class="p">The following code shows a typical callback function.</p><pre xml:space="preserve">void CUPTIAPI
my_callback(void *userdata, CUpti_CallbackDomain domain,
            CUpti_CallbackId cbid, const void *cbdata)
{
  const CUpti_NvtxData *nvtxInfo = (CUpti_NvtxData *)cbdata;
  MyDataStruct *my_data = (MyDataStruct *)userdata;
      
  if ((domain == CUPTI_CB_DOMAIN_NVTX) &amp;&amp;
      (cbid == NVTX_CBID_CORE_NameOsThreadA))  { 
    nvtxNameOsThreadA_params *params = (nvtxNameOsThreadA_params *)nvtxInfo-&gt;
             functionParams;
    ...
  }
  ...</pre><p class="p">
                                 In your callback function, you use the <tt class="ph tt">CUpti_CallbackDomain</tt>
                                 and <tt class="ph tt">CUpti_CallbackId</tt> parameters to determine which NVTX API
                                 function invocation is causing this callback. In the example above, we
                                 are checking for the <tt class="ph tt">nvtxNameOsThreadA</tt> function. The
                                 <tt class="ph tt">cbdata</tt> parameter holds a structure of useful
                                 information that can be used within the callback. In this
                                 case, we use the <tt class="ph tt">functionParams</tt> member to access the
                                 parameters that were passed to <tt class="ph tt">nvtxNameOsThreadA</tt>. To access
                                 the parameters we first cast
                                 <tt class="ph tt">functionParams</tt> to a structure type corresponding to the
                                 <tt class="ph tt">nvtxNameOsThreadA</tt> function. These parameter
                                 structures are contained in <tt class="ph tt">generated_nvtx_meta.h</tt>.
                                 
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_event_api"><a name="r_event_api" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.5.&nbsp;CUPTI Event API</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">The CUPTI Event API allows you to query, configure, start, stop, and
                              read the event counters on a CUDA-enabled device. The following
                              terminology is used by the event API.
                           </p>
                           <dl class="dl">
                              <dt class="dt dlterm">Event</dt>
                              <dd class="dd">An event is a countable activity, action, or occurrence
                                 on a device.
                              </dd>
                              <dt class="dt dlterm">Event ID</dt>
                              <dd class="dd">Each event is assigned a unique identifier. A named
                                 event will represent the same activity, action, or occurrence on all
                                 device types. But the named event may have different IDs on
                                 different device families. Use <tt class="ph tt">cuptiEventGetIdFromName</tt> to
                                 get the ID for a named event on a particular device.
                              </dd>
                              <dt class="dt dlterm">Event Category</dt>
                              <dd class="dd">Each event is placed in one of the categories
                                 defined by <tt class="ph tt">CUpti_EventCategory</tt>. The category indicates the
                                 general type of activity, action, or occurrence measured by the
                                 event.
                              </dd>
                              <dt class="dt dlterm">Event Domain</dt>
                              <dd class="dd">A device exposes one or more event domains. Each
                                 event domain represents a group of related events available on that
                                 device. A device may have multiple instances of a domain, indicating
                                 that the device can simultaneously record multiple instances of each
                                 event within that domain.
                              </dd>
                              <dt class="dt dlterm">Event Group</dt>
                              <dd class="dd">An event group is a collection of events that are
                                 managed together. The number and type of events that can be added to
                                 an event group are subject to device-specific limits. At any given
                                 time, a device may be configured to count events from a limited
                                 number of event groups. All events in an event group must belong to
                                 the same event domain.
                              </dd>
                              <dt class="dt dlterm">Event Group Set</dt>
                              <dd class="dd">An event group set is a collection of event
                                 groups that can be enabled at the same time. Event group sets are
                                 created by <tt class="ph tt">cuptiEventGroupSetsCreate</tt> and
                                 <tt class="ph tt">cuptiMetricCreateEventGroupSets</tt>.
                              </dd>
                           </dl>
                           <p class="p">
                              You can determine the events available on a device using the
                              <tt class="ph tt">cuptiDeviceEnumEventDomains</tt> and
                              <tt class="ph tt">cuptiEventDomainEnumEvents</tt> functions. The
                              <strong class="ph b">cupti_query</strong> sample described on the <a class="xref" href="r_main.html#r_samples" shape="rect">samples page</a> shows
                              how to use these functions.  You can also enumerate all the CUPTI
                              events available on any device using the
                              <tt class="ph tt">cuptiEnumEventDomains</tt> function.
                              
                           </p>
                           <p class="p">
                              Configuring and reading event counts requires the following steps.
                              First, select your event collection mode. If you want to count events
                              that occur during the execution of a kernel, use
                              <tt class="ph tt">cuptiSetEventCollectionMode</tt> to set mode
                              <tt class="ph tt">CUPTI_EVENT_COLLECTION_MODE_KERNEL</tt>. If you want to
                              continuously sample the event counts, use mode
                              <tt class="ph tt">CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS</tt>. Next determine
                              the names of the events that you want to count, and then use the
                              <tt class="ph tt">cuptiEventGroupCreate</tt>, <tt class="ph tt">cuptiEventGetIdFromName</tt>, and
                              <tt class="ph tt">cuptiEventGroupAddEvent</tt> functions to create and initialize an
                              event group with those events. If you are unable to add all the events
                              to a single event group then you will need to create multiple event
                              groups. Alternatively, you can use the
                              <tt class="ph tt">cuptiEventGroupSetsCreate</tt> function to automatically create
                              the event group(s) required for a set of events.
                              
                           </p>
                           <p class="p">
                              To begin counting a set of events, enable the event group
                              or groups that contain those events by using the
                              <tt class="ph tt">cuptiEventGroupEnable</tt> function. If your events
                              are contained in multiple event groups you may be unable
                              to enable all of the event groups at the same time, due to
                              device limitations. In this case, you can gather the
                              events across multiple executions of the application or
                              you can enable kernel replay. If you enable kernel replay
                              using <tt class="ph tt">cuptiEnableKernelReplayMode</tt> you will be
                              able to enable any number of event groups and all the
                              contained events will be collected.
                              
                           </p>
                           <p class="p">
                              Use the <tt class="ph tt">cuptiEventGroupReadEvent</tt> and/or
                              <tt class="ph tt">cuptiEventGroupReadAllEvents</tt> functions to read
                              the event values. When you are done collecting events, use
                              the <tt class="ph tt">cuptiEventGroupDisable</tt> function to stop
                              counting of the events contained in an event group.  The
                              <strong class="ph b">callback_event</strong> sample described on the <a class="xref" href="r_main.html#r_samples" shape="rect">samples page</a> shows how to use
                              these functions to create, enable, and disable event
                              groups, and how to read event counts.
                              
                           </p>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_collecting_kernel_execution_events"><a name="r_collecting_kernel_execution_events" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.5.1.&nbsp;Collecting Kernel Execution Events</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">A common use of the event API is to count a set of events during the
                                 execution of a kernel (as demonstrated by the <strong class="ph b">callback_event</strong>
                                 sample). The following code shows a typical callback used for this
                                 purpose. Assume that the callback was enabled only for a kernel launch
                                 using the CUDA runtime (i.e. by <tt class="ph tt">cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
                                    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020)</tt>).  To simplify the
                                 presentation, error checking code has been removed.
                              </p><pre xml:space="preserve">static void CUPTIAPI
getEventValueCallback(void *userdata,
                      CUpti_CallbackDomain domain,
                      CUpti_CallbackId cbid,
                      const void *cbdata)
{
  const CUpti_CallbackData *cbData = 
                (CUpti_CallbackData *)cbdata;
     
  if (cbData-&gt;callbackSite == CUPTI_API_ENTER) {
    cudaDeviceSynchronize();
    cuptiSetEventCollectionMode(cbData-&gt;context,
                                CUPTI_EVENT_COLLECTION_MODE_KERNEL);
    cuptiEventGroupEnable(eventGroup);
  }
    
  if (cbData-&gt;callbackSite == CUPTI_API_EXIT) {
    cudaDeviceSynchronize();
    cuptiEventGroupReadEvent(eventGroup, 
                             CUPTI_EVENT_READ_FLAG_NONE, 
                             eventId, 
                             &amp;bytesRead, &amp;eventVal);
      
    cuptiEventGroupDisable(eventGroup);
  }
}</pre><p class="p">
                                 Two synchronization points are used to ensure that events are counted
                                 only for the execution of the kernel. If the application contains
                                 other threads that launch kernels, then additional thread-level
                                 synchronization must also be introduced to ensure that those threads
                                 do not launch kernels while the callback is collecting events. When
                                 the cudaLaunch API is entered (that is, before the kernel is actually
                                 launched on the device), <tt class="ph tt">cudaDeviceSynchronize</tt> is used to
                                 wait until the GPU is idle. The event collection mode is set to
                                 <tt class="ph tt">CUPTI_EVENT_COLLECTION_MODE_KERNEL</tt> so that the event
                                 counters are automatically started and stopped just before and after
                                 the kernel executes. Then event collection is enabled with
                                 <tt class="ph tt">cuptiEventGroupEnable</tt>.
                                 
                              </p>
                              <p class="p">
                                 When the cudaLaunch API is exited (that is, after the kernel is queued
                                 for execution on the GPU) another <tt class="ph tt">cudaDeviceSynchronize</tt> is
                                 used to cause the CPU thread to wait for the kernel to finish
                                 execution. Finally, the event counts are read with
                                 <tt class="ph tt">cuptiEventGroupReadEvent</tt>.
                                 
                              </p>
                           </div>
                        </div>
                     </div>
                     <div class="topic reference cuda_reference nested2" id="r_sampling_events"><a name="r_sampling_events" shape="rect">
                           <!-- --></a><h4 class="topictitle4">1.5.2.&nbsp;Sampling Events</h4>
                        <div class="body refbody">
                           <div class="section">
                              <p class="p">The event API can also be used to sample event values while a kernel
                                 or kernels are executing (as demonstrated by the
                                 <strong class="ph b">event_sampling</strong> sample). The sample shows one possible way to
                                 perform the sampling. The event collection mode is set to
                                 <tt class="ph tt">CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS</tt> so that the event
                                 counters run continuously. Two threads are used in
                                 <strong class="ph b">event_sampling</strong>: one thread schedules the kernels and memcpys
                                 that perform the computation, while another thread wakes periodically
                                 to sample an event counter. In this sample there is no correlation of
                                 the event samples with what is happening on the GPU. To get some
                                 coarse correlation, you can use <tt class="ph tt">cuptiDeviceGetTimestamp</tt> to
                                 collect the GPU timestamp at the time of the sample and also at other
                                 interesting points in your application.
                              </p>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_metric_api"><a name="r_metric_api" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.6.&nbsp;CUPTI Metric API</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">The CUPTI Metric API allows you to collect application metrics
                              calculated from one or more event values.  The following terminology
                              is used by the metric API.
                           </p>
                           <dl class="dl">
                              <dt class="dt dlterm">Metric</dt>
                              <dd class="dd">A characteristic of an application that is calculated
                                 from one or more event values.
                              </dd>
                              <dt class="dt dlterm">Metric ID</dt>
                              <dd class="dd">Each metric is assigned a unique identifier. A named
                                 metric will represent the same characteristic on all device
                                 types. But the named metric may have different IDs on different
                                 device families. Use <tt class="ph tt">cuptiMetricGetIdFromName</tt> to get the ID
                                 for a named metric on a particular device.
                              </dd>
                              <dt class="dt dlterm">Metric Category</dt>
                              <dd class="dd">Each metric is placed in one of the categories
                                 defined by <tt class="ph tt">CUpti_MetricCategory</tt>. The category indicates
                                 the general type of the characteristic measured by the metric.
                              </dd>
                              <dt class="dt dlterm">Metric Property</dt>
                              <dd class="dd">Each metric is calculated from input values. These
                                 input values can be events or properties of the device
                                 or system. The available properties are defined by
                                 <tt class="ph tt">CUpti_MetricPropertyID</tt>.
                              </dd>
                              <dt class="dt dlterm">Metric Value</dt>
                              <dd class="dd">Each metric has a value that represents one of
                                 the kinds defined by <tt class="ph tt">CUpti_MetricValueKind</tt>. For each value
                                 kind, there is a corresponding member of the
                                 <tt class="ph tt">CUpti_MetricValue</tt> union that is used to hold the metric's
                                 value.
                              </dd>
                           </dl>
                        </div>
                        <div class="section">
                           <p class="p">
                              The tables included in this section list the metrics available for
                              each device, as determined by the device's compute capability.  You
                              can also determine the metrics available on a device using the
                              <tt class="ph tt">cuptiDeviceEnumMetrics</tt> function.  The <strong class="ph b">cupti_query</strong>
                              sample described on the <a class="xref" href="r_main.html#r_samples" shape="rect">samples page</a> shows how to use this
                              function.  You can also enumerate all the CUPTI metrics available on
                              any device using the <tt class="ph tt">cuptiEnumMetrics</tt> function.
                              
                           </p>
                           <p class="p">
                              CUPTI provides two functions for calculating a metric
                              value. <tt class="ph tt">cuptiMetricGetValue2</tt> can be used to
                              calculate a metric value when the device is not
                              available. All required event values and metric properties
                              must be provided by the
                              caller. <tt class="ph tt">cuptiMetricGetValue</tt> can be used to
                              calculate a metric value when the device is available (as a
                              CUdevice object). All required event values must be
                              provided by the caller but CUPTI will determine the
                              appropriate property values from the CUdevice object.
                              
                           </p>
                           <p class="p">
                              Configuring and calculating metric values requires the
                              following steps. First, determine the name of the metric
                              that you want to collect, and then use
                              <tt class="ph tt">cuptiMetricGetIdFromName</tt> to get the metric
                              ID. Use <tt class="ph tt">cuptiMetricEnumEvents</tt> to get the events
                              required to calculate the metric and follow instructions
                              in the CUPTI Event API section to create the event groups
                              for those events. When creating event groups in this
                              manner it is important to use the result of
                              <tt class="ph tt">cuptiMetricGetRequiredEventGroupSets</tt> to properly
                              group together events that must be collected in the same
                              pass to ensure proper metric calculation.
                              
                           </p>
                           <p class="p">
                              Alternatively, you can use the
                              <tt class="ph tt">cuptiMetricCreateEventGroupSets</tt> function to
                              automatically create the event group(s) required for a
                              metric's events. When using this function, events will be
                              grouped as required to most accurately calculate the
                              metric; as a result, it is not necessary to use
                              <tt class="ph tt">cuptiMetricGetRequiredEventGroupSets</tt>.
                              
                           </p>
                           <p class="p">
                              If you are using <tt class="ph tt">cuptiMetricGetValue2</tt> then you must
                              also collect the required metric property values using
                              <tt class="ph tt">cuptiMetricEnumProperties</tt>.
                              
                           </p>
                           <p class="p">
                              Collect event counts as described in the CUPTI Event API
                              section, and then use either <tt class="ph tt">cuptiMetricGetValue</tt>
                              or <tt class="ph tt">cuptiMetricGetValue2</tt> to calculate the metric
                              value from the collected event and property values. The
                              <strong class="ph b">callback_metric</strong> sample described on the <a class="xref" href="r_main.html#r_samples" shape="rect">samples page</a> shows how to use
                              the functions to calculate event values and calculate a
                              metric using <tt class="ph tt">cuptiMetricGetValue</tt>. Note that, as
                              shown in the example, you should collect event counts from
                              all domain instances and normalize the counts to get the
                              most accurate metric values. It is necessary to normalize
                              the event counts because the number of event counter
                              instances varies by device and by the event being counted.
                              
                           </p>
                           <p class="p">
                              For example, a device might have 8 multiprocessors but
                              only have event counters for 4 of the multiprocessors, and
                              might have 3 memory units and only have event counters
                              for one memory unit. When calculating a metric that
                              requires a multiprocessor event and a memory unit event,
                              the 4 multiprocessor counters should be summed and
                              multiplied by 2 to normalize the event count across the
                              entire device. Similarly, the one memory unit counter
                              should be multiplied by 3 to normalize the event count
                              across the entire device. The normalized values can then
                              be passed to <tt class="ph tt">cuptiMetricGetValue</tt> or
                              <tt class="ph tt">cuptiMetricGetValue2</tt> to calculate the metric
                              value.
                              
                           </p>
                           <p class="p">
                              As described, the normalization assumes the kernel executes a
                              sufficient number of blocks to completely load the device. If the
                              kernel has only a small number of blocks, normalizing across the
                              entire device may skew the result.
                              
                           </p>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Metric Reference - Compute Capability 1.x</h3>
                           <p class="p">Devices with compute capability less than 2.0 implement the
                              metrics shown in the following table. A scope value of
                              single-context indicates that the metric can only be accurately
                              collected when a single context (CUDA or graphic) is executing
                              on the GPU. A scope value of multi-context indicates that the
                              metric can be accurately collected when multiple contexts are
                              executing on the GPU.
                           </p>
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table capabity_metrics" frame="border" border="1" rules="all">
                                 <caption><span class="tablecap">Table 1. Capability 1.x Metrics</span></caption>
                                 <thead class="thead" align="left">
                                    <tr class="row">
                                       <th class="entry" valign="top" width="33.33333333333333%" id="d28881e1059" rowspan="1" colspan="1">Metric Name</th>
                                       <th class="entry" valign="top" width="44.44444444444444%" id="d28881e1062" rowspan="1" colspan="1">Description</th>
                                       <th class="entry" valign="top" width="22.22222222222222%" id="d28881e1065" rowspan="1" colspan="1">Scope</th>
                                    </tr>
                                 </thead>
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" headers="d28881e1059" rowspan="1" colspan="1">branch_efficiency</td>
                                       <td class="entry" valign="top" width="44.44444444444444%" headers="d28881e1062" rowspan="1" colspan="1">Ratio of non-divergent branches to total branches expressed as percentage</td>
                                       <td class="entry" valign="top" width="22.22222222222222%" headers="d28881e1065" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" headers="d28881e1059" rowspan="1" colspan="1">gld_efficiency</td>
                                       <td class="entry" valign="top" width="44.44444444444444%" headers="d28881e1062" rowspan="1" colspan="1">Ratio of requested global memory load transactions to actual
                                          global memory load transactions expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="22.22222222222222%" headers="d28881e1065" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" headers="d28881e1059" rowspan="1" colspan="1">gst_efficiency</td>
                                       <td class="entry" valign="top" width="44.44444444444444%" headers="d28881e1062" rowspan="1" colspan="1">Ratio of requested global memory store transactions to
                                          actual global memory store transactions expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="22.22222222222222%" headers="d28881e1065" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" headers="d28881e1059" rowspan="1" colspan="1">gld_requested_throughput</td>
                                       <td class="entry" valign="top" width="44.44444444444444%" headers="d28881e1062" rowspan="1" colspan="1">Requested global memory load throughput</td>
                                       <td class="entry" valign="top" width="22.22222222222222%" headers="d28881e1065" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="33.33333333333333%" headers="d28881e1059" rowspan="1" colspan="1">gst_requested_throughput</td>
                                       <td class="entry" valign="top" width="44.44444444444444%" headers="d28881e1062" rowspan="1" colspan="1">Requested global memory store throughput</td>
                                       <td class="entry" valign="top" width="22.22222222222222%" headers="d28881e1065" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Metric Reference - Compute Capability 2.x</h3>
                           <p class="p">Devices with compute capability between 2.0, inclusive, and
                              3.0 implement the metrics shown in the following table. A scope
                              value of single-context indicates that the metric can only be
                              accurately collected when a single context (CUDA or graphic) is
                              executing on the GPU. A scope value of multi-context indicates
                              that the metric can be accurately collected when multiple
                              contexts are executing on the GPU.
                           </p>
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table capabity_metrics" frame="border" border="1" rules="all">
                                 <caption><span class="tablecap">Table 2. Capability 2.x Metrics</span></caption>
                                 <thead class="thead" align="left">
                                    <tr class="row">
                                       <th class="entry" valign="top" width="36.36363636363637%" id="d28881e1161" rowspan="1" colspan="1">Metric Name</th>
                                       <th class="entry" valign="top" width="45.45454545454545%" id="d28881e1164" rowspan="1" colspan="1">Description</th>
                                       <th class="entry" valign="top" width="18.181818181818183%" id="d28881e1167" rowspan="1" colspan="1">Scope</th>
                                    </tr>
                                 </thead>
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">achieved_occupancy</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of the average active warps per active cycle
                                          to the maximum number of warps supported on a
                                          multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">alu_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute integer and floating-point arithmetic instructions
                                          on a scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">atomic_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of replays due to atomic and reduction bank conflicts for each instruction executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">atomic_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Global memory atomic and reduction throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">atomic_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Global memory atomic and reduction transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">atomic_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of global memory atomic and reduction 
                                          transactions performed for each atomic and reduction instruction
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">branch_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of non-divergent branches to total branches expressed as percentage</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">cf_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of executed control-flow instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">cf_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute control-flow instructions on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">cf_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of issued control-flow instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">dram_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Device memory read throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">dram_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Device memory read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">dram_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the device memory relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">dram_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Device memory write throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">dram_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Device memory write transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ecc_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">ECC throughput from L2 to DRAM</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ecc_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of ECC transactions between L2 and DRAM</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">eligible_warps_per_cycle</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of warps that are eligible to issue per active cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_dp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of double-precision floating-point operations executed by
                                          non-predicated threads (add, multiply, multiply-accumulate and special). 
                                          Each multiply-accumulate operation contributes 2 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_dp_add</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of double-precision floating-point add operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_dp_fma</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of double-precision floating-point multiply-accumulate operations 
                                          executed by non-predicated threads. Each multiply-accumulate operation 
                                          contributes 1 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_dp_mul</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of double-precision floating-point multiply operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_sp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of single-precision floating-point operations executed by 
                                          non-predicated threads (add, multiply, multiply-accumulate and special). 
                                          Each multiply-accumulate operation contributes 2 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_sp_add</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of single-precision floating-point add operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_sp_fma</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of single-precision floating-point multiply-accumulate operations 
                                          executed by non-predicated threads. Each multiply-accumulate operation 
                                          contributes 1 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_sp_mul</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of single-precision floating-point multiply operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_count_sp_special</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of single-precision floating-point special operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_dp_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of achieved to peak double-precision floating-point operations</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">flop_sp_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of achieved to peak single-precision floating-point operations</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gld_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of requested global memory load throughput to
                                          required global memory load throughput expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gld_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Requested global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gld_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gld_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of global memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gld_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of global memory load transactions performed for each global memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">global_cache_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of replays due to global memory
                                          cache misses for each instruction executed
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gst_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of requested global memory store throughput
                                          to required global memory store throughput expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gst_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Requested global memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gst_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Global memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gst_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of global memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">gst_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of global memory store transactions performed for each global memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_bit_convert</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of bit-conversion instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_compute_ld_st</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of compute load/store instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_control</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of control-flow instructions executed by non-predicated threads (jump, branch, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The number of instructions executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_fp_32</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of single-precision floating-point instructions executed by non-predicated threads (arithmetic, compare, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_fp_64</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of double-precision floating-point instructions executed by non-predicated threads (arithmetic, compare, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_integer</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of integer instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_inter_thread_communication</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of inter-thread communication instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The number of instructions issued</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_misc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of miscellaneous instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_per_warp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of instructions executed by each warp</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">inst_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of replays for each instruction executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ipc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Instructions executed per cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ipc_instance</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Instructions executed per cycle for a single multiprocessor</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">issue_slot_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of issue slots that issued at least one
                                          instruction, averaged across all cycles
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">issue_slots</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The number of issue slots used</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">issued_ipc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Instructions issued per cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l1_cache_global_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Hit rate in L1 cache for global loads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l1_cache_local_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Hit rate in L1 cache for local loads and stores</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l1_shared_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the L1/shared memory relative to peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_atomic_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for 
                                          atomic and reduction requests
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_atomic_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for atomic and reduction requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_l1_read_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Hit rate at L2 cache for all read requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_l1_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for read
                                          requests from L1 cache
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_l1_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for all read requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_l1_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory write throughput seen at L2 cache for write requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_l1_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory write transactions seen at L2 cache for all write requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for all read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for all read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_tex_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for read requests from the texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_texture_read_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Hit rate at L2 cache for all read requests from texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_texture_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for read
                                          requests from the texture cache
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the L2 cache relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory write throughput seen at L2 cache for all write requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">l2_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Memory write transactions seen at L2 cache for all write requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ldst_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of executed load and store instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ldst_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute global, local and shared memory instructions on a
                                          scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">ldst_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of issued load and store instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_load_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Local memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_load_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of local memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_load_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of local memory load transactions performed for each local memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_memory_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of local memory traffic to total memory
                                          traffic between the L1 and L2 caches expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of replays due to local memory
                                          accesses for each instruction executed
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_store_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Local memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_store_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of local memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">local_store_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of local memory store transactions performed for each local memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of requested shared memory throughput to required shared memory throughput expressed as percentage</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_load_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Shared memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_load_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of shared memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_load_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of shared memory load transactions performed for each shared memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of replays due to shared memory
                                          conflicts for each instruction executed
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_store_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Shared memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_store_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Number of shared memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">shared_store_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Average number of shared memory store transactions performed for each shared memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sm_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The percentage of time at least one warp is active
                                          on a multiprocessor averaged over all multiprocessors on the GPU
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sm_efficiency_instance</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The percentage of time at least one warp is active
                                          on a specific multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">stall_data_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of stalls occurring because a memory operation cannot be performed due to the required resources not being available
                                          or being fully utilized, or because too many requests of a given type are outstanding
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">stall_exec_dependency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of stalls occurring because an input required by the instruction is not yet available</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">stall_inst_fetch</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of stalls occurring because the next assembly instruction has not yet been fetched</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">stall_other</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of stalls occurring due to miscellaneous reasons</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">stall_sync</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of stalls occurring because the warp is blocked at a __syncthreads() call</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">stall_texture</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Percentage of stalls occurring because the texture sub-system is fully utilized or has too many outstanding requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sysmem_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">System memory read throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sysmem_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">System memory read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sysmem_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the system memory relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sysmem_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">System memory write throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">sysmem_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">System memory write transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">tex_cache_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Texture cache hit rate</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">tex_cache_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Texture cache throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">tex_cache_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Texture cache read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">tex_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute texture instructions on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">tex_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">The utilization level of the texture cache relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e1161" rowspan="1" colspan="1">warp_execution_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e1164" rowspan="1" colspan="1">Ratio of the average active threads per warp to the
                                          maximum number of threads per warp supported on a
                                          multiprocessor expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e1167" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Metric Reference - Compute Capability 3.x</h3>
                           <p class="p">Devices with compute capability greater than or equal to 3.0
                              and less than 4.0 implement the metrics shown in the following
                              table. A scope value of single-context indicates that the metric
                              can only be accurately collected when a single context (CUDA or
                              graphics) is executing on the GPU. A scope value of multi-context
                              indicates that the metric can be accurately collected when
                              multiple contexts are executing on the GPU.
                           </p>
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table capabity_metrics" frame="border" border="1" rules="all">
                                 <caption><span class="tablecap">Table 3. Capability 3.x Metrics</span></caption>
                                 <thead class="thead" align="left">
                                    <tr class="row">
                                       <th class="entry" valign="top" width="36.36363636363637%" id="d28881e2577" rowspan="1" colspan="1">Metric Name</th>
                                       <th class="entry" valign="top" width="45.45454545454545%" id="d28881e2580" rowspan="1" colspan="1">Description</th>
                                       <th class="entry" valign="top" width="18.181818181818183%" id="d28881e2583" rowspan="1" colspan="1">Scope</th>
                                    </tr>
                                 </thead>
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">achieved_occupancy</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of the average active warps per active cycle
                                          to the maximum number of warps supported on a
                                          multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">alu_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute integer and floating-point arithmetic instructions
                                          on a scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">atomic_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of replays due to atomic and reduction bank conflicts for each instruction executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">atomic_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Global memory atomic and reduction throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">atomic_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Global memory atomic and reduction transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">atomic_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of global memory atomic and reduction 
                                          transactions performed for each atomic and reduction instruction
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">branch_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of non-divergent branches to total branches expressed as percentage.
                                          This is available for compute capability 3.0.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">cf_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of executed control-flow instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">cf_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute control-flow instructions on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">cf_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of issued control-flow instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">dram_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Device memory read throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">dram_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Device memory read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">dram_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the device memory relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">dram_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Device memory write throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">dram_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Device memory write transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ecc_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">ECC throughput from L2 to DRAM</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ecc_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of ECC transactions between L2 and DRAM</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">eligible_warps_per_cycle</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of warps that are eligible to issue per active cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_dp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of double-precision floating-point operations executed by
                                          non-predicated threads (add, multiply, multiply-accumulate and special). 
                                          Each multiply-accumulate operation contributes 2 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_dp_add</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of double-precision floating-point add operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_dp_fma</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of double-precision floating-point multiply-accumulate operations 
                                          executed by non-predicated threads. Each multiply-accumulate operation 
                                          contributes 1 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_dp_mul</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of double-precision floating-point multiply operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_sp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of single-precision floating-point operations executed by 
                                          non-predicated threads (add, multiply, multiply-accumulate and special). 
                                          Each multiply-accumulate operation contributes 2 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_sp_add</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of single-precision floating-point add operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_sp_fma</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of single-precision floating-point multiply-accumulate operations 
                                          executed by non-predicated threads. Each multiply-accumulate operation 
                                          contributes 1 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_sp_mul</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of single-precision floating-point multiply operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_count_sp_special</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of single-precision floating-point special operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_dp_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of achieved to peak double-precision floating-point operations</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">flop_sp_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of achieved to peak single-precision floating-point operations</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gld_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of requested global memory load throughput to
                                          required global memory load throughput expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gld_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Requested global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gld_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gld_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of global memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gld_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of global memory load transactions performed for each global memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">global_cache_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of replays due to global memory
                                          cache misses for each instruction executed
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">global_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of replays due to global memory
                                          cache misses
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gst_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of requested global memory store throughput
                                          to required global memory store throughput expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gst_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Requested global memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gst_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Global memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gst_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of global memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">gst_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of global memory store transactions performed for each global memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_bit_convert</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of bit-conversion instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_compute_ld_st</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of compute load/store instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_control</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of control-flow instructions executed by non-predicated threads (jump, branch, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The number of instructions executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_fp_32</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of single-precision floating-point instructions executed by non-predicated threads (arithmetic, compare, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_fp_64</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of double-precision floating-point instructions executed by non-predicated threads (arithmetic, compare, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_integer</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of integer instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_inter_thread_communication</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of inter-thread communication instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The number of instructions issued</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_misc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of miscellaneous instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_per_warp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of instructions executed by each warp</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">inst_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of replays for each instruction executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ipc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Instructions executed per cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ipc_instance</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Instructions executed per cycle for a single multiprocessor</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">issue_slot_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of issue slots that issued at least one
                                          instruction, averaged across all cycles
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">issue_slots</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The number of issue slots used</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">issued_ipc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Instructions issued per cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l1_cache_global_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Hit rate in L1 cache for global loads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l1_cache_local_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Hit rate in L1 cache for local loads and stores</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l1_shared_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the L1/shared memory relative to peak utilization on a scale of 0 to 10.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_atomic_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for 
                                          atomic and reduction requests
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_atomic_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for atomic and reduction requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_l1_read_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Hit rate at L2 cache for all read requests from L1 cache.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_l1_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for read
                                          requests from L1 cache.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_l1_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for all read requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_l1_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory write throughput seen at L2 cache for write requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_l1_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory write transactions seen at L2 cache for all write requests from L1 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for all read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for all read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_tex_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for read requests from the texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_texture_read_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Hit rate at L2 cache for all read requests from texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_texture_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for read
                                          requests from the texture cache
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the L2 cache relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory write throughput seen at L2 cache for all write requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">l2_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory write transactions seen at L2 cache for all write requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ldst_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of executed load and store instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ldst_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute global, local and shared memory instructions on a
                                          scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">ldst_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of issued load and store instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_load_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Local memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_load_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of local memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_load_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of local memory load transactions performed for each local memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_memory_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of local memory traffic to total memory
                                          traffic between the L1 and L2 caches expressed as percentage.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of replays due to local memory
                                          accesses for each instruction executed
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_store_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Local memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_store_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of local memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">local_store_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of local memory store transactions performed for each local memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">nc_cache_global_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Hit rate in non coherent cache for global loads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">nc_gld_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of requested non coherent global memory load throughput to required non coherent global memory load throughput expressed
                                          as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">nc_gld_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Requested throughput for global memory loaded via non-coherent cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">nc_gld_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Non coherent global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">nc_l2_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read throughput for non coherent global read requests seen at L2 cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">nc_l2_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for non coherent global read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of requested shared memory throughput to required shared memory throughput expressed as percentage</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_load_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Shared memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_load_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of shared memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_load_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of shared memory load transactions performed for each shared memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of replays due to shared memory
                                          conflicts for each instruction executed
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_store_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Shared memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_store_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Number of shared memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">shared_store_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Average number of shared memory store transactions performed for each shared memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sm_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The percentage of time at least one warp is active
                                          on a multiprocessor averaged over all multiprocessors on the GPU
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sm_efficiency_instance</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The percentage of time at least one warp is active
                                          on a specific multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_compute</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because a compute operation cannot be performed due to the required resources not being available</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_data_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because a memory operation cannot be performed due to the required resources not being available
                                          or fully utilized, or because too many requests of a given type are outstanding
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_exec_dependency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because an input required by the instruction is not yet available</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_imc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because of immediate constant cache miss</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_inst_fetch</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because the next assembly instruction has not yet been fetched</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_other</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring due to miscellaneous reasons</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_sync</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because the warp is blocked at a __syncthreads() call</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">stall_texture</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Percentage of stalls occurring because the texture sub-system is fully utilized or has too many outstanding requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sysmem_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">System memory read throughput.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sysmem_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">System memory read transactions.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sysmem_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the system memory relative to the peak utilization on a scale of 0 to 10.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sysmem_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">System memory write throughput.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">sysmem_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">System memory write transactions.
                                          This is available for compute capability 3.0 and 3.5.
                                          
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">tex_cache_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Texture cache hit rate</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">tex_cache_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Texture cache throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">tex_cache_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Texture cache read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">tex_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute texture instructions on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">tex_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">The utilization level of the texture cache relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">warp_execution_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of the average active threads per warp to the
                                          maximum number of threads per warp supported on a
                                          multiprocessor expressed as a percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e2577" rowspan="1" colspan="1">warp_nonpred_execution_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e2580" rowspan="1" colspan="1">Ratio of the average active threads per warp
                                          executing non-predicated instructions to the maximum
                                          number of threads per warp supported on a
                                          multiprocessor expressed as a percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e2583" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Metric Reference - Compute Capability 5.x</h3>
                           <p class="p">Devices with compute capability greater than or equal to 5.0
                              implement the metrics shown in the following table. A scope
                              value of single-context indicates that the metric can only be
                              accurately collected when a single context (CUDA or graphics) is
                              executing on the GPU. A scope value of multi-context indicates
                              that the metric can be accurately collected when multiple
                              contexts are executing on the GPU.
                           </p>
                           <div class="tablenoborder">
                              <table cellpadding="4" cellspacing="0" summary="" class="table capabity_metrics" frame="border" border="1" rules="all">
                                 <caption><span class="tablecap">Table 4. Capability 5.x Metrics</span></caption>
                                 <thead class="thead" align="left">
                                    <tr class="row">
                                       <th class="entry" valign="top" width="36.36363636363637%" id="d28881e4116" rowspan="1" colspan="1">Metric Name</th>
                                       <th class="entry" valign="top" width="45.45454545454545%" id="d28881e4119" rowspan="1" colspan="1">Description</th>
                                       <th class="entry" valign="top" width="18.181818181818183%" id="d28881e4122" rowspan="1" colspan="1">Scope</th>
                                    </tr>
                                 </thead>
                                 <tbody class="tbody">
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">achieved_occupancy</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of the average active warps per active cycle
                                          to the maximum number of warps supported on a
                                          multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">atomic_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Global memory atomic and reduction transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">atomic_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of global memory atomic and reduction 
                                          transactions performed for each atomic and reduction instruction
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">branch_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of non-divergent branches to total branches expressed as a percentage</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">cf_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of executed control-flow instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">cf_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute control-flow instructions on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">cf_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of issued control-flow instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">double_precision_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute double-precision floating-point instructions and integer
                                          instructions on a scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">dram_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Device memory read throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">dram_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Device memory read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">dram_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the device memory relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">dram_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Device memory write throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">dram_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Device memory write transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">ecc_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">ECC throughput from L2 to DRAM</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">ecc_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of ECC transactions between L2 and DRAM</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">eligible_warps_per_cycle</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of warps that are eligible to issue per active cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_dp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of double-precision floating-point operations executed by
                                          non-predicated threads (add, multiply, multiply-accumulate and special). 
                                          Each multiply-accumulate operation contributes 2 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_dp_add</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of double-precision floating-point add operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_dp_fma</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of double-precision floating-point multiply-accumulate operations 
                                          executed by non-predicated threads. Each multiply-accumulate operation 
                                          contributes 1 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_dp_mul</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of double-precision floating-point multiply operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_sp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of single-precision floating-point operations executed by 
                                          non-predicated threads (add, multiply, multiply-accumulate and special). 
                                          Each multiply-accumulate operation contributes 2 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_sp_add</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of single-precision floating-point add operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_sp_fma</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of single-precision floating-point multiply-accumulate operations 
                                          executed by non-predicated threads. Each multiply-accumulate operation 
                                          contributes 1 to the count.
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_sp_mul</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of single-precision floating-point multiply operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_count_sp_special</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of single-precision floating-point special operations executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_dp_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of achieved to peak double-precision floating-point operations</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">flop_sp_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of achieved to peak single-precision floating-point operations</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gld_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of requested global memory load throughput to
                                          required global memory load throughput expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gld_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Requested global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gld_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Global memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gld_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of global memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gld_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of global memory load transactions performed for each global memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">global_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Hit rate for global loads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gst_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of requested global memory store throughput
                                          to required global memory store throughput expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gst_requested_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Requested global memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gst_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Global memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gst_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of global memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">gst_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of global memory store transactions performed for each global memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_bit_convert</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of bit-conversion instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_compute_ld_st</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of compute load/store instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_control</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of control-flow instructions executed by non-predicated threads (jump, branch, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The number of instructions executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_fp_32</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of single-precision floating-point instructions executed by non-predicated threads (arithmetic, compare, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_fp_64</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of double-precision floating-point instructions executed by non-predicated threads (arithmetic, compare, etc.)</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_integer</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of integer instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_inter_thread_communication</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of inter-thread communication instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The number of instructions issued</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_misc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of miscellaneous instructions executed by non-predicated threads</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_per_warp</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of instructions executed by each warp</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">inst_replay_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of replays for each instruction executed</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">ipc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Instructions executed per cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">issue_slot_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of issue slots that issued at least one
                                          instruction, averaged across all cycles
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">issue_slots</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The number of issue slots used</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">issued_ipc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Instructions issued per cycle</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_atomic_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for atomic and reduction requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_atomic_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for atomic and reduction requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for all read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for all read requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_tex_read_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Hit rate at L2 cache for all read requests from texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_tex_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory read throughput seen at L2 cache for read requests from the texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_tex_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory read transactions seen at L2 cache for read requests from the texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_tex_write_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Hit rate at L2 cache for all write requests from texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_tex_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory write throughput seen at L2 cache for write requests from the texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_tex_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory write transactions seen at L2 cache for write requests from the texture cache</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the L2 cache relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory write throughput seen at L2 cache for all write requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">l2_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Memory write transactions seen at L2 cache for all write requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">ldst_executed</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of executed load and store instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">ldst_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute global, local and shared memory instructions on a
                                          scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">ldst_issued</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of issued load and store instructions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Hit rate for local loads and stores</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_load_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Local memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_load_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of local memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_load_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of local memory load transactions performed for each local memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_memory_overhead</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of local memory traffic to total memory
                                          traffic between the L1 and L2 caches expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_store_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Local memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_store_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of local memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">local_store_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of local memory store transactions performed for each local memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of requested shared memory throughput to required shared memory throughput expressed as percentage</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_load_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Shared memory load throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_load_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of shared memory load transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_load_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of shared memory load transactions performed for each shared memory load</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_store_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Shared memory store throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_store_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Number of shared memory store transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_store_transactions_per_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Average number of shared memory store transactions performed for each shared memory store</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">shared_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the shared memory relative to peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">single_precision_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute single-precision floating-point instructions and integer
                                          instructions on a scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">sm_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The percentage of time at least one warp is active
                                          on a multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">special_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute sin, cos, ex2, popc, flo, and similar instructions
                                          on a scale of 0 to 10
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_compute</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because a compute operation cannot be performed due to the required resources not being available</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_data_request</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because a memory operation cannot be performed due to the required resources not being available
                                          or fully utilized, or because too many requests of a given type are outstanding
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_exec_dependency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because an input required by the instruction is not yet available</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_imc</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because of an immediate constant cache miss</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_inst_fetch</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because the next assembly instruction has not yet been fetched</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_other</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring due to miscellaneous reasons</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_sync</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because the warp is blocked at a __syncthreads() call</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">stall_texture</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Percentage of stalls occurring because the texture sub-system is fully utilized or has too many outstanding requests</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">sysmem_read_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">System memory read throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">sysmem_read_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">System memory read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">sysmem_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the system memory relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">sysmem_write_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">System memory write throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">sysmem_write_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">System memory write transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">tex_cache_hit_rate</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Texture cache hit rate</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">tex_cache_throughput</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Texture cache throughput</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">tex_cache_transactions</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Texture cache read transactions</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">tex_fu_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the multiprocessor function units that execute texture instructions on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">tex_utilization</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">The utilization level of the texture cache relative to the peak utilization on a scale of 0 to 10</td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Single-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">warp_execution_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of the average active threads per warp to the
                                          maximum number of threads per warp supported on a
                                          multiprocessor expressed as percentage
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                    <tr class="row">
                                       <td class="entry" valign="top" width="36.36363636363637%" headers="d28881e4116" rowspan="1" colspan="1">warp_nonpred_execution_efficiency</td>
                                       <td class="entry" valign="top" width="45.45454545454545%" headers="d28881e4119" rowspan="1" colspan="1">Ratio of the average active threads per warp executing non-predicated instructions to the maximum number of threads per warp
                                          supported on a multiprocessor
                                       </td>
                                       <td class="entry" valign="top" width="18.181818181818183%" headers="d28881e4122" rowspan="1" colspan="1">Multi-context</td>
                                    </tr>
                                 </tbody>
                              </table>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference cuda_reference nested1" id="r_samples"><a name="r_samples" shape="rect">
                        <!-- --></a><h3 class="topictitle3">1.7.&nbsp;Samples</h3>
                     <div class="body refbody">
                        <div class="section">
                           <p class="p">The CUPTI installation includes several samples that demonstrate the
                              use of the CUPTI APIs. The samples are:
                           </p>
                           <dl class="dl">
                              <dt class="dt dlterm"><a name="r_samples__activity_trace_async" shape="rect">
                                    <!-- --></a>activity_trace_async
                              </dt>
                              <dd class="dd">This sample shows how to collect a trace of CPU and
                                 GPU activity using the new asynchronous activity buffer
                                 APIs.
                              </dd>
                              <dt class="dt dlterm">callback_event</dt>
                              <dd class="dd">This sample shows how to use both the callback
                                 and event APIs to record the events that occur during the execution
                                 of a simple kernel. The sample shows the required ordering for
                                 synchronization and for enabling, disabling, and reading
                                 event groups.
                              </dd>
                              <dt class="dt dlterm">callback_metric</dt>
                              <dd class="dd">This sample shows how to use both the
                                 callback and metric APIs to record the metric's events during the
                                 execution of a simple kernel, and then use those events to calculate
                                 the metric value.
                              </dd>
                              <dt class="dt dlterm">callback_timestamp</dt>
                              <dd class="dd">This sample shows how to use the callback
                                 API to record a trace of API start and stop times.
                              </dd>
                              <dt class="dt dlterm">cupti_query</dt>
                              <dd class="dd">This sample shows how to query CUDA-enabled
                                 devices for their event domains, events, and metrics.
                              </dd>
                              <dt class="dt dlterm">event_sampling</dt>
                              <dd class="dd">This sample shows how to use the event
                                 API to sample events using a separate host thread.
                              </dd>
                              <dt class="dt dlterm">sass_source_map</dt>
                              <dd class="dd">This sample shows how to generate CUpti_ActivityInstructionExecution records
                                 and how to map SASS assembly instructions to CUDA C source.
                              </dd>
                              <dt class="dt dlterm">unified_memory</dt>
                              <dd class="dd">This sample shows how to collect various counters
                                 such as page faults and page transfers for unified memory.
                              </dd>
                           </dl>
                        </div>
                     </div>
                  </div>
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <!-- Shared site script, followed by the google-analytics write/tracker scripts (original order kept). -->
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <!-- ShareThis: switchTo5x is defined before buttons.js is requested; options are applied afterwards. -->
      <script type="text/javascript">var switchTo5x=true;</script>
      <script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script>
      <script type="text/javascript">
         /* buttons.js is fetched from a remote host and may not load (offline copy,
            blocked request). Only configure the widget when stLight was actually
            defined, so the page does not raise a ReferenceError. */
         if (typeof stLight !== "undefined") {
            stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});
         }
      </script>
   </body>
</html>