<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="Developing a Linux Kernel Module using GPUDirect RDMA"></meta>
      <meta name="abstract" content="The API reference guide for enabling GPUDirect RDMA connections to NVIDIA GPUs."></meta>
      <meta name="description" content="The API reference guide for enabling GPUDirect RDMA connections to NVIDIA GPUs."></meta>
      <meta name="DC.Coverage" content="Miscellaneous"></meta>
      <meta name="DC.subject" content="CUDA RDMA, CUDA RDMA GPU, CUDA RDMA GPU direct, CUDA RDMA DMA, CUDA RDMA GPUDirect transfer, CUDA RDMA systems, CUDA RDMA GPU memory, CUDA RDMA free callback, CUDA RDMA link kernel module, CUDA RDMA references"></meta>
      <meta name="keywords" content="CUDA RDMA, CUDA RDMA GPU, CUDA RDMA GPU direct, CUDA RDMA DMA, CUDA RDMA GPUDirect transfer, CUDA RDMA systems, CUDA RDMA GPU memory, CUDA RDMA free callback, CUDA RDMA link kernel module, CUDA RDMA references"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="abstract"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>GPUDirect RDMA :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="../common/scripts/tynt/tynt.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/gpudirect-rdma/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit
                  v6.5</a></div>
            <div class="category"><a href="index.html" title="GPUDirect RDMA">GPUDirect RDMA</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="#overview">1.&nbsp;Overview</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#how-gpudirect-rdma-works">1.1.&nbsp;How GPUDirect RDMA Works</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#standard-dma-transfer-example-sequence">1.2.&nbsp;Standard DMA Transfer</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#gpudirect-rdma-example-sequence">1.3.&nbsp;GPUDirect RDMA Transfers</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#new-in-cuda-60">1.4.&nbsp;Changes in CUDA 6.0</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#design-considerations">2.&nbsp;Design Considerations</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#lazy-unpinning-optimization">2.1.&nbsp;Lazy Unpinning Optimization</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#registration-cache">2.2.&nbsp;Registration Cache</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#unpin-callback">2.3.&nbsp;Unpin Callback</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#supported-systems">2.4.&nbsp;Supported Systems</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#bar-sizes">2.5.&nbsp;PCI BAR sizes</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#tokens-usage">2.6.&nbsp;Tokens Usage</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#sync-behavior">2.7.&nbsp;Synchronization and Memory Ordering</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#how-to-perform-specific-tasks">3.&nbsp;How to Perform Specific Tasks</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#display-bar-space">3.1.&nbsp;Displaying GPU BAR space</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#pinning-gpu-memory">3.2.&nbsp;Pinning GPU memory</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#unpinning-gpu-memory">3.3.&nbsp;Unpinning GPU memory</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#handling-free-callback">3.4.&nbsp;Handling the free callback</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#invalidating-based-on-buffer-id">3.5.&nbsp;Buffer ID Tag Check for A Registration Cache</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#linking-kernel-module-against-nvidia-ko">3.6.&nbsp;Linking a Kernel Module against nvidia.ko</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="#references">4.&nbsp;References</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="#basics-of-uva-cuda-memory-management">4.1.&nbsp;Basics of UVA CUDA Memory Management</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#userspace-api">4.2.&nbsp;Userspace API</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="#kernel-api">4.3.&nbsp;Kernel API</a></div>
                     </li>
                  </ul>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="release-info">GPUDirect RDMA
                  (<a href="../../pdf/GPUDirect_RDMA.pdf">PDF</a>)
                  -
                  
                  v6.5
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated August 1, 2014
                  -
                  <a href="mailto:cudatools@nvidia.com?subject=CUDA Toolkit Documentation Feedback: GPUDirect RDMA">Send Feedback</a>
                </div>
            </div>
            <article id="contents">
               <div class="topic nested0" id="abstract"><a name="abstract" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#abstract" name="abstract" shape="rect">Developing a Linux Kernel Module using GPUDirect RDMA</a></h2>
                  <div class="body conbody"></div>
               </div>
               <div class="topic concept nested0" id="overview"><a name="overview" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#overview" name="overview" shape="rect">1.&nbsp;Overview</a></h2>
                  <div class="body conbody">
                     <p class="p"> GPUDirect RDMA is a technology introduced in Kepler-class GPUs and CUDA 5.0 that enables a direct path for
                        data exchange between the GPU and a third-party peer device using standard features of PCI Express.
                         Examples of third-party devices include network interfaces, video acquisition devices, and storage adapters.
                        
                     </p>
                     <p class="p">
                        	GPUDirect RDMA is available on both Tesla and Quadro GPUs.
                        
                     </p>
                     <p class="p">
                         A number of limitations apply, the most important being that the two devices must share the
                         same upstream PCI Express root complex. Some of the limitations depend on the platform used and may be lifted
                         in current or future products.
                        	
                     </p>
                     <p class="p"> A few straightforward changes must be made to
                        device drivers to enable this functionality with a wide range of hardware devices. This document introduces
                         the technology and describes the steps necessary to enable a GPUDirect RDMA connection to NVIDIA GPUs on Linux.
                        
                     </p>
                     <div class="fig fignone" id="overview__gpudirect-rdma-within-linux-device-driver-model"><a name="overview__gpudirect-rdma-within-linux-device-driver-model" shape="rect">
                           <!-- --></a><span class="figcap">Figure 1. GPUDirect RDMA within the Linux Device Driver Model</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/gpudirect-rdma-within-linux-device-driver-model.png" alt="GPUDirect RDMA within the Linux Device Driver Model."></img></div><br clear="none"></br></div>
                  </div>
                  <div class="topic concept nested1" id="how-gpudirect-rdma-works"><a name="how-gpudirect-rdma-works" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#how-gpudirect-rdma-works" name="how-gpudirect-rdma-works" shape="rect">1.1.&nbsp;How GPUDirect RDMA Works</a></h3>
                     <div class="body conbody">
                        <p class="p"> When setting up GPUDirect RDMA communication between two peers, all physical
                           addresses are the same from the PCI Express devices' point of view. Within this physical address space are
                           linear windows called PCI BARs.
                           Each device has six BAR registers at most, so it can have up to six active 32bit BAR regions.
                           64bit BARs consume two BAR registers. 
                           The PCI Express device issues reads and writes
                           to a peer device's BAR addresses in the same way that they are issued to system memory.
                           
                        </p>
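                         <p class="p">As an illustration, the BAR regions a device exposes can be inspected with lspci. The output below is abbreviated and representative only; actual addresses, sizes, and flags vary by device and system:</p><pre class="pre screen" xml:space="preserve">
$ lspci -v -s 02:00.0
02:00.0 3D controller: NVIDIA Corporation Device [...]
        Memory at f2000000 (32-bit, non-prefetchable) [size=16M]
        Memory at e0000000 (64-bit, prefetchable) [size=256M]
        Memory at f0000000 (64-bit, prefetchable) [size=32M]
</pre>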
                        <p class="p">Traditionally, resources like BAR windows are mapped to user or kernel address space using the CPU's MMU as
                           memory mapped I/O (MMIO) addresses. However, because current operating systems don't have sufficient
                           mechanisms for exchanging MMIO regions between drivers, the NVIDIA kernel driver exports functions to
                           perform the necessary address translations and mappings.
                           
                        </p>
                        <p class="p">
                           To add GPUDirect RDMA support to a device driver, a small amount of address mapping code within the kernel
                           driver must be modified. This code typically resides near existing calls to <samp class="ph codeph">get_user_pages()</samp>.
                           
                        </p>
                        <p class="p">The APIs and control flow involved with GPUDirect RDMA are very similar to those used with standard DMA
                           transfers.
                           
                        </p>
                        <p class="p">See <a class="xref" href="index.html#supported-systems" shape="rect">Supported Systems</a> and <a class="xref" href="index.html#bar-sizes" shape="rect">PCI BAR sizes</a> for more hardware details.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="standard-dma-transfer-example-sequence"><a name="standard-dma-transfer-example-sequence" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#standard-dma-transfer-example-sequence" name="standard-dma-transfer-example-sequence" shape="rect">1.2.&nbsp;Standard DMA Transfer</a></h3>
                     <div class="body conbody">
                        <div class="p">First, we outline a standard DMA Transfer initiated from userspace.
                           In this scenario, the following components are present:
                           
                           <ul class="ul">
                              <li class="li">Userspace program</li>
                              <li class="li">Userspace communication library</li>
                              <li class="li">Kernel driver for the device interested in doing DMA transfers</li>
                           </ul>
                           
                           The general sequence is as follows:
                           
                        </div>
                        <ol class="ol">
                           <li class="li">The userspace program requests a transfer via the userspace communication library.
                              This operation takes a pointer to data (a virtual address) and a size in bytes.
                              
                           </li>
                           <li class="li">
                              The communication library must make sure the memory region corresponding
                              to the virtual address and size is ready for the transfer.
                              If this is not the case already, it has to be handled by the kernel driver (next step).
                              
                           </li>
                           <li class="li">
                              The kernel driver receives the virtual address and size from the userspace communication library. It
                              then asks the kernel to translate the virtual address range to a list of physical pages
                              and make sure they are ready to be transferred to or from.
                               We will refer to this operation as <dfn class="term">pinning</dfn> the memory (a sketch of this step for host memory follows the list).
                              
                           </li>
                           <li class="li">The kernel driver uses the list of pages to program the physical device's DMA engine(s).</li>
                           <li class="li">The communication library initiates the transfer.</li>
                           <li class="li">
                              After the transfer is done, the communication library should eventually
                              clean up any resources used to pin the memory.
                              We will refer to this operation as <dfn class="term">unpinning</dfn> the memory.
                              
                           </li>
                        </ol>
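                         <p class="p">The following is a minimal sketch of step 3 for host memory, assuming a recent Linux kernel. The <samp class="ph codeph">get_user_pages*()</samp> family has changed signatures across kernel versions, and everything here other than the kernel APIs themselves is illustrative:</p><pre xml:space="preserve">
#include &lt;linux/mm.h&gt;
#include &lt;linux/slab.h&gt;

// Sketch: pin a userspace buffer for DMA (step 3 above).
// "start" is the user virtual address, "size" is the length in bytes.
static long sketch_pin_host_memory(unsigned long start, size_t size,
                                   struct page ***pages_out, long *nr_out)
{
    unsigned long first = start &gt;&gt; PAGE_SHIFT;
    unsigned long last  = (start + size - 1) &gt;&gt; PAGE_SHIFT;
    long nr_pages = last - first + 1;
    struct page **pages;
    long pinned;

    pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
    if (!pages)
        return -ENOMEM;

    // Translate the virtual range into physical pages and take a
    // reference on each so they stay resident during the DMA.
    pinned = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
    if (pinned &lt; 0) {
        kfree(pages);
        return pinned;
    }

    *pages_out = pages;
    *nr_out = pinned;
    return 0;
}
</pre>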
                     </div>
                  </div>
                  <div class="topic concept nested1" id="gpudirect-rdma-example-sequence"><a name="gpudirect-rdma-example-sequence" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#gpudirect-rdma-example-sequence" name="gpudirect-rdma-example-sequence" shape="rect">1.3.&nbsp;GPUDirect RDMA Transfers</a></h3>
                     <div class="body conbody">
                        <div class="p">
                           For the communication to support GPUDirect RDMA transfers some changes to the sequence above
                           have to be introduced.
                           First of all, two new components are present:
                           
                           <ul class="ul">
                              <li class="li">Userspace CUDA library</li>
                              <li class="li">NVIDIA kernel driver</li>
                           </ul>
                           
                           As described in <a class="xref" href="index.html#basics-of-uva-cuda-memory-management" shape="rect">Basics of UVA CUDA Memory Management</a>, programs using the CUDA library have their
                           address space split between GPU and CPU virtual addresses, and the communication library
                           has to implement two separate paths for them.
                           
                        </div>
                        <p class="p">
                           The userspace CUDA library provides a function that lets the communication library
                           distinguish between CPU and GPU addresses. Moreover, for GPU addresses it returns additional metadata
                           that is required to uniquely identify the GPU memory represented by the address.
                           See <a class="xref" href="index.html#userspace-api" shape="rect">Userspace API</a> for details.
                           
                        </p>
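                         <p class="p">As an illustrative sketch, one way for the communication library to select a path is to query the memory type of the address through the CUDA driver API. The helper name and the policy of treating addresses unknown to CUDA as CPU memory are assumptions made for this example:</p><pre xml:space="preserve">
#include &lt;stdint.h&gt;
#include &lt;cuda.h&gt;

// Sketch: decide which pinning path to take for a given address.
// Returns 1 for GPU (device) memory, 0 for CPU memory.
static int is_gpu_address(const void *ptr)
{
    CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
    CUresult status = cuPointerGetAttribute(&amp;mem_type,
        CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)(uintptr_t)ptr);
    // The call fails for addresses unknown to CUDA; treat those as CPU memory.
    if (status != CUDA_SUCCESS)
        return 0;
    return mem_type == CU_MEMORYTYPE_DEVICE;
}
</pre>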
                        <p class="p">
                           The difference between the paths for CPU and GPU addresses is in how the memory is pinned and unpinned.
                           For CPU memory this is handled by built-in Linux Kernel functions (<samp class="ph codeph">get_user_pages()</samp> and <samp class="ph codeph">put_page()</samp>).
                           However, in the GPU memory case the pinning and unpinning has to be handled
                           by functions provided by the NVIDIA Kernel driver.
                           See <a class="xref" href="index.html#pinning-gpu-memory" shape="rect">Pinning GPU memory</a> and <a class="xref" href="index.html#unpinning-gpu-memory" shape="rect">Unpinning GPU memory</a> for details.
                           
                        </p>
                        <p class="p">
                           Some hardware caveats are explained in <a class="xref" href="index.html#supported-systems" shape="rect">Supported Systems</a> and <a class="xref" href="index.html#bar-sizes" shape="rect">PCI BAR sizes</a>.
                           
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="new-in-cuda-60"><a name="new-in-cuda-60" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#new-in-cuda-60" name="new-in-cuda-60" shape="rect">1.4.&nbsp;Changes in CUDA 6.0</a></h3>
                     <div class="body conbody">
                        <div class="p">
                           In this section we briefly list the changes that are available in CUDA 6.0:
                           
                           <ul class="ul">
                              <li class="li">
                                 CUDA peer-to-peer tokens are no longer mandatory. For memory buffers owned by the calling 
                                 process (which is typical) tokens can be replaced by zero (0) in the kernel-mode function 
                                 <samp class="ph codeph">nvidia_p2p_get_pages()</samp>.
                                 This new feature is meant to make it easier
                                 for existing third party software stacks to adopt RDMA for GPUDirect.
                                 
                              </li>
                              <li class="li">
                                 As a consequence of the change above, a new API <samp class="ph codeph">cuPointerSetAttribute()</samp> 
                                 has been introduced.
                                 This API must be used to register any buffer for which no peer-to-peer tokens are used. 
                                  It is necessary to ensure correct synchronization behavior of the CUDA API when operating on memory which may be
                                  read by RDMA for GPUDirect.
                                 Failing to use it in these cases may cause data corruption.
                                 See changes in <a class="xref" href="index.html#tokens-usage" shape="rect">Tokens Usage</a>.
                                 	
                              </li>
                              <li class="li"><samp class="ph codeph">cuPointerGetAttribute()</samp> has been extended to return a globally unique numeric
                                 identifier, which in turn can be used by lower-level libraries to detect buffer 
                                 reallocations happening in user-level code (see <a class="xref" href="index.html#userspace-api" shape="rect">Userspace API</a>).
                                 It provides an alternative method to detect reallocations when intercepting CUDA 
                                 allocation and deallocation APIs is not possible.
                                 
                              </li>
                              <li class="li">
                                 The kernel-mode memory pinning feature has been extended to work in combination with 
                                 Multi-Process Service (MPS).
                                 
                              </li>
                           </ul>
                           
                           Caveats as of CUDA 6.0:
                           
                           <ul class="ul">
                              <li class="li">
                                 CUDA Unified Memory is not explicitly supported in combination with GPUDirect RDMA. While the page 
                                 table returned by <samp class="ph codeph">nvidia_p2p_get_pages()</samp> is valid for managed memory
                                 buffers and provides a mapping of GPU memory at any given moment in time, the GPU device copy of that 
                                 memory may be incoherent with the writable copy of the page which is not on the GPU. 
                                 Using the page table in this circumstance may result in accessing stale data,
                                 or data loss, because of a DMA write access to device memory that is subsequently 
                                 overwritten by the Unified Memory run-time. 
                                 <samp class="ph codeph">cuPointerGetAttribute()</samp> may be used to determine if an address 
                                 is being managed by the Unified Memory runtime.
                                 
                              </li>
                              <li class="li">
                                 Every time a device memory region is pinned, new GPU BAR space is 
                                 allocated unconditionally, even when pinning overlapping or duplicate device memory ranges.
                                 			There is no attempt at reusing mappings.
                                 This behavior could change in the future, so we encourage developers not to rely on it.
                                 
                              </li>
                           </ul>
                        </div>
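                         <p class="p">A minimal sketch of such a check, using the <samp class="ph codeph">CU_POINTER_ATTRIBUTE_IS_MANAGED</samp> attribute available starting with CUDA 6.0 (the helper name is illustrative):</p><pre xml:space="preserve">
#include &lt;cuda.h&gt;

// Sketch: detect Unified Memory allocations so they can be
// excluded from GPUDirect RDMA pinning.
static int is_managed_memory(CUdeviceptr ptr)
{
    unsigned int is_managed = 0;
    CUresult status = cuPointerGetAttribute(&amp;is_managed,
        CU_POINTER_ATTRIBUTE_IS_MANAGED, ptr);
    return status == CUDA_SUCCESS &amp;&amp; is_managed;
}
</pre>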
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="design-considerations"><a name="design-considerations" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#design-considerations" name="design-considerations" shape="rect">2.&nbsp;Design Considerations</a></h2>
                  <div class="body conbody">
                     <p class="p">
                         When designing a system to utilize GPUDirect RDMA, there are a number of considerations
                         that should be taken into account.
                     </p>
                  </div>
                  <div class="topic concept nested1" id="lazy-unpinning-optimization"><a name="lazy-unpinning-optimization" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#lazy-unpinning-optimization" name="lazy-unpinning-optimization" shape="rect">2.1.&nbsp;Lazy Unpinning Optimization</a></h3>
                     <div class="body conbody">
                        <p class="p">
                            Pinning GPU device memory in BAR is an expensive operation, taking up to milliseconds.
                            Therefore, the application should be designed in a way that minimizes that overhead.
                           
                        </p>
                        <p class="p">
                           The most straightforward implementation using GPUDirect RDMA would pin memory
                           before each transfer and unpin it right after the transfer is complete.
                           Unfortunately, this would perform poorly in general, as pinning and unpinning
                           memory are expensive operations. The rest of the steps required to perform an RDMA transfer, however,
                           can be performed quickly without entering the kernel (the DMA list can be cached and
                           replayed using MMIO registers/command lists).
                           
                        </p>
                        <p class="p">
                            Hence, lazily unpinning memory is key to a high-performance RDMA implementation: the memory
                            is kept pinned even after the transfer has finished. This takes advantage of the fact
                            that the same memory region is likely to be used for future DMA transfers, so
                            lazy unpinning saves pin/unpin operations.
                           
                        </p>
                        <p class="p">
                           An example implementation of lazy unpinning would keep a set of pinned memory regions
                           and only unpin some of them (for example the least recently used one)
                           if the total size of the regions reached some threshold, or if pinning a new region failed
                           because of BAR space exhaustion (see <a class="xref" href="index.html#bar-sizes" shape="rect">PCI BAR sizes</a>).
                           
                        </p>
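                         <p class="p">The following userspace sketch illustrates such a scheme. The <samp class="ph codeph">pin_region()</samp> and <samp class="ph codeph">unpin_region()</samp> functions stand in for the driver-specific pin/unpin paths, the fixed budget is arbitrary, and lookup of already-pinned regions is omitted; all of these are assumptions made for illustration:</p><pre xml:space="preserve">
#include &lt;stdlib.h&gt;

// Illustrative assumptions: pin_region()/unpin_region() wrap the
// actual pin and unpin paths; the 64 MB budget is arbitrary.
extern int  pin_region(void *addr, size_t size);
extern void unpin_region(void *addr, size_t size);

struct pinned_region {
    void *addr;
    size_t size;
    struct pinned_region *next;   // singly linked, head = most recently used
};

static struct pinned_region *lru_head;
static size_t total_pinned;
static const size_t PIN_BUDGET = 64ul &lt;&lt; 20;

// Unpin the least recently used region (the list tail).
static void evict_lru(void)
{
    struct pinned_region **pp = &amp;lru_head;
    if (!*pp)
        return;
    while ((*pp)-&gt;next)
        pp = &amp;(*pp)-&gt;next;
    unpin_region((*pp)-&gt;addr, (*pp)-&gt;size);
    total_pinned -= (*pp)-&gt;size;
    free(*pp);
    *pp = NULL;
}

// Pin a region lazily: it stays pinned after the transfer, and LRU
// entries are evicted when over budget or when pinning fails
// because of BAR exhaustion.
static int pin_lazily(void *addr, size_t size)
{
    struct pinned_region *r;

    while (total_pinned + size &gt; PIN_BUDGET &amp;&amp; lru_head)
        evict_lru();
    while (pin_region(addr, size) != 0) {   // retry after freeing BAR space
        if (!lru_head)
            return -1;
        evict_lru();
    }
    r = malloc(sizeof(*r));
    if (!r) {
        unpin_region(addr, size);
        return -1;
    }
    r-&gt;addr = addr;
    r-&gt;size = size;
    r-&gt;next = lru_head;
    lru_head = r;
    total_pinned += size;
    return 0;
}
</pre>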
                     </div>
                  </div>
                  <div class="topic concept nested1" id="registration-cache"><a name="registration-cache" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#registration-cache" name="registration-cache" shape="rect">2.2.&nbsp;Registration Cache</a></h3>
                     <div class="body conbody">
                        <p class="p">
                           Communication middleware often employs an optimization called a registration cache, 
                           or pin-down cache, to minimize pinning overhead.
                           	Typically it already exists for host memory, implementing lazy unpinning, LRU de-registration, etc.
                           	For networking middleware, such caches are usually implemented in user-space, as they are used in combination
                           	with hardware capable of user-mode message injection.
                           	CUDA UVA memory address layout enables GPU memory pinning to work with these caches
                           	by taking into account just a few design considerations.
                           	In the CUDA environment, this is even more important as the amount of memory which can 
                           	be pinned may be significantly more constrained than for host memory.
                           
                        </p>
                        <p class="p">
                           	
                            As the GPU BAR space is typically mapped using 64KB pages, it is more resource-efficient 
                            to maintain a cache of regions rounded to the 64KB boundary.
                            In addition, pinning two memory areas which lie within the same 64KB boundary would 
                            allocate and return two distinct BAR regions mapping the same GPU device memory, 
                            so proper handling of alignment helps conserve GPU BAR space.
                           
                        </p>
                        <div class="p">
                           	
                           	
                           	Registration caches usually rely on the ability to intercept deallocation events happening
                           	in the user application, so that they can unpin the memory and free important HW resources,
                           	e.g. on the network card.
                           	To implement a similar mechanism for GPU memory, an implementation has two options:
                           	
                           <ul class="ul">
                              <li class="li">Instrument all CUDA allocation and deallocation APIs.</li>
                              <li class="li">Use a tag check function to track deallocation and reallocation.
                                 	See <a class="xref" href="index.html#invalidating-based-on-buffer-id" shape="rect">Buffer ID Tag Check for A Registration Cache</a>.
                                 	
                              </li>
                           </ul>
                        </div>
                        <div class="p">
                           	While intercepting CUDA APIs is beyond the scope of this document, an approach to performing tag checks is available starting
                           with CUDA 6.0.
                           	It involves the usage of the <samp class="ph codeph">CU_POINTER_ATTRIBUTE_BUFFER_ID</samp> attribute
                           	in <samp class="ph codeph">cuPointerGetAttribute()</samp> to detect memory buffer deallocations or
                           	reallocations. The API will return a different ID value in case of reallocation or 
                           	an error if the buffer address is no longer valid. 
                           	See <a class="xref" href="index.html#userspace-api" shape="rect">Userspace API</a> for API usage.
                           		
                           <div class="note note"><span class="notetitle">Note:</span> Using tag checks introduces an extra call into the CUDA API on each memory buffer use, 
                              	so this approach is most appropriate when the additional latency is not a concern.
                              		
                           </div>
                        </div>
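                         <p class="p">A sketch of such a tag check follows; the helper name is illustrative and the surrounding cache bookkeeping is omitted:</p><pre xml:space="preserve">
#include &lt;cuda.h&gt;

// Sketch of a registration-cache tag check: compare the buffer ID recorded
// at registration time with the current one before reusing a cached entry.
static int cache_entry_is_stale(CUdeviceptr ptr, unsigned long long cached_id)
{
    unsigned long long current_id;
    CUresult status = cuPointerGetAttribute(&amp;current_id,
        CU_POINTER_ATTRIBUTE_BUFFER_ID, ptr);
    if (status != CUDA_SUCCESS)
        return 1;   // buffer was freed; the cached entry is stale
    return current_id != cached_id;   // a new ID signals reallocation
}
</pre>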
                     </div>
                  </div>
                  <div class="topic concept nested1" id="unpin-callback"><a name="unpin-callback" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#unpin-callback" name="unpin-callback" shape="rect">2.3.&nbsp;Unpin Callback</a></h3>
                     <div class="body conbody">
                        <p class="p">
                           When a third party device driver pins the GPU pages with <samp class="ph codeph">nvidia_p2p_get_pages()</samp>
                           it must also provide a callback function that the NVIDIA driver will call if it needs
                           to revoke access to the mapping.
                           <strong class="ph b">This callback occurs synchronously</strong>, giving the third party driver the opportunity to clean up
                           and remove any references to the pages in question (i.e., wait for outstanding DMAs to complete).
                           <strong class="ph b">The user callback function may block for a few milliseconds</strong>,
                           although it is recommended that the callback complete as quickly as possible.
                            Care has to be taken not to introduce deadlocks, as waiting within the callback
                            for the GPU to do anything is not safe.
                           
                        </p>
                        <p class="p">
                           	The callback must call <samp class="ph codeph">nvidia_p2p_free_page_table()</samp> 
                           	(not <samp class="ph codeph">nvidia_p2p_put_pages()</samp>) to free
                           	the memory pointed to by <samp class="ph codeph">page_table</samp>.
                           	The corresponding mapped memory areas will only be unmapped by the NVIDIA driver after returning
                           	from the callback.
                           		
                        </p>
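                         <p class="p">A minimal sketch of such a callback, reusing the <samp class="ph codeph">struct kmd_state</samp> from the pinning example in <a class="xref" href="index.html#pinning-gpu-memory" shape="rect">Pinning GPU memory</a>; <samp class="ph codeph">wait_for_outstanding_dma()</samp> is a placeholder for the driver-specific quiescing logic:</p><pre xml:space="preserve">
// Sketch of the revocation callback registered via nvidia_p2p_get_pages().
void free_callback(void *data)
{
    struct kmd_state *my_state = data;

    // Quiesce the device: wait for outstanding DMAs that target this
    // mapping and drop all references to its pages (driver-specific).
    wait_for_outstanding_dma(my_state);

    // Free the page table with nvidia_p2p_free_page_table(), NOT
    // nvidia_p2p_put_pages(); the NVIDIA driver unmaps the memory
    // areas only after this callback returns.
    nvidia_p2p_free_page_table(my_state-&gt;page_table);
    my_state-&gt;page_table = NULL;
}
</pre>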
                        <div class="p">
                           Note that the callback will be invoked in two scenarios:
                           
                           <ul class="ul">
                              <li class="li"> If the userspace program explicitly deallocates the corresponding GPU memory,
                                 		 e.g. <samp class="ph codeph">cuMemFree</samp>, <samp class="ph codeph">cuCtxDestroy</samp>, etc.
                                 		 before the third party kernel driver has a chance to unpin the memory with
                                 <samp class="ph codeph">nvidia_p2p_put_pages()</samp>.
                                 		
                              </li>
                              <li class="li"> As a consequence of an early exit of the process.
                                 		
                              </li>
                           </ul>
                           
                            In the latter case there can be tear-down ordering issues between closing
                            the file descriptor of the third party
                            kernel driver and that of the NVIDIA kernel driver. If the file
                            descriptor for the NVIDIA kernel driver is closed first, the 
                            <samp class="ph codeph">nvidia_p2p_put_pages()</samp> callback will be invoked.
                           
                        </div>
                        <p class="p">
                            A proper software design is important, as the 
                            NVIDIA kernel driver will protect itself from reentrancy issues with locks before invoking the callback.
                            The third party kernel driver will almost certainly take similar actions, so 
                            deadlock or livelock scenarios may arise if careful consideration is not taken.
                           
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="supported-systems"><a name="supported-systems" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#supported-systems" name="supported-systems" shape="rect">2.4.&nbsp;Supported Systems</a></h3>
                     <div class="body conbody">
                        <p class="p">
                           Even though the only theoretical requirement for GPUDirect RDMA
                           to work between a third-party device and an NVIDIA GPU is that they share the same root complex,
                           there exist bugs (mostly in chipsets) causing it to perform badly, or not work at all in certain setups.
                           
                        </p>
                        <div class="p">
                           We can distinguish between three situations, depending on what is
                           on the path between the GPU and the third-party device:
                           
                           <ul class="ul">
                              <li class="li">PCIe switches only</li>
                              <li class="li">single CPU/IOH</li>
                              <li class="li">CPU/IOH &lt;-&gt; QPI/HT &lt;-&gt; CPU/IOH</li>
                           </ul>
                           
                           The first situation, where there are only PCIe switches on the path, is optimal and yields the best performance.
                            The second one, where a single CPU/IOH is involved, works, but yields worse performance
                            (in particular, peer-to-peer read bandwidth has been shown to be severely limited on some processor architectures).
                           Finally, the third situation, where the path traverses a QPI/HT link, doesn't work reliably.
                           
                        </div>
                        <div class="note tip"><span class="tiptitle">Tip:</span> lspci can be used to check the PCI topology: <pre xml:space="preserve">$ lspci -t</pre></div>
                        <div class="section">
                           <h3 class="title sectiontitle">IOMMUs</h3>
                           <p class="p">
                              GPUDirect RDMA currently relies upon all physical addresses being the same from the different PCI devices' point of view.
                              This makes it incompatible with IOMMUs performing any form of translation other than 1:1, hence they must be disabled or configured
                              for pass-through translation for GPUDirect RDMA to work.
                              
                           </p>
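                            <p class="p">How an IOMMU is disabled or configured for pass-through is platform and kernel specific. As an illustration, on Linux the kernel command line can be inspected for the relevant parameters, for example <samp class="ph codeph">intel_iommu=off</samp> (disables the Intel IOMMU) or <samp class="ph codeph">iommu=pt</samp> (pass-through translation); consult your platform documentation for the authoritative settings:</p><pre class="pre screen" xml:space="preserve">
$ cat /proc/cmdline
$ dmesg | grep -i iommu
</pre>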
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="bar-sizes"><a name="bar-sizes" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#bar-sizes" name="bar-sizes" shape="rect">2.5.&nbsp;PCI BAR sizes</a></h3>
                     <div class="body conbody">
                        <p class="p">PCI devices can ask the OS/BIOS to map a region of physical address space
                           	to them. These regions are commonly called <dfn class="term">BARs</dfn>.
                           NVIDIA GPUs currently expose multiple BARs, and some of them can back arbitrary 
                           device memory, making GPUDirect RDMA possible.
                           
                        </p>
                        <p class="p">
                           The maximum BAR size available for GPUDirect RDMA differs from GPU to GPU.
                           
                           For example, currently the smallest available BAR size on Kepler class GPUs is 256 MB.
                            Of that, 32 MB are currently reserved for internal use. These sizes may change.
                           
                        </p>
                        <p class="p">
                            On some Tesla-class GPUs a large BAR feature is enabled, e.g. BAR1 size is set to 16GB or larger.
                            Large BARs can pose a problem for the BIOS, especially on older motherboards, related 
                            to compatibility support for 32bit operating systems. 
                            On those motherboards the bootstrap can stop during the early POST phase,
                            or the GPU may be misconfigured and therefore unusable.
                            If this appears to be occurring it might be necessary to enable some 
                            special BIOS feature to deal with the large BAR issue.
                            Please consult your system vendor for more details regarding large BAR support.
                           
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="tokens-usage"><a name="tokens-usage" shape="rect">
                        <!-- --></a><h3 class="title topictitle2 preserve-case"><a href="#tokens-usage" name="tokens-usage" shape="rect">2.6.&nbsp;Tokens Usage</a></h3>
                     <div class="body conbody">
                        <div class="note warning"><span class="warningtitle">Warning:</span> Starting in CUDA 6.0, tokens should be considered deprecated, 
                           though they are still supported.
                        </div>
                        <p class="p">
                           As can be seen in <a class="xref" href="index.html#userspace-api" shape="rect">Userspace API</a> and <a class="xref" href="index.html#kernel-api" shape="rect">Kernel API</a>,
                           one method for pinning and unpinning memory requires two tokens in addition to the GPU virtual address.
                           
                        </p>
                        <p class="p">
                           These tokens, <samp class="ph codeph">p2pToken</samp> and <samp class="ph codeph">vaSpaceToken</samp>,
                           are necessary to uniquely identify a GPU VA space. A process identifier alone does not identify
                           a GPU VA space.
                           
                        </p>
                        <p class="p">
                           The tokens are consistent within a single CUDA context (i.e., all memory obtained through <samp class="ph codeph">cudaMalloc()</samp>
                           within the same CUDA context will have the same <samp class="ph codeph">p2pToken</samp> and <samp class="ph codeph">vaSpaceToken</samp>).
                           However, a given GPU virtual address need not map to the same context/GPU for its entire lifetime. As a
                           concrete example:
                           
                         </p><pre xml:space="preserve">cudaSetDevice(0);
ptr0 = cudaMalloc();
cuPointerGetAttribute(&amp;return_data, CU_POINTER_ATTRIBUTE_P2P_TOKENS, ptr0);
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// Returns [p2pToken = 0xabcd, vaSpaceToken = 0x1]</span>
cudaFree(ptr0);
cudaSetDevice(1);
ptr1 = cudaMalloc();
assert(ptr0 == ptr1);
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// The CUDA driver is free (although not guaranteed) to reuse the VA,</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// even on a different GPU</span>
cuPointerGetAttribute(&amp;return_data, CU_POINTER_ATTRIBUTE_P2P_TOKENS, ptr0);
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// Returns [p2pToken = 0x0123, vaSpaceToken = 0x2]</span></pre><p class="p">
                           That is, the same address, when passed to <samp class="ph codeph">cuPointerGetAttribute</samp>,
                           may return different tokens at different times during the program's execution. Therefore,
                           the third party communication library must call <samp class="ph codeph">cuPointerGetAttribute()</samp>
                           for every pointer it operates on.
                           
                        </p>
                        <div class="section">
                           <h3 class="title sectiontitle">Security implications</h3>
                           <p class="p">
                              The two tokens act as an authentication mechanism for the NVIDIA kernel driver.
                              If you know the tokens, you can map the address space corresponding to them,
                              and the NVIDIA kernel driver doesn't perform any additional checks.
                              The 64bit <samp class="ph codeph">p2pToken</samp> is randomized to prevent it from being guessed by an adversary.
                              
                           </p>
                           <p class="p">
                              When no tokens are used, the NVIDIA driver limits the <a class="xref" href="index.html#kernel-api" shape="rect">Kernel API</a>
                              to the process which owns the memory allocation.
                              
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" id="sync-behavior"><a name="sync-behavior" shape="rect">
                        <!-- --></a><h3 class="title topictitle2 preserve-case"><a href="#sync-behavior" name="sync-behavior" shape="rect">2.7.&nbsp;Synchronization and Memory Ordering</a></h3>
                     <div class="body conbody">
                        <div class="p">
                            GPUDirect RDMA introduces a new independent GPU data flow path exposed to third party devices, and it is important to understand
                            how these devices interact with the GPU's relaxed memory model.
                           
                           		
                           <ul class="ul">
                              <li class="li">Properly registering a BAR mapping of CUDA memory is required for that mapping to remain consistent with CUDA APIs operations
                                 on that memory.
                              </li>
                              <li class="li">Only CUDA synchronization and work submission APIs provide memory ordering of GPUDirect RDMA operations.</li>
                           </ul>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">Registration for CUDA API Consistency</h3>
                           <p class="p">
                               Registration is necessary to ensure that the CUDA API memory operations visible to a BAR mapping happen before the API call returns
                               control to the calling CPU thread. This provides a consistent view of memory to a device using GPUDirect RDMA mappings when
                               invoked after a CUDA API in the thread. This is a strictly more conservative mode of operation for the CUDA API and disables
                               optimizations, thus it may negatively impact performance.
                              	
                           </p>
                           <p class="p">
                               This behavior is enabled on a per-allocation granularity either by calling <samp class="ph codeph">cuPointerSetAttribute()</samp> with the <samp class="ph codeph">CU_POINTER_ATTRIBUTE_SYNC_MEMOPS</samp> attribute, or by retrieving the p2p tokens for a buffer when using the legacy path. See <a class="xref" href="index.html#userspace-api" shape="rect">Userspace API</a> for more details.
                              	
                           </p>
                           <p class="p">
                               An example situation would be a Read-after-Write dependency between a <samp class="ph codeph">cudaMemcpyDtoD()</samp> and a subsequent GPUDirect RDMA read operation on the destination of the copy. As an optimization, the device-to-device memory
                               copy typically returns asynchronously to the calling thread after queuing the copy to the GPU scheduler. However, in this
                               circumstance that would lead to inconsistent data being read via the BAR mapping, so this optimization is disabled and the copy is completed
                               before the CUDA API returns.
                              	
                           </p>
                        </div>
                        <div class="section">
                           <h3 class="title sectiontitle">CUDA APIs for Memory Ordering</h3>
                           <p class="p">
                               Only CPU-initiated CUDA APIs provide ordering of GPUDirect memory operations as observed by the GPU. That is, despite a
                               third party device having issued all PCIE transactions, a running GPU kernel or copy operation may observe stale data or data
                               that arrives out-of-order until a subsequent CPU-initiated CUDA work submission or synchronization API.
                              		To ensure that memory updates are visible to CUDA kernels or copies, an implementation should ensure that all writes to
                              the GPU BAR happen before control is returned to the CPU thread which will invoke the dependent CUDA API.
                              	
                           </p>
                           <p class="p">
                              		An example situation for a network communication scenario is when a network RDMA write operation is completed by the third
                              party network device and the data is written to the GPU BAR mapping.
                              
                               Although reading back the written data, either through the GPU BAR or
                               via a CUDA memory copy operation, will return the newly written data, a GPU kernel running concurrently with that network write
                               might observe stale data, partially written data, or data written out of order. 
                              	
                           </p>
                           <p class="p"></p>
                           
                           		In short, a GPU kernel is wholly inconsistent with concurrent RDMA for GPUDirect  operations and accessing the memory overwritten
                           by the third party device in such a situation would be considered a data race. To resolve this inconsistency and remove the
                           data race the DMA write operation must complete with respect to the CPU thread which will launch the dependent GPU kernel.
                           	
                        </div>
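                         <p class="p">An illustrative sketch of the receive path described above follows; <samp class="ph codeph">wait_for_nic_completion()</samp> and the consuming kernel are placeholders for driver- and application-specific code:</p><pre xml:space="preserve">
#include &lt;cuda_runtime.h&gt;

// Placeholders (illustrative): the third-party driver's completion wait
// and the kernel that consumes the received data.
extern void wait_for_nic_completion(void *request);
__global__ void consume_data(float *buf);

void on_receive(void *request, float *gpu_buffer, cudaStream_t stream)
{
    // The NIC's DMA write into the GPU BAR must be complete with respect
    // to this CPU thread before the dependent work is submitted.
    wait_for_nic_completion(request);
    // The CUDA work-submission API then provides the memory ordering
    // that makes the NIC's writes visible to the kernel.
    consume_data&lt;&lt;&lt;1, 256, 0, stream&gt;&gt;&gt;(gpu_buffer);
}
</pre>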
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="how-to-perform-specific-tasks"><a name="how-to-perform-specific-tasks" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#how-to-perform-specific-tasks" name="how-to-perform-specific-tasks" shape="rect">3.&nbsp;How to Perform Specific Tasks</a></h2>
                  <div class="topic task nested1" id="display-bar-space"><a name="display-bar-space" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#display-bar-space" name="display-bar-space" shape="rect">3.1.&nbsp;Displaying GPU BAR space</a></h3>
                     <div class="body taskbody">
                        <div class="li step p"><span class="ph cmd">
                              Starting in CUDA 6.0 the NVIDIA SMI utility provides the capability to dump
                              	    BAR1 memory usage.
                              	    It can be used to understand the application usage of BAR space, the primary resource consumed by GPUDirect RDMA mappings.
                              </span><pre class="pre screen" xml:space="preserve">
$ nvidia-smi -q
...
    BAR1 Memory Usage
        Total                       : 256 MiB
        Used                        : 2 MiB
        Free                        : 254 MiB
...

</pre>
                           
                           
                           
                           GPU memory is pinned in fixed size chunks, 
                           so the amount of space reflected here might be unexpected.
                           In addition, a certain amount of BAR space is reserved by the driver
                           for internal use, so not all available memory may be usable via GPUDirect RDMA.
                           Note that the same ability is offered programmatically through the 
                           <samp class="ph codeph">nvmlDeviceGetBAR1MemoryInfo()</samp> NVML API.
                           
                           
                        </div>
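                         <p class="p">A minimal sketch of the programmatic query through NVML (error handling abbreviated):</p><pre xml:space="preserve">
#include &lt;stdio.h&gt;
#include &lt;nvml.h&gt;

// Sketch: query BAR1 usage programmatically (mirrors "nvidia-smi -q" above).
int main(void)
{
    nvmlDevice_t dev;
    nvmlBAR1Memory_t bar1;

    if (nvmlInit() != NVML_SUCCESS)
        return 1;
    if (nvmlDeviceGetHandleByIndex(0, &amp;dev) == NVML_SUCCESS &amp;&amp;
        nvmlDeviceGetBAR1MemoryInfo(dev, &amp;bar1) == NVML_SUCCESS) {
        printf("BAR1 total: %llu MiB, used: %llu MiB, free: %llu MiB\n",
               bar1.bar1Total &gt;&gt; 20, bar1.bar1Used &gt;&gt; 20, bar1.bar1Free &gt;&gt; 20);
    }
    nvmlShutdown();
    return 0;
}
</pre>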
                     </div>
                  </div>
                  <div class="topic task nested1" id="pinning-gpu-memory"><a name="pinning-gpu-memory" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#pinning-gpu-memory" name="pinning-gpu-memory" shape="rect">3.2.&nbsp;Pinning GPU memory</a></h3>
                     <div class="body taskbody">
                        <ol class="ol steps">
                           <li class="li step"><span class="ph cmd">
                                 Correct behavior requires using <samp class="ph codeph">cuPointerSetAttribute()</samp> on the memory address to enable
                                 proper synchronization behavior in the CUDA driver.
                                 See section <a class="xref" href="index.html#sync-behavior" shape="rect">Synchronization and Memory Ordering</a>.
                                 </span><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> pin_buffer(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *address, size_t size)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">unsigned</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> flag = 1;
    CUresult status = cuPointerSetAttribute(&amp;flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, address);
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (CUDA_SUCCESS == status) {
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// GPU path</span>
        pass_to_kernel_driver(address, size);
    } <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">else</span> {
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// CPU path</span>
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// ...</span>
    }
}
                    </pre>
                              
                               This is required so that the CUDA driver treats the GPU memory buffer in a special
                               way, guaranteeing that CUDA memory transfers are always synchronous
                               with respect to the host. 
                              See <a class="xref" href="index.html#userspace-api" shape="rect">Userspace API</a> for details on <samp class="ph codeph">cuPointerSetAttribute()</samp>.
                              
                              
                           </li>
                           <li class="li step"><span class="ph cmd">In the kernel driver, invoke <samp class="ph codeph">nvidia_p2p_get_pages()</samp>.</span><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// for boundary alignment requirement</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#define GPU_BOUND_SHIFT   16</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#define GPU_BOUND_SIZE    ((u64)1 &lt;&lt; GPU_BOUND_SHIFT)</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#define GPU_BOUND_OFFSET  (GPU_BOUND_SIZE-1)</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-directive">#define GPU_BOUND_MASK    (~GPU_BOUND_OFFSET)</span>

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> kmd_state {
    nvidia_p2p_page_table_t *page_table;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// ...</span>
};

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> kmd_pin_memory(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> kmd_state *my_state, <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *address, size_t size)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (!size)
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> -EINVAL;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// do proper alignment, as required by the NVIDIA kernel driver</span>
    u64 virt_start = (u64)address &amp; GPU_BOUND_MASK;
    size_t pin_size = (u64)address + size - virt_start;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> ret = nvidia_p2p_get_pages(0, 0, virt_start, pin_size, &amp;my_state-&gt;page_table, free_callback, my_state);
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (ret == 0) {
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// Successfully pinned; page_table can be accessed</span>
    } <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">else</span> {
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// Pinning failed</span>
    }
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> ret;
}
                    </pre><p class="p">
                                 Note how the start address is aligned to a 64KB boundary before calling the
                                 pinning functions.
                                 
                              </p>
                              <p class="p">
                                 If the function succeeds the memory has been pinned and the <samp class="ph codeph">page_table</samp> entries
                                 can be used to program the device's DMA engine.
                                 See <a class="xref" href="index.html#kernel-api" shape="rect">Kernel API</a> for details on <samp class="ph codeph">nvidia_p2p_get_pages()</samp>.
                                 
                              </p>
                           </li>
                        </ol>
                     </div>
                  </div>
                  <div class="topic task nested1" id="unpinning-gpu-memory"><a name="unpinning-gpu-memory" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#unpinning-gpu-memory" name="unpinning-gpu-memory" shape="rect">3.3.&nbsp;Unpinning GPU memory</a></h3>
                     <div class="body taskbody">
                        <div class="li step p"><span class="ph cmd">
                              	In the kernel driver, invoke <samp class="ph codeph">nvidia_p2p_put_pages()</samp>.
                              </span><div class="p"><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> unpin_memory(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *aligned_address, nvidia_p2p_page_table_t *page_table)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// aligned_address must be the same 64KB-aligned address passed to nvidia_p2p_get_pages()</span>
    nvidia_p2p_put_pages(0, 0, (u64)aligned_address, page_table);
}
                    </pre>
                              See <a class="xref" href="index.html#kernel-api" shape="rect">Kernel API</a> for details on <samp class="ph codeph">nvidia_p2p_put_pages()</samp>.
                              		
                           </div>
                           <p class="p">
                               Starting with CUDA 6.0, zeros should be used as the token parameters.
                               Note that <samp class="ph codeph">nvidia_p2p_put_pages()</samp> must be called
                               from within the same process context as the one from which the corresponding
                               <samp class="ph codeph">nvidia_p2p_get_pages()</samp> call was issued.
                              
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic task nested1" id="handling-free-callback"><a name="handling-free-callback" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#handling-free-callback" name="handling-free-callback" shape="rect">3.4.&nbsp;Handling the free callback</a></h3>
                     <div class="body taskbody">
                        <ol class="ol steps">
                           <li class="li step"><span class="ph cmd">The NVIDIA kernel driver invokes <samp class="ph codeph">free_callback(data)</samp>
                                 as specified in the <samp class="ph codeph">nvidia_p2p_get_pages()</samp> call if it needs to revoke the mapping.
                                 See <a class="xref" href="index.html#kernel-api" shape="rect">Kernel API</a> and <a class="xref" href="index.html#unpin-callback" shape="rect">Unpin Callback</a> for details.
                                 </span></li>
                           <li class="li step"><span class="ph cmd">The callback waits for pending transfers and then cleans up the page table allocation.</span><div class="p"><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> free_callback(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *data)
{
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> kmd_state *state = data;
    wait_for_pending_transfers(state);
    nvidia_p2p_free_page_table(state-&gt;page_table);
}
                    </pre></div>
                           </li>
                           <li class="li step"><span class="ph cmd">The NVIDIA kernel driver handles the unmapping so <samp class="ph codeph">nvidia_p2p_put_pages()</samp> should not be called.</span></li>
                        </ol>
                     </div>
                  </div>
                  <div class="topic task nested1" id="invalidating-based-on-buffer-id"><a name="invalidating-based-on-buffer-id" shape="rect">
                         <!-- --></a><h3 class="title topictitle2"><a href="#invalidating-based-on-buffer-id" name="invalidating-based-on-buffer-id" shape="rect">3.5.&nbsp;Buffer ID Tag Check for a Registration Cache</a></h3>
                     <div class="body taskbody">
                        <div class="section context">
                            	Remember that a solution built around Buffer ID tag checking is not recommended for latency-sensitive implementations. 
                            	Instead, instrumenting the CUDA allocation and deallocation APIs to provide callbacks to the registration cache is recommended,
                            as this removes the tag-checking overhead from the critical path.
                           	
                        </div>
                        <ol class="ol steps">
                           <li class="li step"><span class="ph cmd">
                                  				The first time a device memory buffer is encountered and recognized as not yet pinned,
                                  				the pinned mapping is created and the associated buffer ID is retrieved and stored 
                                  				together in the cache entry.
                                  				The <samp class="ph codeph">cuMemGetAddressRange()</samp> function can be used to obtain the
                                  				size and starting address of the whole allocation, which can then be used
                                  				to pin it.
                                  				As <samp class="ph codeph">nvidia_p2p_get_pages()</samp> requires a pointer aligned to 64KB,
                                  				it is useful to directly align the cached address. Also, as the BAR space is
                                  				currently mapped in chunks of 64KB, it is more resource-efficient to round
                                  				the whole pinning to 64KB (see the sketch after these steps).
                                 				</span><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// struct buf represents an entry of the registration cache</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> buf {
	CUdeviceptr pointer;
	size_t      size;
	CUdeviceptr aligned_pointer;
	size_t      aligned_size;
	<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span>         is_pinned;
	uint64_t    id; <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// buffer id obtained right after pinning</span>
};

    				</pre></li>
                           <li class="li step"><span class="ph cmd">
                                  				Once created, every time a registration cache entry is used, it must
                                  				first be checked for validity. One way to do this is to use the Buffer ID provided 
                                  				by CUDA as a tag to check for deallocation or reallocation.
                                 				
                                 				</span><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> buf_is_gpu_pinning_valid(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> buf* buf) {
	uint64_t buffer_id;
	<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> retcode;
	assert(buf-&gt;is_pinned);
	<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// get the current buffer id</span>
	retcode = cuPointerGetAttribute(&amp;buffer_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, buf-&gt;pointer);
	<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (CUDA_ERROR_INVALID_VALUE == retcode) {
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// the device pointer is no longer valid</span>
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// it could have been deallocated</span>
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> ERROR_INVALIDATED;
	} <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">else</span> <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (CUDA_SUCCESS != retcode) {
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// handle more serious errors here</span>
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> ERROR_SERIOUS;
	}
	<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (buf-&gt;id != buffer_id)
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// the original buffer has been deallocated and the cached mapping should be invalidated and the buffer re-pinned</span>
		<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> ERROR_INVALIDATED;
	<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">return</span> 0;
}

					</pre><p class="p">
                                  				When the buffer identifier changes, the underlying memory buffer has been reallocated,
                                  				so the corresponding kernel-space page table is no longer valid.
                                  				In this case the kernel-space <samp class="ph codeph">nvidia_p2p_get_pages()</samp> free callback
                                  				would have been invoked. Thus the Buffer IDs provide a tag to keep the pin-down cache consistent with
                                  				the kernel-space page table without requiring the kernel driver to up-call into user space.
                                 				
                              </p>
                              <p class="p">
                                 				If <samp class="ph codeph">CUDA_ERROR_INVALID_VALUE</samp> is returned 
                                 				from <samp class="ph codeph">cuPointerGetAttribute()</samp>, the program should assume that the memory buffer
                                 				has been deallocated or is otherwise not a valid GPU memory buffer.
                                 				
                              </p>
                           </li>
                           <li class="li step"><span class="ph cmd">In both cases, the corresponding cache entry must be invalidated.</span><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-comment">// in the registration cache code</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">if</span> (buf-&gt;is_pinned &amp;&amp; !buf_is_gpu_pinning_valid(buf)) { 
  regcache_invalidate_entry(buf); 
  pin_buffer((<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *)buf-&gt;aligned_pointer, buf-&gt;aligned_size);
}

					</pre></li>
                        </ol>
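                        <p class="p">
                           As a concrete illustration of the first step above, the following sketch
                           fills the aligned fields of a cache entry before pinning. It assumes the 64KB
                           alignment macros from the pinning example are also defined in user space;
                           <samp class="ph codeph">buf_prepare_pinning()</samp> is a hypothetical helper, not part of any CUDA API.
                        </p><pre xml:space="preserve">
void buf_prepare_pinning(struct buf *b)
{
    CUdeviceptr base;
    size_t alloc_size;
    // retrieve the whole allocation containing b-&gt;pointer
    cuMemGetAddressRange(&amp;base, &amp;alloc_size, b-&gt;pointer);
    // align the start down and the end up to the 64KB granularity
    b-&gt;aligned_pointer = base &amp; GPU_BOUND_MASK;
    b-&gt;aligned_size    = ((base + alloc_size + GPU_BOUND_OFFSET) &amp; GPU_BOUND_MASK)
                         - b-&gt;aligned_pointer;
}
</pre>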
                     </div>
                  </div>
                  <div class="topic task nested1" id="linking-kernel-module-against-nvidia-ko"><a name="linking-kernel-module-against-nvidia-ko" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#linking-kernel-module-against-nvidia-ko" name="linking-kernel-module-against-nvidia-ko" shape="rect">3.6.&nbsp;Linking a Kernel Module against nvidia.ko</a></h3>
                     <div class="body taskbody">
                        <ol class="ol steps">
                           <li class="li step"><span class="ph cmd">Run the extraction script:</span><pre xml:space="preserve">./NVIDIA-Linux-x86_64-&lt;version&gt;.run –x</pre>
                              
                              This extracts the NVIDA driver and kernel wrapper.
                              </li>
                           <li class="li step"><span class="ph cmd">Navigate to the output directory:</span><pre xml:space="preserve">cd &lt;output directory&gt;/kernel/</pre></li>
                           <li class="li step"><span class="ph cmd">Within this directory, build the NVIDIA module for your kernel:</span><pre xml:space="preserve">make module</pre>
                              
                              
                              After this is done, the <samp class="ph codeph">Module.symvers</samp> file under your kernel build directory
                              contains symbol information for <samp class="ph codeph">nvidia.ko</samp>.
                              
                              
                           </li>
                           <li class="li step"><span class="ph cmd">Modify your kernel module build process with the following line:</span><pre xml:space="preserve">KBUILD_EXTRA_SYMBOLS := &lt;path to kernel build directory&gt;/Module.symvers</pre></li>
                        </ol>
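                        <p class="p">
                           As a minimal sketch (the module name <samp class="ph codeph">my_rdma_module</samp> and the
                           paths are placeholders, not part of the original instructions), an out-of-tree module
                           Makefile using this symbol information could look like:
                        </p><pre xml:space="preserve">
# hypothetical Makefile for a module that calls into nvidia.ko
obj-m := my_rdma_module.o

# point kbuild at the symbol information generated for nvidia.ko
KBUILD_EXTRA_SYMBOLS := /path/to/extracted/driver/kernel/Module.symvers

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
</pre>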
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="references"><a name="references" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#references" name="references" shape="rect">4.&nbsp;References</a></h2>
                  <div class="topic concept nested1" id="basics-of-uva-cuda-memory-management"><a name="basics-of-uva-cuda-memory-management" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#basics-of-uva-cuda-memory-management" name="basics-of-uva-cuda-memory-management" shape="rect">4.1.&nbsp;Basics of UVA CUDA Memory Management</a></h3>
                     <div class="body conbody">
                        <p class="p">Unified virtual addressing (UVA) is a memory address management system enabled by default in CUDA 4.0 and
                           later releases on Fermi and Kepler GPUs running 64-bit processes. The design of UVA memory management
                           provides a basis for the operation of GPUDirect RDMA. On UVA-supported configurations, when the CUDA runtime
                           initializes, the virtual address (VA) range of the application is partitioned into two areas: the
                           CUDA-managed VA range and the OS-managed VA range. All CUDA-managed pointers are within the
                           CUDA-managed VA range, and this range always falls within the first 40 bits of the process's VA space.
                        </p>
                        <div class="fig fignone" id="basics-of-uva-cuda-memory-management__cuda-va-space-addressing"><a name="basics-of-uva-cuda-memory-management__cuda-va-space-addressing" shape="rect">
                              <!-- --></a><span class="figcap">Figure 2. CUDA VA Space Addressing</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/cuda-va-space-addressing.png" alt="CUDA VA Space Addressing."></img></div><br clear="none"></br></div>
                        <p class="p">Subsequently, within the CUDA VA space, addresses can be subdivided into three types:</p>
                        <dl class="dl">
                           <dt class="dt dlterm">GPU</dt>
                           <dd class="dd">A page backed by GPU memory. This will not be accessible from the host and the VA in question will
                              never have a physical backing on the host. Dereferencing a pointer to a GPU VA from the CPU will trigger
                              a segfault.
                           </dd>
                           <dt class="dt dlterm">CPU</dt>
                           <dd class="dd">A page backed by CPU memory. This will be accessible from both the host and the GPU at the same VA.</dd>
                           <dt class="dt dlterm">FREE</dt>
                           <dd class="dd">These VAs are reserved by CUDA for future allocations.</dd>
                        </dl>
                        <p class="p">This partitioning allows the CUDA runtime to determine the physical location of a memory object by its
                           pointer value within the reserved CUDA VA space.
                        </p>
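                        <p class="p">
                           As an illustrative sketch (error handling shortened), an application can query which
                           of these categories a given address falls into through the driver API
                           <samp class="ph codeph">CU_POINTER_ATTRIBUTE_MEMORY_TYPE</samp> attribute:
                        </p><pre xml:space="preserve">
    CUmemorytype mem_type;
    CUresult status = cuPointerGetAttribute(&amp;mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
    if (CUDA_SUCCESS == status) {
        // CU_MEMORYTYPE_DEVICE: a GPU page; CU_MEMORYTYPE_HOST: a CPU page
    } else {
        // pointer does not fall within the CUDA VA space
    }
</pre>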
                        <p class="p">
                           Addresses are subdivided into these categories at page granularity; all memory within a page is of the same
                           type. Note that GPU pages may not be the same size as CPU pages.
                           The CPU pages are usually 4KB and the GPU pages on Kepler-class GPUs are 64KB.
                           GPUDirect RDMA operates exclusively on GPU
                           pages (created by <samp class="ph codeph">cudaMalloc()</samp>) that are within this CUDA VA space.
                           
                        </p>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="userspace-api"><a name="userspace-api" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#userspace-api" name="userspace-api" shape="rect">4.2.&nbsp;Userspace API</a></h3>
                     <div class="body refbody">
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Data structures</h3><pre xml:space="preserve">
typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
    unsigned long long p2pToken;
    unsigned int vaSpaceToken;
} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
    </pre></div>
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Function <span class="keyword apiname">cuPointerSetAttribute()</span></h3><pre xml:space="preserve">CUresult cuPointerSetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr pointer);</pre>
                           In GPUDirect RDMA scope, the interesting usage is when 
                           <samp class="ph codeph">CU_POINTER_ATTRIBUTE_SYNC_MEMOPS</samp> is passed as the <samp class="ph codeph">attribute</samp>:
                           
                           <pre xml:space="preserve">
    unsigned int flag = 1;
    cuPointerSetAttribute(&amp;flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, pointer);
    </pre><div class="p"><strong class="ph b">Parameters</strong><dl class="dl parml">
                                 <dt class="dt pt dlterm"><span class="keyword parmname">data</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer to a <samp class="ph codeph">unsigned int</samp> variable containing a boolean value.
                                 </dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">attribute</span> [in]
                                 </dt>
                                 <dd class="dd pd">In GPUDirect RDMA scope should always be <samp class="ph codeph">CU_POINTER_ATTRIBUTE_SYNC_MEMOPS</samp>.
                                 </dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">pointer</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer.</dd>
                              </dl>
                           </div>
                           <div class="p"><strong class="ph b">Returns</strong><dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">CUDA_SUCCESS</samp></dt>
                                 <dd class="dd">if pointer points to GPU memory and the CUDA driver was able to set the new behavior for the whole device memory allocation.</dd>
                                 <dt class="dt dlterm">anything else</dt>
                                 <dd class="dd">if pointer points to CPU memory.</dd>
                              </dl>
                           </div>
                           
                           
                            	It is used to explicitly enable a strictly synchronizing behavior on the whole 
                            	memory allocation pointed to by <samp class="ph codeph">pointer</samp>, and by doing so it disables
                            	all data transfer optimizations that might create problems with concurrent RDMA and 
                            	CUDA memory copy operations.
                            	This API has CUDA synchronizing behavior, so it should be considered expensive and
                            	should be invoked only once per buffer.
                           	
                           
                        </div>
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Function <span class="keyword apiname">cuPointerGetAttribute()</span></h3><pre xml:space="preserve">CUresult cuPointerGetAttribute(const void *data, CUpointer_attribute attribute, CUdeviceptr pointer);</pre><p class="p">
                              	This function has two different attributes related to GPUDirect RDMA: 
                              	<samp class="ph codeph">CU_POINTER_ATTRIBUTE_P2P_TOKENS</samp> and <samp class="ph codeph">CU_POINTER_ATTRIBUTE_BUFFER_ID</samp>.
                              	
                           </p>
                           <div class="note warning"><span class="warningtitle">Warning:</span> CU_POINTER_ATTRIBUTE_P2P_TOKENS has been deprecated in CUDA 6.0
                           </div>
                           
                           	
                           When <samp class="ph codeph">CU_POINTER_ATTRIBUTE_P2P_TOKENS</samp> is passed as the <samp class="ph codeph">attribute</samp>,
                           <samp class="ph codeph">data</samp> is a pointer to <samp class="ph codeph">CUDA_POINTER_ATTRIBUTE_P2P_TOKENS</samp>:
                           
                           <pre xml:space="preserve">
    CUDA_POINTER_ATTRIBUTE_P2P_TOKENS tokens;
    cuPointerGetAttribute(&amp;tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, pointer);
    </pre>
                           
                           In this case, the function returns two tokens for use with the <a class="xref" href="index.html#kernel-api" shape="rect">Kernel API</a>.
                           
                           
                           <div class="p"><strong class="ph b">Parameters</strong><dl class="dl parml">
                                 <dt class="dt pt dlterm"><span class="keyword parmname">data</span> [out]
                                 </dt>
                                 <dd class="dd pd">Struct <samp class="ph codeph">CUDA_POINTER_ATTRIBUTE_P2P_TOKENS</samp> with the two tokens.
                                 </dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">attribute</span> [in]
                                 </dt>
                                 <dd class="dd pd">In GPUDirect RDMA scope should always be <samp class="ph codeph">CU_POINTER_ATTRIBUTE_P2P_TOKENS</samp>.
                                 </dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">pointer</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer.</dd>
                              </dl>
                           </div>
                           <div class="p"><strong class="ph b">Returns</strong><dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">CUDA_SUCCESS</samp></dt>
                                 <dd class="dd">if pointer points to GPU memory.</dd>
                                 <dt class="dt dlterm">anything else</dt>
                                 <dd class="dd">if pointer points to CPU memory.</dd>
                              </dl>
                           </div>
                           <p class="p">
                               This function may be called at any time, including before CUDA initialization,
                               and it has CUDA synchronizing behavior, as with <samp class="ph codeph">CU_POINTER_ATTRIBUTE_SYNC_MEMOPS</samp>,
                               so it should be considered expensive and should be invoked only once per buffer.
                              
                           </p>
                           <p class="p">
                               Note that values set in <samp class="ph codeph">tokens</samp> can be different for the same <samp class="ph codeph">pointer</samp>
                               value during the lifetime of a user-space program. See <a class="xref" href="index.html#tokens-usage" shape="rect">Tokens Usage</a> for a concrete example.
                              
                           </p>
                           <p class="p">
                              Note that for security reasons the value set in <samp class="ph codeph">p2pToken</samp> will be randomized,
                              to prevent it from being guessed by an adversary.
                              
                           </p>
                           <p class="p">
                              In CUDA 6.0, a new attribute has been introduced that is useful to detect memory reallocations.
                              	
                           </p>
                           
                           	
                           When <samp class="ph codeph">CU_POINTER_ATTRIBUTE_BUFFER_ID</samp> is passed as the <samp class="ph codeph">attribute</samp>, 
                           <samp class="ph codeph">data</samp> is expected to point to a 64bit unsigned integer variable, 
                           like <samp class="ph codeph">uint64_t</samp>.
                           <pre xml:space="preserve">
    uint64_t buf_id;
    cuPointerGetAttribute(&amp;buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, pointer);
    </pre><div class="p"><strong class="ph b">Parameters</strong><dl class="dl parml">
                                 <dt class="dt pt dlterm"><span class="keyword parmname">data</span> [out]
                                 </dt>
                                 <dd class="dd pd">A pointer to a 64 bits variable where the buffer id will be stored.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">attribute</span> [in]
                                 </dt>
                                 <dd class="dd pd">The <samp class="ph codeph">CU_POINTER_ATTRIBUTE_BUFFER_ID</samp> enumerator.
                                 </dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">pointer</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer to GPU memory.</dd>
                              </dl>
                           </div>
                           <div class="p"><strong class="ph b">Returns</strong><dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">CUDA_SUCCESS</samp></dt>
                                 <dd class="dd">if pointer points to GPU memory.</dd>
                                 <dt class="dt dlterm">anything else</dt>
                                 <dd class="dd">if pointer points to CPU memory.</dd>
                              </dl>
                           </div>
                           <div class="p">
                              	Some general remarks follow:
                              
                              <ul class="ul">
                                 <li class="li"><samp class="ph codeph">cuPointerGetAttribute()</samp> and
                                    		<samp class="ph codeph">cuPointerSetAttribute()</samp> are CUDA driver API functions only.
                                    		
                                 </li>
                                 <li class="li">
                                     		In particular, <samp class="ph codeph">cuPointerGetAttribute()</samp> is not equivalent to <samp class="ph codeph">cudaPointerGetAttributes()</samp>,
                                     		as the required functionality is only present in the former function.
                                     		This in no way limits the scope where GPUDirect RDMA may be used, as
                                     		<samp class="ph codeph">cuPointerGetAttribute()</samp> works on pointers obtained through the CUDA Runtime API.
                                    	   	
                                 </li>
                                 <li class="li">
                                     	    No runtime API equivalent to <samp class="ph codeph">cuPointerGetAttribute()</samp> is provided,
                                     		as the additional overhead of the runtime-API-to-driver-API call sequence
                                     		would be unnecessary, and <samp class="ph codeph">cuPointerGetAttribute()</samp>
                                     		can be on the critical path in communication libraries.
                                    	    
                                 </li>
                              </ul>
                           </div>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="kernel-api"><a name="kernel-api" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#kernel-api" name="kernel-api" shape="rect">4.3.&nbsp;Kernel API</a></h3>
                     <div class="body refbody">
                        <div class="section">
                            The following declarations can be found in the <samp class="ph codeph">nv-p2p.h</samp> header,
                            which is distributed in the NVIDIA Driver package.
                           
                        </div>
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Data structures</h3><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">typedef</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> nvidia_p2p_page {
    uint64_t physical_address;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">union</span> nvidia_p2p_request_registers {
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> {
            uint32_t wreqmb_h;
            uint32_t rreqmb_h;
            uint32_t rreqmb_0;
            uint32_t reserved[3];
        } fermi;
    } registers;
} nvidia_p2p_page_t;
    </pre>
                           In <samp class="ph codeph">nvidia_p2p_page</samp> only the <samp class="ph codeph">physical_address</samp> is relevant to GPUDirect RDMA.
                           
                           <pre xml:space="preserve">
#define NVIDIA_P2P_PAGE_TABLE_VERSION   0x00010001

<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">typedef</span>
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> nvidia_p2p_page_table {
    uint32_t version;
    uint32_t page_size;
    <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> nvidia_p2p_page **pages;
    uint32_t entries;
} nvidia_p2p_page_table_t;
    </pre><div class="p">
                               <strong class="ph b">Fields</strong>
                              
                              <dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">version</samp></dt>
                                 <dd class="dd">the version of the page table; should be compared to <samp class="ph codeph">NVIDIA_P2P_PAGE_TABLE_VERSION</samp>
                                    before accessing the other fields
                                 </dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">page_size</samp></dt>
                                 <dd class="dd">the GPU page size</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">pages</samp></dt>
                                 <dd class="dd">the page table entries</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">entries</samp></dt>
                                 <dd class="dd">number of the page table entries</dd>
                              </dl>
                           </div>
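                           <p class="p">
                              As an illustrative sketch (<samp class="ph codeph">program_dma_chunk()</samp> stands for a
                              device-specific routine and is not part of this API; the <samp class="ph codeph">GPU_BOUND_SIZE</samp>
                              macro is reused from the pinning example), a kernel driver could walk
                              the page table returned by <samp class="ph codeph">nvidia_p2p_get_pages()</samp> as follows:
                           </p><pre xml:space="preserve">
int setup_dma(nvidia_p2p_page_table_t *page_table)
{
    uint32_t i;
    // check that the NVIDIA driver and this module agree on the struct layout
    if (page_table-&gt;version != NVIDIA_P2P_PAGE_TABLE_VERSION)
        return -EINVAL;
    for (i = 0; i &lt; page_table-&gt;entries; ++i) {
        // each entry describes one GPU page; on Kepler-class GPUs these are 64KB
        uint64_t bus_addr = page_table-&gt;pages[i]-&gt;physical_address;
        program_dma_chunk(bus_addr, GPU_BOUND_SIZE);
    }
    return 0;
}
</pre>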
                        </div>
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Function <span class="keyword apiname">nvidia_p2p_get_pages()</span></h3><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> nvidia_p2p_get_pages(uint64_t p2p_token, uint32_t va_space_token,
                uint64_t virtual_address,
                uint64_t length,
                <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> nvidia_p2p_page_table **page_table,
                <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> (*free_callback)(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *data),
                <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">void</span> *data);
   </pre>
                           This function makes the pages underlying a range of GPU virtual memory accessible to a third-party device.
                           
                           <div class="p"><strong class="ph b">Parameters</strong><dl class="dl parml">
                                 <dt class="dt pt dlterm"><span class="keyword parmname">p2p_token</span> [in][deprecated]
                                 </dt>
                                 <dd class="dd pd">A token that uniquely identifies the P2P mapping or zero.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">va_space_token</span> [in][deprecated]
                                 </dt>
                                 <dd class="dd pd">A GPU virtual address space qualifier or zero.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">virtual_address</span> [in]
                                 </dt>
                                 <dd class="dd pd">The start address in the specified virtual address space. Has to be aligned to 64K.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">length</span> [in]
                                 </dt>
                                 <dd class="dd pd">The length of the requested P2P mapping.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">page_table</span> [out]
                                 </dt>
                                 <dd class="dd pd">A pointer to an array of structures with P2P PTEs. Cannot be NULL.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">free_callback</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer to the function to be invoked if the pages underlying the virtual address range
                                    are freed implicitly. Cannot be NULL.
                                 </dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">data</span> [in]
                                 </dt>
                                 <dd class="dd pd">An opaque pointer to private data to be passed to the callback function.</dd>
                              </dl>
                           </div>
                           <div class="p"><strong class="ph b">Returns</strong><dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">0</samp></dt>
                                 <dd class="dd">upon successful completion.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-EINVAL</samp></dt>
                                 <dd class="dd">if an invalid argument was supplied.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-ENOTSUPP</samp></dt>
                                 <dd class="dd">if the requested operation is not supported.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-ENOMEM</samp></dt>
                                 <dd class="dd">if the driver failed to allocate memory or if insufficient resources were available to complete the operation.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-EIO</samp></dt>
                                 <dd class="dd">if an unknown error occurred.</dd>
                              </dl>
                           </div>
                           <div class="note warning"><span class="warningtitle">Warning:</span> This is an expensive operation and should be performed as infrequently as possible - see <a class="xref" href="index.html#lazy-unpinning-optimization" shape="rect">Lazy Unpinning Optimization</a>.
                           </div>
                        </div>
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Function <span class="keyword apiname">nvidia_p2p_put_pages()</span></h3><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> nvidia_p2p_put_pages(uint64_t p2p_token, uint32_t va_space_token,
        uint64_t virtual_address,
        <span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> nvidia_p2p_page_table *page_table);
    </pre>
                           
                            This function releases a set of pages previously made accessible to a third-party device.
                            <div class="note warning"><span class="warningtitle">Warning:</span> This function is not meant to be called from within the <samp class="ph codeph">nvidia_p2p_get_pages()</samp> free callback.
                            </div>
                           
                           
                           <div class="p"><strong class="ph b">Parameters</strong><dl class="dl parml">
                                 <dt class="dt pt dlterm"><span class="keyword parmname">p2p_token</span> [in][deprecated]
                                 </dt>
                                 <dd class="dd pd">A token that uniquely identifies the P2P mapping or zero.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">va_space_token</span> [in][deprecated]
                                 </dt>
                                 <dd class="dd pd">A GPU virtual address space qualifier or zero.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">virtual_address</span> [in]
                                 </dt>
                                 <dd class="dd pd">The start address in the specified virtual address space.</dd>
                                 <dt class="dt pt dlterm"><span class="keyword parmname">page_table</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer to an array of structures with P2P PTEs.</dd>
                              </dl>
                           </div>
                           <div class="p"><strong class="ph b">Returns</strong><dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">0</samp></dt>
                                 <dd class="dd">upon successful completion.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-EINVAL</samp></dt>
                                 <dd class="dd">if an invalid argument was supplied.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-EIO</samp></dt>
                                 <dd class="dd">if an unknown error occurred.</dd>
                              </dl>
                           </div>
                        </div>
                        <div class="section refsyn">
                           <h3 class="title sectiontitle">Function <span class="keyword apiname">nvidia_p2p_free_page_table()</span></h3><pre xml:space="preserve">
<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">int</span> nvidia_p2p_free_page_table(<span xmlns:xslthl="http://xslthl.sf.net" class="xslthl-keyword">struct</span> nvidia_p2p_page_table *page_table);
    </pre>
                           
                           This function frees a third-party P2P page table and is meant to be invoked
                           during the execution of the <samp class="ph codeph">nvidia_p2p_get_pages()</samp> callback.
                           
                           
                           <div class="p"><strong class="ph b">Parameters</strong><dl class="dl parml">
                                 <dt class="dt pt dlterm"><span class="keyword parmname">page_table</span> [in]
                                 </dt>
                                 <dd class="dd pd">A pointer to an array of structures with P2P PTEs.</dd>
                              </dl>
                           </div>
                           <div class="p"><strong class="ph b">Returns</strong><dl class="dl">
                                 <dt class="dt dlterm"><samp class="ph codeph">0</samp></dt>
                                 <dd class="dd">upon successful completion.</dd>
                                 <dt class="dt dlterm"><samp class="ph codeph">-EINVAL</samp></dt>
                                 <dd class="dd">if an invalid argument was supplied.</dd>
                              </dl>
                           </div>
                        </div>
                     </div>
                  </div>
               </div>
               <div class="topic concept nested0" id="notices-header"><a name="notices-header" shape="rect">
                     <!-- --></a><h2 class="title topictitle1"><a href="#notices-header" name="notices-header" shape="rect">Notices</a></h2>
                  <div class="topic reference nested1" id="notice"><a name="notice" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#notice" name="notice" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Notice</h3>
                           <p class="p">ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS, LISTS, AND OTHER DOCUMENTS (TOGETHER AND
                              SEPARATELY, "MATERIALS") ARE BEING PROVIDED "AS IS." NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE
                              WITH RESPECT TO THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS
                              FOR A PARTICULAR PURPOSE. 
                           </p>
                           <p class="p">Information furnished is believed to be accurate and reliable. However, NVIDIA Corporation assumes no responsibility for the
                              consequences of use of such information or for any infringement of patents or other rights of third parties that may result
                               from its use. No license is granted by implication or otherwise under any patent rights of NVIDIA Corporation. Specifications
                              mentioned in this publication are subject to change without notice. This publication supersedes and replaces all other information
                              previously supplied. NVIDIA Corporation products are not authorized as critical components in life support devices or systems
                              without express written approval of NVIDIA Corporation.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="trademarks"><a name="trademarks" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#trademarks" name="trademarks" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Trademarks</h3>
                           <p class="p">NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation
                              in the U.S. and other countries.  Other company and product names may be trademarks of
                              the respective companies with which they are associated.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic reference nested1" id="copyright-past-to-present"><a name="copyright-past-to-present" shape="rect">
                        <!-- --></a><h3 class="title topictitle2"><a href="#copyright-past-to-present" name="copyright-past-to-present" shape="rect"></a></h3>
                     <div class="body refbody">
                        <div class="section">
                           <h3 class="title sectiontitle">Copyright</h3>
                           <p class="p">© <span class="ph">2007</span>-<span class="ph">2014</span> NVIDIA
                              Corporation. All rights reserved.
                           </p>
                           <p class="p">This product includes software developed by the Syncro Soft SRL (http://www.sync.ro/).</p>
                        </div>
                     </div>
                  </div>
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script></body>
</html>