Sophie: nvidia-cuda-toolkit-devel-6.5.14-6.1.mga5.nonfree x86

nvidia-cuda-toolkit-devel-6.5.14-6.1.mga5.nonfree.x86_64.rpm

<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-us" xml:lang="en-us">
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
      <meta http-equiv="X-UA-Compatible" content="IE=edge"></meta>
      <meta name="copyright" content="(C) Copyright 2005"></meta>
      <meta name="DC.rights.owner" content="(C) Copyright 2005"></meta>
      <meta name="DC.Type" content="concept"></meta>
      <meta name="DC.Title" content="Device API Overview"></meta>
      <meta name="DC.Format" content="XHTML"></meta>
      <meta name="DC.Identifier" content="device-api-overview"></meta>
      <meta name="DC.Language" content="en-us"></meta>
      <link rel="stylesheet" type="text/css" href="../common/formatting/commonltr.css"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/site.css"></link>
      <title>cuRAND :: CUDA Toolkit Documentation</title>
      <!--[if lt IE 9]>
      <script src="../common/formatting/html5shiv-printshiv.min.js"></script>
      <![endif]-->
      <script type="text/javascript" charset="utf-8" src="../common/scripts/tynt/tynt.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.ba-hashchange.min.js"></script>
      <script type="text/javascript" charset="utf-8" src="../common/formatting/jquery.scrollintoview.min.js"></script>
      <script type="text/javascript" src="../search/htmlFileList.js"></script>
      <script type="text/javascript" src="../search/htmlFileInfoList.js"></script>
      <script type="text/javascript" src="../search/nwSearchFnt.min.js"></script>
      <script type="text/javascript" src="../search/stemmers/en_stemmer.min.js"></script>
      <script type="text/javascript" src="../search/index-1.js"></script>
      <script type="text/javascript" src="../search/index-2.js"></script>
      <script type="text/javascript" src="../search/index-3.js"></script>
      <link rel="canonical" href="http://docs.nvidia.com/cuda/curand/index.html"></link>
      <link rel="stylesheet" type="text/css" href="../common/formatting/qwcode.highlight.css"></link>
   </head>
   <body>
      
      <header id="header"><span id="company">NVIDIA</span><span id="site-title">CUDA Toolkit Documentation</span><form id="search" method="get" action="search">
            <input type="text" name="search-text"></input><fieldset id="search-location">
               <legend>Search In:</legend>
               <label><input type="radio" name="search-type" value="site"></input>Entire Site</label>
               <label><input type="radio" name="search-type" value="document"></input>Just This Document</label></fieldset>
            <button type="reset">clear search</button>
            <button id="submit" type="submit">search</button></form>
      </header>
      <div id="site-content">
         <nav id="site-nav">
            <div class="category closed"><a href="../index.html" title="The root of the site.">CUDA Toolkit
                  v6.5</a></div>
            <div class="category"><a href="index.html" title="cuRAND">cuRAND</a></div>
            <ul>
               <li>
                  <div class="section-link"><a href="introduction.html#introduction">Introduction</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="compatibility-and-versioning.html#compatibility-and-versioning">1.&nbsp;Compatibility and Versioning</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="host-api-overview.html#host-api-overview">2.&nbsp;Host API Overview</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#generator-types">2.1.&nbsp;Generator Types</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#generator-options">2.2.&nbsp;Generator Options</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="host-api-overview.html#seed">2.2.1.&nbsp;Seed</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="host-api-overview.html#offset">2.2.2.&nbsp;Offset</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="host-api-overview.html#order">2.2.3.&nbsp;Order</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#return-values">2.3.&nbsp;Return Values</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#generation-functions">2.4.&nbsp;Generation Functions</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#host-api-example">2.5.&nbsp;Host API Example</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#static-library">2.6.&nbsp;Static Library support</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="host-api-overview.html#performance-notes2">2.7.&nbsp;Performance Notes</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="device-api-overview.html#device-api-overview">3.&nbsp;Device API Overview</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#pseudorandom-sequences">3.1.&nbsp;Pseudorandom Sequences</a></div>
                        <ul>
                           <li>
                              <div class="section-link"><a href="device-api-overview.html#bit-generation-1">3.1.1.&nbsp;Bit Generation with XORWOW and MRG32k3a generators</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="device-api-overview.html#bit-generation-2">3.1.2.&nbsp;Bit Generation with the MTGP32 generator</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="device-api-overview.html#bit-generation-3">3.1.3.&nbsp;Bit Generation with Philox_4x32_10 generator</a></div>
                           </li>
                           <li>
                              <div class="section-link"><a href="device-api-overview.html#distributions">3.1.4.&nbsp;Distributions</a></div>
                           </li>
                        </ul>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#quasirandom-sequences">3.2.&nbsp;Quasirandom Sequences</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#skip-ahead">3.3.&nbsp;Skip-Ahead</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#device-api-for-discrete-distributions">3.4.&nbsp;Device API for discrete distributions</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#performance-notes">3.5.&nbsp;Performance Notes</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#device-api-example">3.6.&nbsp;Device API Examples</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#thrust-and-curand-example">3.7.&nbsp;Thrust and cuRAND Example</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="device-api-overview.html#poisson-api-example">3.8.&nbsp;Poisson API Example</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="testing.html#testing">4.&nbsp;Testing</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="modules.html#modules">5.&nbsp;Modules</a></div>
                  <ul>
                     <li>
                        <div class="section-link"><a href="group__HOST.html#group__HOST">5.1.&nbsp;Host API</a></div>
                     </li>
                     <li>
                        <div class="section-link"><a href="group__DEVICE.html#group__DEVICE">5.2.&nbsp;Device API</a></div>
                     </li>
                  </ul>
               </li>
               <li>
                  <div class="section-link"><a href="bibliography.html#bibliography">A.&nbsp;Bibliography</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="acknowledgements.html#acknowledgements">B.&nbsp;Acknowledgements</a></div>
               </li>
               <li>
                  <div class="section-link"><a href="notices-header.html#notices-header">Notices</a></div>
                  <ul></ul>
               </li>
            </ul>
         </nav>
         <div id="resize-nav"></div>
         <nav id="search-results">
            <h2>Search Results</h2>
            <ol></ol>
         </nav>
         
         <div id="contents-container">
            <div id="breadcrumbs-container">
               <div id="eqn-warning">This document includes math equations
                  (highlighted in red) which are best viewed with <a target="_blank" href="https://www.mozilla.org/firefox">Firefox</a> version 4.0
                  or higher, or another <a target="_blank" href="http://www.w3.org/Math/Software/mathml_software_cat_browsers.html">MathML-aware
                     browser</a>. There is also a <a href="../../pdf/CURAND_Library.pdf">PDF version of this document</a>.
                  
               </div>
               <div id="breadcrumbs"><a href="host-api-overview.html" shape="rect">&lt; Previous</a> | <a href="testing.html" shape="rect">Next &gt;</a></div>
               <div id="release-info">cuRAND
                  (<a href="../../pdf/CURAND_Library.pdf">PDF</a>)
                  -
                  
                  v6.5
                  (<a href="https://developer.nvidia.com/cuda-toolkit-archive">older</a>)
                  -
                  Last updated August 1, 2014
                  -
                  <a href="mailto:cudatools@nvidia.com?subject=CUDA Toolkit Documentation Feedback: cuRAND">Send Feedback</a>
                  -
                  <span class="st_facebook"></span><span class="st_twitter"></span><span class="st_linkedin"></span><span class="st_reddit"></span><span class="st_slashdot"></span><span class="st_tumblr"></span><span class="st_sharethis"></span></div>
            </div>
            <article id="contents">
               <div class="topic nested1" id="device-api-overview"><a name="device-api-overview" shape="rect">
                     <!-- --></a><h2 class="topictitle2">3.&nbsp;Device API Overview</h2>
                  <div class="body conbody">
                     <p class="p">To use the device API, include the file <samp class="ph codeph">curand_kernel.h</samp> in files that define kernels that use cuRAND device functions. The device API includes functions <a class="xref" href="device-api-overview.html#pseudorandom-sequences" shape="rect">pseudorandom generation</a> for and <a class="xref" href="device-api-overview.html#quasirandom-sequences" shape="rect">quasirandom generation</a>.
                     </p>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="pseudorandom-sequences"><a name="pseudorandom-sequences" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.1.&nbsp;Pseudorandom Sequences</h3>
                     <div class="body conbody">
                        <p class="p">The functions for pseudorandom sequences support bit generation and generation from distributions.</p>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-us" id="bit-generation-1"><a name="bit-generation-1" shape="rect">
                           <!-- --></a><h4 class="topictitle4">3.1.1.&nbsp;Bit Generation with XORWOW and MRG32k3a generators</h4>
                        <div class="body conbody"><pre xml:space="preserve">
__device__ unsigned int
curand (curandState_t *state)
</pre><p class="p">Following a call to <samp class="ph codeph">curand_init()</samp>, <samp class="ph codeph">curand()</samp> returns a sequence of pseudorandom numbers with a period greater than 
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <msup>
                                    <mn>2</mn>
                                    <mn>190</mn>
                                 </msup>
                              </math>. If <samp class="ph codeph">curand()</samp> is called with the same initial state each time, and the state is not modified between the calls to <samp class="ph codeph">curand()</samp>, the same sequence is always generated.
                           </p><pre xml:space="preserve">
__device__ void
curand_init (
    unsigned long long seed, unsigned long long sequence,
    unsigned long long offset, curandState_t *state)
</pre><p class="p">The <samp class="ph codeph">curand_init()</samp> function sets up an initial state allocated by the caller using the given seed, sequence number, and offset within the sequence.
                              Different seeds are guaranteed to produce different starting states and different sequences. The same seed always produces
                              the same state and the same sequence. The state set up will be the state after 
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <msup>
                                    <mn>2</mn>
                                    <mn>67</mn>
                                 </msup>
                              </math> ⋅ <samp class="ph codeph">sequence</samp> + <samp class="ph codeph">offset</samp> calls to <samp class="ph codeph">curand()</samp> from the seed state.
                           </p>
                           <p class="p">Sequences generated with different seeds usually do not have statistically correlated values, but some choices of seeds may
                              give statistically correlated sequences. Sequences generated with the same seed and different sequence numbers will not have
                              statistically correlated values.
                           </p>
                           <p class="p">For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within
                              an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel
                              launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned
                              in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global
                              memory between launches to avoid state setup time.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-us" id="bit-generation-2"><a name="bit-generation-2" shape="rect">
                           <!-- --></a><h4 class="topictitle4">3.1.2.&nbsp;Bit Generation with the MTGP32 generator</h4>
                        <div class="body conbody">
                           <p class="p">The MTGP32 generator is an adaptation of code developed at Hiroshima
                              University (see&nbsp;<a class="xref" href="bibliography.html#bibliography__saito" shape="rect">[1]</a>).  In this
                              algorithm, samples are generated for multiple sequences, each sequence
                              based on a set of computed parameters. cuRAND uses the 200 parameter sets
                              that have been pre-generated for the 32-bit generator with period
                              2<sup class="ph sup">11214</sup>. It would be possible to generate other parameter sets,
                              as described in&nbsp;<a class="xref" href="bibliography.html#bibliography__saito" shape="rect">[1]</a>, and use
                              those instead. There is one state structure for each parameter set
                              (sequence), and the algorithm allows thread-safe generation and state
                              update for up to 256 concurrent threads (within a single block) for each
                              of the 200 sequences.
                           </p>
                           <p class="p">Note that two different blocks can not operate on the same state safely.
                              Also note that, within a block, at most 256 threads may operate on a
                              given state.
                           </p>
                           <p class="p">For the MTGP32 generator, two host functions are provided to help set up
                              parameters for the different sequences in device memory, and to set up
                              the initial state.
                           </p><pre xml:space="preserve">__host__ curandStatust curandMakeMTGP32Constants(mtgp32paramsfastt params[],
                                                 mtgp32kernelparamst *p)</pre><p class="p">This function reorganizes the parameter set data from the pre-generated
                              format (<samp class="ph codeph">mtgp32_params_fast_t</samp>) into the format used by
                              the kernel functions (<samp class="ph codeph">mtgp32_kernel_params_t</samp>), and
                              copies them to device memory.
                           </p><pre xml:space="preserve">__host__ curandStatus_t
curandMakeMTGP32KernelState(curandStateMtgp32_t *s, 
                            mtgp32_params_fast_t params[], 
                            mtgp32_kernel_params_t *k,
                            int n,
                            unsigned long long seed)</pre><p class="p">This function initializes <samp class="ph codeph">n</samp> states, based on the
                              specified parameter set and seed, and copies them to device memory
                              indicated by <samp class="ph codeph">s</samp>. Note that if you are using the
                              pre-generated states, the maximum value of <samp class="ph codeph">n</samp> is 200.
                           </p>
                           <p class="p">The cuRAND MTGP32 generator provides two kernel functions to generate
                              random bits.
                           </p><pre xml:space="preserve">__device__ unsigned int
curand (curandStateMtgp32_t *state)</pre><p class="p">This function computes a thread index, and for that index generates a
                              result and updates state. The thread index <samp class="ph codeph">t</samp> is computed
                              as:
                           </p>
                           <p class="p"><samp class="ph codeph">t= (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x *
                                 threadIdx.y) + threadIdx.x</samp></p>
                           <p class="p">This function may be called repeatedly from a single kernel launch, with
                              the following constraints:
                           </p>
                           <ul class="ul">
                              <li class="li">It may only be called safely from a block that has 256 or fewer
                                 threads.
                              </li>
                              <li class="li">A given state may not be used by more than one block.</li>
                              <li class="li">A given block may generate randoms using multiple states.</li>
                           </ul><pre xml:space="preserve">__device__ unsigned int
curandmtgp32specific(curandStateMtgp32_t *state, unsigned char index,
                     unsigned char n)</pre><p class="p">This function generates a result and updates state for the position
                              specified by a thread-specific <samp class="ph codeph">index</samp>, and advances the
                              offset in the state by <samp class="ph codeph">n</samp>
                              positions.<samp class="ph codeph">curand_mtgp32_specific</samp> may be called multiple
                              times within a kernel launch, with the following constraints:
                           </p>
                           <ul class="ul">
                              <li class="li">At most 256 threads may call this function for a given state.</li>
                              <li class="li">Within a block, for a given state, if <samp class="ph codeph">n</samp> threads
                                 are calling the function, the indices must run from
                                 <samp class="ph codeph">0...n-1</samp>.  The indices do not have to match the thread
                                 numbers, and may be distributed among the threads as required by the
                                 calling program.
                              </li>
                              <li class="li">A given state may not be used by more than one block.</li>
                              <li class="li">A given block may generate randoms using multiple states.</li>
                           </ul>
                           <div class="fig fignone" id="bit-generation-2__mtgppickup"><a name="bit-generation-2__mtgppickup" shape="rect">
                                 <!-- --></a><span class="figcap">Figure 1. MTGP32 Block and Thread Operation</span><br clear="none"></br><div class="imagecenter"><img class="image imagecenter" src="graphics/mtgppickup.png" alt="Illustration of MTGP32 progressing through its state array"></img></div><br clear="none"></br></div>
                           <p class="p"><a class="xref" href="device-api-overview.html#bit-generation-2__mtgppickup" shape="rect">Figure 1</a> is an illustration of how
                              blocks and threads in MTGP32 operate on the generator states. Each row
                              represents a circular state array of 32-bit integers
                              <samp class="ph codeph">s(n)</samp>. Threads operating on the array are identified as
                              <samp class="ph codeph">T(m)</samp>. The specific case shown matches the internal
                              implementation of the host API, which launches 64 blocks of 256 threads.
                              Each block operates on a different sequence, determined by a unique set
                              of parameters, <samp class="ph codeph">P(n)</samp>. One complete state of an MTGP32
                              sequence is defined by 351 32-bit integers. Each thread
                              <samp class="ph codeph">T(m)</samp> operates on one of these integers,
                              <samp class="ph codeph">s(n+m)</samp> combining it with <samp class="ph codeph">s(n+m+1)</samp> and a
                              pickup element <samp class="ph codeph">s(n+m+p)</samp>, where <samp class="ph codeph">p &lt;=
                                 95</samp>. It stores the new state at position
                              <samp class="ph codeph">s(n+m+351)</samp> in the state array. After thread
                              synchronization, the base index <samp class="ph codeph">n</samp> is advanced by the
                              number of threads that have updated the state. To avoid being
                              overwritten, the array itself must be at least 256 + 351 integers in
                              length. In fact it is sized at 1024 integers for efficiency of
                              indexing.
                           </p>
                           <p class="p">
                              The limitation on the number of threads in a block, which can operate on
                              a given state array, arises from the need to ensure that state
                              <samp class="ph codeph">s(n+351)</samp> has been updated before it is needed as a
                              pickup state. If there were a thread <samp class="ph codeph">T(256)</samp>, it could
                              use <samp class="ph codeph">s(n+256+95)</samp> i.e. <samp class="ph codeph">s(n+351)</samp> before
                              thread zero has updated <samp class="ph codeph">s(n+351)</samp>. If an application
                              requires that more than 256 threads in a block invoke an MTGP32 generator
                              function, it must use multiple MTGP32 states, either by using multiple
                              parameter sets, or by using multiple generators with different seeds.
                              Also note that the generator functions synchronize threads at the end of
                              each call, so it is most efficient for 256 threads in a block to invoke
                              the generator.
                              
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-us" id="bit-generation-3"><a name="bit-generation-3" shape="rect">
                           <!-- --></a><h4 class="topictitle4">3.1.3.&nbsp;Bit Generation with Philox_4x32_10 generator</h4>
                        <div class="body conbody"><pre xml:space="preserve">
__device__ unsigned int
curand (curandState_t *state)
</pre><p class="p">Following a call to <samp class="ph codeph">curand_init()</samp>, <samp class="ph codeph">curand()</samp> returns a sequence of pseudorandom numbers with a period 
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <msup>
                                    <mn>2</mn>
                                    <mn>128</mn>
                                 </msup>
                              </math>. If <samp class="ph codeph">curand()</samp> is called with the same initial state each time, and the state is not modified between the calls to <samp class="ph codeph">curand()</samp>, the same sequence is always generated.
                           </p><pre xml:space="preserve">
__device__ void
curand_init (
    unsigned long long seed, unsigned long long subsequence,
    unsigned long long offset, curandState_t *state)
</pre><p class="p">The <samp class="ph codeph">curand_init()</samp> function sets up an initial state allocated by the caller using the given seed, subsequence and offset. Different seed is
                              guaranteed to produce different starting states and different sequences. Subsequence and offset together define offset in
                              a sequence with period 
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <msup>
                                    <mn>2</mn>
                                    <mn>128</mn>
                                 </msup>
                              </math>. Offset defines offset in subsequence of length 
                              <math xmlns="http://www.w3.org/1998/Math/MathML">
                                 <msup>
                                    <mn>2</mn>
                                    <mn>64</mn>
                                 </msup>
                              </math>. When last element from subsequence was generated, then the next random number is first element from consecutive subsequence.
                              The same seed always produces the same state and the same sequence.
                           </p>
                           <p class="p">Sequences generated with different seeds usually do not have statistically correlated values, but some choices of seeds may
                              give statistically correlated sequences.
                           </p>
                           <p class="p">For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed value. Within
                              an experiment, each thread of computation should be assigned a unique id number. If an experiment spans multiple kernel launches,
                              it is recommended that threads between kernel launches be given the same seed, and id numbers be assigned in a monotonically
                              increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches
                              to avoid state setup time.
                           </p>
                        </div>
                     </div>
                     <div class="topic concept nested2" xml:lang="en-us" id="distributions"><a name="distributions" shape="rect">
                           <!-- --></a><h4 class="topictitle4">3.1.4.&nbsp;Distributions</h4>
                        <div class="body conbody"><pre xml:space="preserve">__device__ float 
curand_uniform (curandState_t *state)</pre><p class="p">This function returns a sequence of pseudorandom floats uniformly distributed between 0.0 and 1.0. It may return from 0.0
                              to 1.0, where 1.0 is included and 0.0 is excluded. Distribution functions may use any number of unsigned integer values from
                              a basic generator. The number of values consumed is not guaranteed to be fixed.
                           </p><pre xml:space="preserve">__device__ float 
curand_normal (curandState_t *state)</pre><p class="p">This function returns a single normally distributed float with mean 0.0 and standard deviation 1.0. This result can be scaled
                              and shifted to produce normally distributed values with any mean and standard deviation.
                           </p><pre xml:space="preserve">__device__ float 
curand_log_normal (curandState_t *state, float mean, float stddev)</pre><p class="p">This function returns a single log-normally distributed float based on a normal distribution with the given mean and standard
                              deviation.
                           </p><pre xml:space="preserve">__device__ unsigned int 
curand_poisson (curandState_t *state, double lambda)</pre><p class="p">This function returns a single Poisson-distributed unsigned int based on a Poisson distribution with the given lambda. The
                              algorithm used to derive a Poisson result from a uniformly distributed result varies depending on the value of lambda and
                              the type of generator. Some algorithms draw more than one sample for a single output. Also note that this distribuition requires
                              pre-processing on the host. See the description of <samp class="ph codeph">curandCreatePoissonDistribution()</samp> below.
                           </p><pre xml:space="preserve">__device__ double 
curand_uniform_double (curandState_t *state)</pre><pre xml:space="preserve">__device__ double 
curand_normal_double (curandState_t *state)</pre><pre xml:space="preserve">__device__ double 
curand_log_normal_double (curandState_t *state, double mean, double stddev)</pre><p class="p">The three functions above are the double precision versions of <samp class="ph codeph">curand_uniform()</samp>, <samp class="ph codeph">curand_normal()</samp>, and <samp class="ph codeph">curand_log_normal()</samp>.
                           </p>
                           <p class="p">For pseudorandom generators, the double precision functions use multiple calls to <samp class="ph codeph">curand()</samp> to generate 53 random bits.
                           </p><pre xml:space="preserve">__device__ float2 
curand_normal2 (curandState_t *state)</pre><pre xml:space="preserve">__device__ float2 
curand_log_normal2 (curandState_t *state)</pre><pre xml:space="preserve">__device__ double2 
curand_normal2_double (curandState_t *state)</pre><pre xml:space="preserve">__device__ double2 
curand_log_normal2_double (curandState_t *state)</pre><p class="p">The above functions generate two normally or log normally distributed pseudorandom results with each call. Because the underlying
                              implementation uses the Box-Muller transform, this is generally more efficient than generating a single result with each call.
                           </p><pre xml:space="preserve">__device__ uint4 
curand4 (curandStatePhilox4_32_10_t *state)</pre><pre xml:space="preserve">__device__ float4 
curand_uniform4 (curandStatePhilox4_32_10_t *state)</pre><pre xml:space="preserve">__device__ float4 
curand_normal4 (curandStatePhilox4_32_10_t *state)</pre><pre xml:space="preserve">__device__ float4 
curand_log_normal4 (curandStatePhilox4_32_10_t *state, float mean, float stddev)</pre><pre xml:space="preserve">__device__ uint4 
curand_poisson4 (curandStatePhilox4_32_10_t *state, double lambda)</pre><pre xml:space="preserve">__device__ uint4 
curand_discrete4 (curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)</pre><pre xml:space="preserve">__device__ double2
curand_uniform2_double (curandStatePhilox4_32_10_t *state)</pre><pre xml:space="preserve">__device__ double2
curand_normal2_double (curandStatePhilox4_32_10_t *state)</pre><pre xml:space="preserve">__device__ double2
curand_log_normal2_double (curandStatePhilox4_32_10_t *state, double mean, double stddev)</pre><p class="p">The above functions generate four single precision or two double precision results with each call. Because the underlying
                              implementation uses the Philox generator, this is generally more efficient than generating a single result with each call.
                           </p>
                        </div>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="quasirandom-sequences"><a name="quasirandom-sequences" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.2.&nbsp;Quasirandom Sequences</h3>
                     <div class="body conbody">
                        <p class="p">Although the default generator type is pseudorandom numbers from XORWOW, Sobol’ sequences based on Sobol’ 32-bit integers
                           can be generated using the following functions:
                        </p><pre xml:space="preserve">__device__ void
curand_init (
    unsigned int *direction_vectors, 
    unsigned int offset,
    curandStateSobol32_t *state)</pre><pre xml:space="preserve">__device__ void
curand_init (
    unsigned int *direction_vectors,
    unsigned int scramble_c, 
    unsigned int offset,
    curandStateScrambledSobol32_t *state)</pre><pre xml:space="preserve">__device__ unsigned int
curand (curandStateSobol32_t *state)</pre><pre xml:space="preserve">__device__ float
curand_uniform (curandStateSobol32_t *state)</pre><pre xml:space="preserve">__device__ float 
curand_normal (curandStateSobol32_t *state)</pre><pre xml:space="preserve">__device__ float 
curand_log_normal (
    curandStateSobol32_t *state,  
    float mean, 
    float stddev)</pre><pre xml:space="preserve">__device__ unsigned int 
curand_poisson (curandStateSobol32_t *state, double lambda)</pre><pre xml:space="preserve">__device__ double
curand_uniform_double (curandStateSobol32_t *state)</pre><pre xml:space="preserve">__device__ double
curand_normal_double (curandStateSobol32_t *state)</pre><pre xml:space="preserve">__device__ double
curand_log_normal_double (
    curandStateSobol32_t *state, 
    double mean, 
    double stddev)</pre><p class="p">The <samp class="ph codeph">curand_init()</samp> function initializes the quasirandom number generator state. There is no seed parameter, only direction vectors and offset.
                           For scrambled Sobol’ generators, there is an additional parameter <samp class="ph codeph">scramble_c</samp>, which is the initial value of the scrambled sequence. For the <samp class="ph codeph">curandStateSobol32_t</samp> type and the <samp class="ph codeph">curandStateScrambledSobol32_t</samp> type the direction vectors are an array of 32 unsigned integer values. For the <samp class="ph codeph">curandStateSobol64_t</samp> type and the <samp class="ph codeph">curandStateScrambledSobol64_t</samp> type the direction vectors are an array of 64 unsigned long long values. Offsets and initial constants for the scrambled
                           sequence are of type unsigned int for 32-bit Sobol’ generators. These parameters are of type unsigned long long for 64-bit
                           Soblol’ generators. For the <samp class="ph codeph">curandStateSobol32_t</samp> type and the <samp class="ph codeph">curandStateScrambledSobol32_t</samp> type the sequence is exactly 
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <msup>
                                 <mn>2</mn>
                                 <mn>32</mn>
                              </msup>
                           </math> elements long where each element is 32 bits. For the <samp class="ph codeph">curandStateSobol64_t</samp> type and the <samp class="ph codeph">curandStateScrambledSobol64_t</samp> type the sequence is exactly 
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <msup>
                                 <mn>2</mn>
                                 <mn>64</mn>
                              </msup>
                           </math> elements long where each element is 64 bits. Each call to <samp class="ph codeph">curand()</samp> returns the next quasirandom element. Calls to <samp class="ph codeph">curand_uniform()</samp> return quasirandom floats or doubles from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded. Similarly, calls to <samp class="ph codeph">curand_normal()</samp> return normally distributed floats or doubles with mean 0.0 and standard deviation 1.0. Calls to <samp class="ph codeph">curand_log_normal()</samp> return log-normally distributed floats or doubles, derived from the normal distribution with the specified mean and standard
                           deviation. All of the generation functions may be called with any type of Sobol’ generator.
                        </p>
                        <p class="p">As an example, generating quasirandom coordinates that fill a unit cube requires keeping track of three quasirandom generators.
                           All three would start at <samp class="ph codeph">offset</samp> = 0 and would have dimensions 0, 1, and 2, respectively. A single call to <samp class="ph codeph">curand_uniform()</samp> for each generator state would generate the 
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mi>x</mi>
                           </math>, 
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mi>y</mi>
                           </math>, and 
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mi>z</mi>
                           </math> coordinates. Tables of direction vectors are accessible on the host through the <samp class="ph codeph">curandGetDirectionVectors32()</samp> and <samp class="ph codeph">curandGetDirectionVectors64()</samp> functions. The direction vectors needed should be copied into device memory before use.
                        </p>
                        <p class="p">The normal distribution functions for quasirandom generation use the inverse cumulative density function to preserve the dimensionality
                           of the quasirandom sequence. Therefore there are no functions that generate more than one result at a time as there are with
                           the pseudorandom generators.
                        </p>
                        <p class="p">The double precision Sobol32 functions return results in double precision that use 32 bits of internal precision from the
                           underlying generator.
                        </p>
                        <p class="p">The double precision Sobol64 functions return results in double precision that use 53 bits of internal precision from the
                           underlying generator. These bits are taken from the high order 53 bits of the 64 bit samples.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="skip-ahead"><a name="skip-ahead" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.3.&nbsp;Skip-Ahead</h3>
                     <div class="body conbody">
                        <p class="p">There are several functions to skip ahead from a generator state.</p><pre xml:space="preserve">__device__ void 
skipahead (unsigned long long n, curandState_t *state)</pre><pre xml:space="preserve">__device__ void 
skipahead (unsigned int n, curandStateSobol32_t *state)</pre><p class="p">Using this function is equivalent to calling <samp class="ph codeph">curand() </samp><math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mi>n</mi>
                           </math> times without using the return value, but it is much faster.
                        </p><pre xml:space="preserve">__device__ void 
skipaheadsequence (unsigned long long n, curandState_t *state)</pre><p class="p">This function is the equivalent of calling <samp class="ph codeph">curand() </samp><math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mi>n</mi>
                              <mo>⋅</mo>
                              <msup>
                                 <mn>2</mn>
                                 <mn>67</mn>
                              </msup>
                           </math> times without using the return value and is much faster.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="device-api-for-discrete-distributions"><a name="device-api-for-discrete-distributions" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.4.&nbsp;Device API for discrete distributions</h3>
                     <div class="body conbody">
                        <p class="p">Discrete distributions, such as the Poisson distribution, require additional API’s that perform preprocessing on HOST side
                           to generate a histogram for the specific distribution. In the case of the Poisson distribution this histogram is different
                           for different values of lambda. Best performance for these distributions will be seen on GPUs with at least 48KB of L1 cache.
                        </p><pre xml:space="preserve">curandStatus_t 
curandCreatePoissonDistribution(
    double lambda, 
    curandDiscreteDistribution_t *discrete_distribution)</pre><p class="p">The <samp class="ph codeph">curandCreatePoissonDistribution()</samp> function is used to create a histogram for the Poisson distribution with the given lambda.
                        </p><pre xml:space="preserve">__device__ unsigned int 
curand_discrete (
    curandState_t *state,
    curandDiscreteDistribution_t discrete_distribution)</pre><p class="p">This function returns a single discrete distributed unsigned int based on a distribution for the given discrete distribution
                           histogram.
                        </p><pre xml:space="preserve">curandStatus_t 
curandDestroyDistribution(
    curandDiscreteDistribution_t discrete_distribution)</pre><p class="p">The <samp class="ph codeph">curandDestroyDistribution()</samp> function is used to clean up structures related to the histogram.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="performance-notes"><a name="performance-notes" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.5.&nbsp;Performance Notes</h3>
                     <div class="body conbody">
                        <p class="p">Calls to <samp class="ph codeph">curand_init()</samp> are slower than calls to <samp class="ph codeph">curand()</samp> or <samp class="ph codeph">curand_uniform()</samp>. Large offsets to <samp class="ph codeph">curand_init()</samp> take more time than smaller offsets. It is much faster to save and restore random generator state than to recalculate the
                           starting state repeatedly.
                        </p>
                        <p class="p">As shown below, generator state can be stored in global memory between kernel launches, used in local memory for fast generation,
                           and then stored back into global memory.
                        </p><pre xml:space="preserve">
__global__ void example(curandState *global_state)
{
    curandState local_state;
    local_state = global_state[threadIdx.x];
    for(int i = 0; i &lt; 10000; i++) {
        unsigned int x = curand(&amp;local_state);
        ...
    }
    global_state[threadIdx.x] = local_state;
}
</pre><p class="p">Initialization of the random generator state generally requires more registers and local memory than random number generation.
                           It may be beneficial to separate calls to <samp class="ph codeph">curand_init()</samp> and <samp class="ph codeph">curand()</samp> into separate kernels for maximum performance.
                        </p>
                        <p class="p">State setup can be an expensive operation. One way to speed up the setup is to use different seeds for each thread and a constant
                           sequence number of 0. This can be especially helpful if many generators need to be created. While faster to set up, this method
                           provides less guarantees about the mathematical properties of the generated sequences. If there happens to be a bad interaction
                           between the hash function that initializes the generator state from the seed and the periodicity of the generators, there
                           might be threads with highly correlated outputs for some seed values. We do not know of any problem values; if they do exist
                           they are likely to be rare.
                        </p>
                     </div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="device-api-example"><a name="device-api-example" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.6.&nbsp;Device API Examples</h3>
                     <div class="body conbody">
                        <p class="p">This example uses the cuRAND device API to generate pseudorandom numbers using either the XORWOW or MRG32k3a generators. For
                           integers, it calculates the proportion that have the low bit set. For uniformly distributed real numbers, it calculates the
                           proportion that are greater than 0.5. For normally distributed real numbers, it calculates the proportion that are within
                           one standard deviation of the mean.
                        </p><pre xml:space="preserve">
/*
 * This program uses the device CURAND API to calculate what 
 * proportion of pseudo-random ints have low bit set.
 * It then generates uniform results to calculate how many
 * are greater than .5.
 * It then generates  normal results to calculate how many 
 * are within one standard deviation of the mean.
 */
#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;

#include &lt;cuda.h&gt;
#include &lt;curand_kernel.h&gt;

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

__global__ void setup_kernel(curandState *state)
{
    int id = threadIdx.x + blockIdx.x * 64;
    /* Each thread gets same seed, a different sequence 
       number, no offset */
    curand_init(1234, id, 0, &amp;state[id]);
}

__global__ void setup_kernel(curandStatePhilox4_32_10_t *state)
{
    int id = threadIdx.x + blockIdx.x * 64;
    /* Each thread gets same seed, a different sequence 
       number, no offset */
    curand_init(1234, id, 0, &amp;state[id]);
}

__global__ void setup_kernel(curandStateMRG32k3a *state)
{
    int id = threadIdx.x + blockIdx.x * 64;
    /* Each thread gets same seed, a different sequence 
       number, no offset */
    curand_init(0, id, 0, &amp;state[id]);
}

__global__ void generate_kernel(curandState *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    int count = 0;
    unsigned int x;
    /* Copy state to local memory for efficiency */
    curandState localState = state[id];
    /* Generate pseudo-random unsigned ints */
    for(int i = 0; i &lt; n; i++) {
        x = curand(&amp;localState);
        /* Check if low bit set */
        if(x &amp; 1) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_kernel(curandStatePhilox4_32_10_t *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    int count = 0;
    unsigned int x;
    /* Copy state to local memory for efficiency */
    curandStatePhilox4_32_10_t localState = state[id];
    /* Generate pseudo-random unsigned ints */
    for(int i = 0; i &lt; n; i++) {
        x = curand(&amp;localState);
        /* Check if low bit set */
        if(x &amp; 1) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_uniform_kernel(curandState *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    float x;
    /* Copy state to local memory for efficiency */
    curandState localState = state[id];
    /* Generate pseudo-random uniforms */
    for(int i = 0; i &lt; n; i++) {
        x = curand_uniform(&amp;localState);
        /* Check if &gt; .5 */
        if(x &gt; .5) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_uniform_kernel(curandStatePhilox4_32_10_t *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    float x;
    /* Copy state to local memory for efficiency */
    curandStatePhilox4_32_10_t localState = state[id];
    /* Generate pseudo-random uniforms */
    for(int i = 0; i &lt; n; i++) {
        x = curand_uniform(&amp;localState);
        /* Check if &gt; .5 */
        if(x &gt; .5) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_normal_kernel(curandState *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    float2 x;
    /* Copy state to local memory for efficiency */
    curandState localState = state[id];
    /* Generate pseudo-random normals */
    for(int i = 0; i &lt; n/2; i++) {
        x = curand_normal2(&amp;localState);
        /* Check if within one standard deviaton */
        if((x.x &gt; -1.0) &amp;&amp; (x.x &lt; 1.0)) {
            count++;
        }
        if((x.y &gt; -1.0) &amp;&amp; (x.y &lt; 1.0)) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_normal_kernel(curandStatePhilox4_32_10_t *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    float2 x;
    /* Copy state to local memory for efficiency */
    curandStatePhilox4_32_10_t localState = state[id];
    /* Generate pseudo-random normals */
    for(int i = 0; i &lt; n/2; i++) {
        x = curand_normal2(&amp;localState);
        /* Check if within one standard deviaton */
        if((x.x &gt; -1.0) &amp;&amp; (x.x &lt; 1.0)) {
            count++;
        }
        if((x.y &gt; -1.0) &amp;&amp; (x.y &lt; 1.0)) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_kernel(curandStateMRG32k3a *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    unsigned int x;
    /* Copy state to local memory for efficiency */
    curandStateMRG32k3a localState = state[id];
    /* Generate pseudo-random unsigned ints */
    for(int i = 0; i &lt; n; i++) {
        x = curand(&amp;localState);
        /* Check if low bit set */
        if(x &amp; 1) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_uniform_kernel(curandStateMRG32k3a *state,
                                int n, 
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    double x;
    /* Copy state to local memory for efficiency */
    curandStateMRG32k3a localState = state[id];
    /* Generate pseudo-random uniforms */
    for(int i = 0; i &lt; n; i++) {
        x = curand_uniform_double(&amp;localState);
        /* Check if &gt; .5 */
        if(x &gt; .5) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

__global__ void generate_normal_kernel(curandStateMRG32k3a *state, 
                                int n,
                                unsigned int *result)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int count = 0;
    double2 x;
    /* Copy state to local memory for efficiency */
    curandStateMRG32k3a localState = state[id];
    /* Generate pseudo-random normals */
    for(int i = 0; i &lt; n/2; i++) {
        x = curand_normal2_double(&amp;localState);
        /* Check if within one standard deviaton */
        if((x.x &gt; -1.0) &amp;&amp; (x.x &lt; 1.0)) {
            count++;
        }
        if((x.y &gt; -1.0) &amp;&amp; (x.y &lt; 1.0)) {
            count++;
        }
    }
    /* Copy state back to global memory */
    state[id] = localState;
    /* Store results */
    result[id] += count;
}

int main(int argc, char *argv[])
{
    int i;
    unsigned int total;
    curandState *devStates;
    curandStateMRG32k3a *devMRGStates;
    curandStatePhilox4_32_10_t *devPHILOXStates;
    unsigned int *devResults, *hostResults;
    bool useMRG = 0;
    bool usePHILOX = 0;
    int sampleCount = 10000;
    bool doubleSupported = 0;
    int device;
    struct cudaDeviceProp properties;  

    /* check for double precision support */
    CUDA_CALL(cudaGetDevice(&amp;device));
    CUDA_CALL(cudaGetDeviceProperties(&amp;properties,device));
    if ( properties.major &gt;= 2 || (properties.major == 1 &amp;&amp; properties.minor &gt;= 3) ) {
        doubleSupported = 1;
    }
    
    /* Check for MRG32k3a option (default is XORWOW) */
    if (argc &gt;= 2)  {
        if (strcmp(argv[1],"-m") == 0) {
            useMRG = 1;
            if (!doubleSupported){
                printf("MRG32k3a requires double precision\n");
                printf("^^^^ test WAIVED due to lack of double precision\n");
                return EXIT_SUCCESS;
            }
        }else if (strcmp(argv[1],"-p") == 0) {
		usePHILOX = 1;
	} 
        /* Allow over-ride of sample count */    
        sscanf(argv[argc-1],"%d",&amp;sampleCount); 
    }

    /* Allocate space for results on host */
    hostResults = (unsigned int *)calloc(64 * 64, sizeof(int));

    /* Allocate space for results on device */
    CUDA_CALL(cudaMalloc((void **)&amp;devResults, 64 * 64 * 
              sizeof(unsigned int)));

    /* Set results to 0 */
    CUDA_CALL(cudaMemset(devResults, 0, 64 * 64 * 
              sizeof(unsigned int)));

    /* Allocate space for prng states on device */
    if (useMRG) {
        CUDA_CALL(cudaMalloc((void **)&amp;devMRGStates, 64 * 64 * 
                  sizeof(curandStateMRG32k3a)));
    }else if(usePHILOX) {
        CUDA_CALL(cudaMalloc((void **)&amp;devPHILOXStates, 64 * 64 * 
                  sizeof(curandStatePhilox4_32_10_t)));
    }else {
        CUDA_CALL(cudaMalloc((void **)&amp;devStates, 64 * 64 * 
                  sizeof(curandState)));
    }
    
    /* Setup prng states */
    if (useMRG) {
        setup_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devMRGStates);
    }else if(usePHILOX)
    {
        setup_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devPHILOXStates);
    }else {
        setup_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates);
    }
    
    /* Generate and use pseudo-random  */
    for(i = 0; i &lt; 50; i++) {
        if (useMRG) {
            generate_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devMRGStates, sampleCount, devResults);
        }else if (usePHILOX){
            generate_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devPHILOXStates, sampleCount, devResults);
	}else {
            generate_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates, sampleCount, devResults);
        }
    }
    
    /* Copy device memory to host */
    CUDA_CALL(cudaMemcpy(hostResults, devResults, 64 * 64 * 
        sizeof(unsigned int), cudaMemcpyDeviceToHost));

    /* Show result */
    total = 0;
    for(i = 0; i &lt; 64 * 64; i++) {
        total += hostResults[i];
    }
    printf("Fraction with low bit set was %10.13f\n", 
        (float)total / (64.0f * 64.0f * sampleCount * 50.0f));
        
    /* Set results to 0 */
    CUDA_CALL(cudaMemset(devResults, 0, 64 * 64 * 
              sizeof(unsigned int)));

    /* Generate and use uniform pseudo-random  */
    for(i = 0; i &lt; 50; i++) {
        if (useMRG) {
            generate_uniform_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devMRGStates, sampleCount, devResults);
        }else if(usePHILOX) {
            generate_uniform_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devPHILOXStates, sampleCount, devResults);
	}else {
            generate_uniform_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates, sampleCount, devResults);
        }
    }

    /* Copy device memory to host */
    CUDA_CALL(cudaMemcpy(hostResults, devResults, 64 * 64 * 
        sizeof(unsigned int), cudaMemcpyDeviceToHost));

    /* Show result */
    total = 0;
    for(i = 0; i &lt; 64 * 64; i++) {
        total += hostResults[i];
    }
    printf("Fraction of uniforms &gt; 0.5 was %10.13f\n", 
        (float)total / (64.0f * 64.0f * sampleCount * 50.0f));
    /* Set results to 0 */
    CUDA_CALL(cudaMemset(devResults, 0, 64 * 64 * 
              sizeof(unsigned int)));

    /* Generate and use normal pseudo-random  */
    for(i = 0; i &lt; 50; i++) {
        if (useMRG) {
            generate_normal_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devMRGStates, sampleCount, devResults);
        }else if(usePHILOX) {
            generate_normal_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devPHILOXStates, sampleCount, devResults);
	}else {
            generate_normal_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates, sampleCount, devResults);
        }
    }

    /* Copy device memory to host */
    CUDA_CALL(cudaMemcpy(hostResults, devResults, 64 * 64 * 
        sizeof(unsigned int), cudaMemcpyDeviceToHost));

    /* Show result */
    total = 0;
    for(i = 0; i &lt; 64 * 64; i++) {
        total += hostResults[i];
    }
    printf("Fraction of normals within 1 standard deviation was %10.13f\n", 
        (float)total / (64.0f * 64.0f * sampleCount * 50.0f));

    /* Cleanup */
    if (useMRG) {
        CUDA_CALL(cudaFree(devMRGStates));
    }else if(usePHILOX)
    {
        CUDA_CALL(cudaFree(devPHILOXStates));
    }else {
        CUDA_CALL(cudaFree(devStates));
    }    
    CUDA_CALL(cudaFree(devResults));
    free(hostResults);
    printf("^^^^ kernel_example PASSED\n");
    return EXIT_SUCCESS;
}
</pre><p class="p">The following example uses the cuRAND host MTGP setup API, and the cuRAND device API, to generate integers using the MTGP32
                           generator, and calculates the proportion that have the low bit set.
                        </p><pre xml:space="preserve">
/*
 * This program uses the device CURAND API to calculate what 
 * proportion of pseudo-random ints have low bit set.
 */
#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;cuda.h&gt;
#include &lt;curand_kernel.h&gt;
/* include MTGP host helper functions */
#include &lt;curand_mtgp32_host.h&gt;
/* include MTGP pre-computed parameter sets */
#include &lt;curand_mtgp32dc_p_11213.h&gt;


#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

__global__ void generate_kernel(curandStateMtgp32 *state, 
                                int n,
                                int *result)
{
    int id = threadIdx.x + blockIdx.x * 256;
    int count = 0;
    unsigned int x;
    /* Generate pseudo-random unsigned ints */
    for(int i = 0; i &lt; n; i++) {
        x = curand(&amp;state[blockIdx.x]);
        /* Check if low bit set */
        if(x &amp; 1) {
            count++;
        }
    }
    /* Store results */
    result[id] += count;
}

int main(int argc, char *argv[])
{
    int i;
    long long total;
    curandStateMtgp32 *devMTGPStates;
    mtgp32_kernel_params *devKernelParams;
    int *devResults, *hostResults;
    int sampleCount = 10000;
    
    /* Allow over-ride of sample count */    
    if (argc == 2) {
        sscanf(argv[1],"%d",&amp;sampleCount);
    }
        
    /* Allocate space for results on host */
    hostResults = (int *)calloc(64 * 256, sizeof(int));

    /* Allocate space for results on device */
    CUDA_CALL(cudaMalloc((void **)&amp;devResults, 64 * 256 * 
              sizeof(int)));

    /* Set results to 0 */
    CUDA_CALL(cudaMemset(devResults, 0, 64 * 256 * 
              sizeof(int)));

    /* Allocate space for prng states on device */
    CUDA_CALL(cudaMalloc((void **)&amp;devMTGPStates, 64 * 
              sizeof(curandStateMtgp32)));
    
    /* Setup MTGP prng states */
    
    /* Allocate space for MTGP kernel parameters */
    CUDA_CALL(cudaMalloc((void**)&amp;devKernelParams, sizeof(mtgp32_kernel_params)));
    
    /* Reformat from predefined parameter sets to kernel format, */
    /* and copy kernel parameters to device memory               */
    CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
    
    /* Initialize one state per thread block */
    CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, 
                mtgp32dc_params_fast_11213, devKernelParams, 64, 1234));
    
    /* State setup is complete */
    
    /* Generate and use pseudo-random  */
    for(i = 0; i &lt; 10; i++) {
        generate_kernel&lt;&lt;&lt;64, 256&gt;&gt;&gt;(devMTGPStates, sampleCount, devResults);
    }

    /* Copy device memory to host */
    CUDA_CALL(cudaMemcpy(hostResults, devResults, 64 * 256 * 
        sizeof(int), cudaMemcpyDeviceToHost));

    /* Show result */
    total = 0;
    for(i = 0; i &lt; 64 * 256; i++) {
        total += hostResults[i];
    }
    
    
    printf("Fraction with low bit set was %10.13g\n", 
        (double)total / (64.0f * 256.0f * sampleCount * 10.0f));

    /* Cleanup */
    CUDA_CALL(cudaFree(devMTGPStates));
    CUDA_CALL(cudaFree(devResults));
    free(hostResults);
    printf("^^^^ kernel_mtgp_example PASSED\n");
    return EXIT_SUCCESS;
}
</pre></div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="thrust-and-curand-example"><a name="thrust-and-curand-example" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.7.&nbsp;Thrust and cuRAND Example</h3>
                     <div class="body conbody">
                        <p class="p">The following example demonstrates mixing cuRAND and Thrust. It is a minimally modified version of <samp class="ph codeph">monte_carlo.cu</samp>, one of the standard Thrust examples. The example estimates 
                           <math xmlns="http://www.w3.org/1998/Math/MathML">
                              <mi>π</mi>
                           </math> by randomly picking points in the unit square and calculating the distance to the origin to see if the points are in the
                           quarter unit circle.
                        </p><pre xml:space="preserve">#include &lt;thrust/iterator/counting_iterator.h&gt;
#include &lt;thrust/functional.h&gt;
#include &lt;thrust/transform_reduce.h&gt;
#include &lt;curand_kernel.h&gt;

#include &lt;iostream&gt;
#include &lt;iomanip&gt;

// we could vary M &amp; N to find the perf sweet spot

struct estimate_pi : 
    public thrust::unary_function&lt;unsigned int, float&gt;
{
  __device__
  float operator()(unsigned int thread_id)
  {
    float sum = 0;
    unsigned int N = 10000; // samples per thread

    unsigned int seed = thread_id;

    curandState s;

    // seed a random number generator
    curand_init(seed, 0, 0, &amp;s);

    // take N samples in a quarter circle
    for(unsigned int i = 0; i &lt; N; ++i)
    {
      // draw a sample from the unit square
      float x = curand_uniform(&amp;s);
      float y = curand_uniform(&amp;s);

      // measure distance from the origin
      float dist = sqrtf(x*x + y*y);

      // add 1.0f if (u0,u1) is inside the quarter circle
      if(dist &lt;= 1.0f)
        sum += 1.0f;
    }

    // multiply by 4 to get the area of the whole circle
    sum *= 4.0f;

    // divide by N
    return sum / N;
  }
};

int main(void)
{
  // use 30K independent seeds
  int M = 30000;

  float estimate = thrust::transform_reduce(
        thrust::counting_iterator&lt;int&gt;(0),
        thrust::counting_iterator&lt;int&gt;(M),
        estimate_pi(),
        0.0f,
        thrust::plus&lt;float&gt;());
  estimate /= M;

  std::cout &lt;&lt; std::setprecision(3);
  std::cout &lt;&lt; "pi is approximately ";
  std::cout &lt;&lt; estimate &lt;&lt; std::endl;
  return 0;
}

</pre></div>
                  </div>
                  <div class="topic concept nested1" xml:lang="en-us" id="poisson-api-example"><a name="poisson-api-example" shape="rect">
                        <!-- --></a><h3 class="topictitle3">3.8.&nbsp;Poisson API Example</h3>
                     <div class="body conbody">
                        <p class="p">This example shows the differences between the 3 API types for the Poisson distribution. It is a simulation of queues in a
                           store. The host API is the most robust for generating large vectors of Poisson-distributed random numbers. (i.e. it has the
                           best statistical properties across the full range of lambda values) The discrete Device API is almost as robust as the HOST
                           API and allows Poisson-distributed random numbers to be generated inside a kernel. The simple Device API is the least robust
                           but is more efficient when generating Poisson-distributed random numbers for many different lambdas.
                        </p><pre xml:space="preserve">/*
 * This program uses CURAND library for Poisson distribution
 * to simulate queues in store for 16 hours. It shows the
 * difference of using 3 different APIs:
 * - HOST API -arrival of customers is described by Poisson(4)
 * - SIMPLE DEVICE API -arrival of customers is described by
 *     Poisson(4*(sin(x/100)+1)), where x is number of minutes
 *     from store opening time.
 * - ROBUST DEVICE API -arrival of customers is described by:
 *     - Poisson(2) for first 3 hours.
 *     - Poisson(1) for second 3 hours.
 *     - Poisson(3) after 6 hours.  
 */

#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;cuda.h&gt;
#include &lt;curand_kernel.h&gt;
#include &lt;curand.h&gt;

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)


#define HOURS 16
#define OPENING_HOUR 7
#define CLOSING_HOUR (OPENING_HOUR + HOURS)

#define access_2D(type, ptr, row, column, pitch)\
    *((type*)((char*)ptr + (row) * pitch) + column)

enum API_TYPE {
    HOST_API = 0,
    SIMPLE_DEVICE_API = 1,
    ROBUST_DEVICE_API = 2,
};

/* global variables */
API_TYPE api;
int report_break;
int cashiers_load_h[HOURS];
__constant__ int cashiers_load[HOURS];

__global__ void setup_kernel(curandState *state)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    /* Each thread gets same seed, a different sequence 
       number, no offset */
    curand_init(1234, id, 0, &amp;state[id]);
}

__inline__ __device__
void update_queue(int id, int min, unsigned int new_customers,
                  unsigned int &amp;queue_length,
                  unsigned int *queue_lengths, size_t pitch)
{
    int balance;
    balance = new_customers - 2 * cashiers_load[(min-1)/60];
    if (balance + (int)queue_length &lt;= 0){
        queue_length = 0;
    }else{
        queue_length += balance;
    }
    /* Store results */
    access_2D(unsigned int, queue_lengths, min-1, id, pitch)
        = queue_length;
}


__global__ void simple_device_API_kernel(curandState *state, 
                    unsigned int *queue_lengths, size_t pitch)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int new_customers;
    unsigned int queue_length = 0;
    /* Copy state to local memory for efficiency */
    curandState localState = state[id];
    /* Simulate queue in time */
    for(int min = 1; min &lt;= 60 * HOURS; min++) {
        /* Draw number of new customers depending on API */
        new_customers = curand_poisson(&amp;localState,
                                4*(sin((float)min/100.0)+1));
        /* Update queue */
        update_queue(id, min, new_customers, queue_length,
                     queue_lengths, pitch);       
    }
    /* Copy state back to global memory */
    state[id] = localState;
}


__global__ void host_API_kernel(unsigned int *poisson_numbers,
                    unsigned int *queue_lengths, size_t pitch)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int new_customers;
    unsigned int queue_length = 0;
    /* Simulate queue in time */
    for(int min = 1; min &lt;= 60 * HOURS; min++) {
        /* Get random number from global memory */
        new_customers = poisson_numbers
                    [blockDim.x * gridDim.x * (min -1) + id];
        /* Update queue */
        update_queue(id, min, new_customers, queue_length,
                     queue_lengths, pitch);
    }
}

__global__ void robust_device_API_kernel(curandState *state,
                   curandDiscreteDistribution_t poisson_1,
                   curandDiscreteDistribution_t poisson_2,
                   curandDiscreteDistribution_t poisson_3,
                   unsigned int *queue_lengths, size_t pitch)
{
    int id = threadIdx.x + blockIdx.x * 64;
    unsigned int new_customers;
    unsigned int queue_length = 0;
    /* Copy state to local memory for efficiency */
    curandState localState = state[id];
    /* Simulate queue in time */
    /* first 3 hours */
    for(int min = 1; min &lt;= 60 * 3; min++) {
        /* draw number of new customers depending on API */
        new_customers =
                    curand_discrete(&amp;localState, poisson_2);
        /* Update queue */
        update_queue(id, min, new_customers, queue_length,
                                    queue_lengths, pitch);
    }
    /* second 3 hours */
    for(int min = 60 * 3 + 1; min &lt;= 60 * 6; min++) {
        /* draw number of new customers depending on API */
        new_customers =
                    curand_discrete(&amp;localState, poisson_1);
        /* Update queue */
        update_queue(id, min, new_customers, queue_length,
                                    queue_lengths, pitch);       
    }
    /* after 6 hours */
    for(int min = 60 * 6 + 1; min &lt;= 60 * HOURS; min++) {
        /* draw number of new customers depending on API */
        new_customers =
                    curand_discrete(&amp;localState, poisson_3);
        /* Update queue */
        update_queue(id, min, new_customers, queue_length,
                                    queue_lengths, pitch);       
    }
    /* Copy state back to global memory */
    state[id] = localState;
}

/* Set time intervals between reports */
void report_settings()
{
    do{
        printf("Set time intervals between queue reports");
        printf("(in minutes &gt; 0)\n");
        if (scanf("%d", &amp;report_break) == 0) continue;
    }while(report_break &lt;= 0);
}


/* Set number of cashiers each hour */
void add_cachiers(int *cashiers_load)
{
    int i, min, max, begin, end;
    printf("Cashier serves 2 customers per minute...\n");
    for (i = 0; i &lt; HOURS; i++){
        cashiers_load_h[i] = 0;
    }
    while (true){
        printf("Adding cashier...\n");
        min = OPENING_HOUR;
        max = CLOSING_HOUR-1;
        do{
            printf("Set hour that cahier comes (%d-%d)",
                                                min, max);
            printf(" [type 0 to finish adding cashiers]\n");
            if (scanf("%d", &amp;begin) == 0) continue;
        }while (begin &gt; max || (begin &lt; min &amp;&amp; begin != 0));
        if (begin == 0) break;
        min = begin+1;
        max = CLOSING_HOUR;
        do{
            printf("Set hour that cahier leaves (%d-%d)",
                                                min, max);
            printf(" [type 0 to finish adding cashiers]\n");
            if (scanf("%d", &amp;end) == 0) continue;
        }while (end &gt; max || (end &lt; min &amp;&amp; end != 0));
        if (end == 0) break;
        for (i = begin - OPENING_HOUR;
             i &lt; end   - OPENING_HOUR; i++){
            cashiers_load_h[i]++;
        }
    }
    for (i = OPENING_HOUR; i &lt; CLOSING_HOUR; i++){
        printf("\n%2d:00 - %2d:00     %d cashier",
                i, i+1, cashiers_load_h[i-OPENING_HOUR]);
        if (cashiers_load[i-OPENING_HOUR] != 1) printf("s");
    }
    printf("\n");
}

/* Set API type */
API_TYPE set_API_type()
{
    printf("Choose API type:\n");
    int choose;
    do{
        printf("type 1 for HOST API\n");
        printf("type 2 for SIMPLE DEVICE API\n");
        printf("type 3 for ROBUST DEVICE API\n");
        if (scanf("%d", &amp;choose) == 0) continue;
    }while( choose &lt; 1 || choose &gt; 3);
    switch(choose){
        case 1: return HOST_API;
        case 2: return SIMPLE_DEVICE_API;
        case 3: return ROBUST_DEVICE_API;
        default:
            fprintf(stderr, "wrong API\n");
            return HOST_API;
    }
}

void settings()
{
    add_cachiers(cashiers_load);
    cudaMemcpyToSymbol("cashiers_load", cashiers_load_h,
            HOURS * sizeof(int), 0, cudaMemcpyHostToDevice);
    report_settings();
    api = set_API_type();
}

void print_statistics(unsigned int *hostResults, size_t pitch)
{
    int min, i, hour, minute;
    unsigned int sum;
    for(min = report_break; min &lt;= 60 * HOURS;
                            min += report_break) {
        sum = 0;
        for(i = 0; i &lt; 64 * 64; i++) {
            sum += access_2D(unsigned int, hostResults,
                                        min-1, i, pitch);
        }
        hour = OPENING_HOUR + min/60;
        minute = min%60;
        printf("%2d:%02d   # of waiting customers = %10.4g |",
                    hour, minute, (float)sum/(64.0 * 64.0));
        printf("  # of cashiers = %d  |  ",
                    cashiers_load_h[(min-1)/60]);
        printf("# of new customers/min ~= ");
        switch (api){
            case HOST_API:
                printf("%2.2f\n", 4.0);
                break;
            case SIMPLE_DEVICE_API:
                printf("%2.2f\n",
                            4*(sin((float)min/100.0)+1));
                break;
            case ROBUST_DEVICE_API:
                if (min &lt;= 3 * 60){
                    printf("%2.2f\n", 2.0);
                }else{
                    if (min &lt;= 6 * 60){
                        printf("%2.2f\n", 1.0);
                    }else{
                        printf("%2.2f\n", 3.0);
                    }
                }
                break;
            default:
                fprintf(stderr, "Wrong API\n");
        }
    }
}


int main(int argc, char *argv[])
{
    int n;
    size_t pitch;
    curandState *devStates;
    unsigned int *devResults, *hostResults;
    unsigned int *poisson_numbers_d;
    curandDiscreteDistribution_t poisson_1, poisson_2;
    curandDiscreteDistribution_t poisson_3;
    curandGenerator_t gen;

    /* Setting cashiers, report and API */
    settings();

    /* Allocate space for results on device */
    CUDA_CALL(cudaMallocPitch((void **)&amp;devResults, &amp;pitch,
                64 * 64 * sizeof(unsigned int), 60 * HOURS));

    /* Allocate space for results on host */
    hostResults = (unsigned int *)calloc(pitch * 60 * HOURS,
                sizeof(unsigned int));

    /* Allocate space for prng states on device */
    CUDA_CALL(cudaMalloc((void **)&amp;devStates, 64 * 64 * 
              sizeof(curandState)));

    /* Setup prng states */
    if (api != HOST_API){
        setup_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates);
    }
    /* Simulate queue  */
    switch (api){
        case HOST_API:
            /* Create pseudo-random number generator */
            CURAND_CALL(curandCreateGenerator(&amp;gen,
                                CURAND_RNG_PSEUDO_DEFAULT));
            /* Set seed */
            CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
                                            gen, 1234ULL));
            /* compute n */
            n = 64 * 64 * HOURS * 60;
            /* Allocate n unsigned ints on device */
            CUDA_CALL(cudaMalloc((void **)&amp;poisson_numbers_d,
                                n * sizeof(unsigned int)));
            /* Generate n unsigned ints on device */
            CURAND_CALL(curandGeneratePoisson(gen,
                                poisson_numbers_d, n, 4.0));
            host_API_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(poisson_numbers_d,
                                        devResults, pitch);
            /* Cleanup */
            CURAND_CALL(curandDestroyGenerator(gen));
            break;
        case SIMPLE_DEVICE_API:
            simple_device_API_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates,
                                        devResults, pitch);
            break;
        case ROBUST_DEVICE_API:
            /* Create histograms for Poisson(1) */
            CURAND_CALL(curandCreatePoissonDistribution(1.0,
                                                &amp;poisson_1));
            /* Create histograms for Poisson(2) */
            CURAND_CALL(curandCreatePoissonDistribution(2.0,
                                                &amp;poisson_2));
            /* Create histograms for Poisson(3) */
            CURAND_CALL(curandCreatePoissonDistribution(3.0,
                                                &amp;poisson_3));
            robust_device_API_kernel&lt;&lt;&lt;64, 64&gt;&gt;&gt;(devStates,
                            poisson_1, poisson_2, poisson_3,
                            devResults, pitch);
            /* Cleanup */
            CURAND_CALL(curandDestroyDistribution(poisson_1));
            CURAND_CALL(curandDestroyDistribution(poisson_2));
            CURAND_CALL(curandDestroyDistribution(poisson_3));
            break;
        default:
            fprintf(stderr, "Wrong API\n");
    }
    /* Copy device memory to host */
    CUDA_CALL(cudaMemcpy2D(hostResults, pitch, devResults,
                pitch, 64 * 64 * sizeof(unsigned int),
                60 * HOURS, cudaMemcpyDeviceToHost));
    /* Show result */
    print_statistics(hostResults, pitch);
    /* Cleanup */
    CUDA_CALL(cudaFree(devStates));
    CUDA_CALL(cudaFree(devResults));
    free(hostResults);
    return EXIT_SUCCESS;
}
</pre></div>
                  </div>
               </div>
               
               <hr id="contents-end"></hr>
               
            </article>
         </div>
      </div>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/formatting/common.min.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-write.js"></script>
      <script language="JavaScript" type="text/javascript" charset="utf-8" src="../common/scripts/google-analytics/google-analytics-tracker.js"></script>
      <script type="text/javascript">var switchTo5x=true;</script><script type="text/javascript" src="http://w.sharethis.com/button/buttons.js"></script><script type="text/javascript">stLight.options({publisher: "998dc202-a267-4d8e-bce9-14debadb8d92", doNotHash: false, doNotCopy: false, hashAddressBar: false});</script></body>
</html>