CUDA Handbook Notes - yszheda/wiki GitHub Wiki

Chap. 5 Memory

5.2.11 Atomic Operations

// Device-side spinlock built on a single int lock word.
// The lock word must be initialized to 0 (free) before first use; acquire()
// spins with atomicCAS until it transitions the word from 0 to 1, and
// release() stores 0 back with atomicExch.
class cudaSpinlock {
  public:
    // p: pointer to the lock word (0 = free, 1 = held).
    cudaSpinlock(int *p);
    // Busy-waits until this thread owns the lock.
    void acquire();
    // Releases the lock so another thread's acquire() can succeed.
    void release();
  private:
    int *m_p;  // lock word shared between contending threads
};

// Bind this spinlock object to its lock word.  The constructor does not
// touch *p; the word is expected to already hold 0 (free) or 1 (held).
inline __device__
cudaSpinlock::cudaSpinlock(int *p) : m_p( p )
{
}

// Spin until the CAS moves *m_p from 0 (free) to 1 (held).  atomicCAS
// returns the previous value, so a nonzero result means the lock was
// already held and we must retry.
// NOTE(review): if several threads of the SAME warp contend for this lock,
// SIMT execution can livelock — confirm callers acquire with at most one
// thread per warp.
inline __device__ void
cudaSpinlock::acquire()
{
    while ( atomicCAS( m_p, 0, 1 ) != 0 )
    {
        // busy-wait
    }
}

// Drop the lock: atomically store 0 so a waiting acquire() can succeed.
// The previous value returned by atomicExch is intentionally discarded.
// NOTE(review): data written while holding the lock may need a
// __threadfence() before release to be visible to the next owner — confirm.
inline __device__ void
cudaSpinlock::release()
{
    (void) atomicExch( m_p, 0 );
}

Chap. 8 Streaming Multiprocessors

8.1 Memory

8.1.3 Global Memory

Because atomic operations are implemented using hardware in the GPU’s integrated memory controller, they do not work across the PCI Express bus and thus do not work correctly on device memory pointers that correspond to host memory or peer memory.

8.1.5 Shared Memory

Atomics in Shared Memory

LDSLK (load shared with lock) instruction

Chap. 12 Reduction

Overview

Two-Pass Reduction

When writing warp synchronous code, the volatile keyword must be used for the pointers into shared memory. Otherwise, the compiler may introduce optimizations that change the order of memory operations and the code will not work correctly.

Single-Pass Reduction

The __threadfence() causes all threads in the block to wait until any pending memory transactions have been posted to device memory. When __threadfence() is executed, writes to global memory are visible to all threads, not just the calling thread or threads in the block.

Reduction with Atomics

Arbitrary Block Sizes

Reduction Using Arbitrary Data Types

// Utility to obtain a typed pointer to dynamically allocated shared memory.
// All extern __shared__ declarations alias the same per-block allocation,
// so the array is declared once with element type int and then cast to the
// element type the caller requested.
template<class T>
struct SharedMemory
{
    __device__ inline operator       T*()
    {
        extern __shared__ int __smem[];
        return reinterpret_cast<T*>( __smem );
    }
    __device__ inline operator const T*() const
    {
        extern __shared__ int __smem[];
        return reinterpret_cast<const T*>( __smem );
    }
};

a specialization of the SharedMemory template must be declared; otherwise, the compiler will generate the following error:

Error: Unaligned memory accesses not supported

My personally preferred formulation:

extern __shared__ int __smemInt[];
extern __shared__ T __smemT[];

Predicate Reduction

WARP-LEVEL PRIMITIVES

int __ballot( int p );

__ballot() evaluates a condition for all threads in the warp and returns a 32-bit word, where each bit gives the condition for the corresponding thread in the warp. Since __ballot() broadcasts its result to every thread in the warp, it is effectively a reduction across the warp.

See Warp Vote Functions

int __popc( int i );

__popc returns the number of set bits in the input word.

int __syncthreads_count( int p );

__syncthreads_count waits until all warps in the threadblock have arrived, then broadcasts to all threads in the block the number of threads for which the input condition was true. (SM 2.0)

Warp Reduction with Shuffle

Kepler added shuffle instructions that enable data interchange between threads within a warp without staging the data through shared memory. Although these instructions execute with the same latency as shared memory, they have the benefit of doing the exchange without performing both a read and a write, and they can reduce shared memory usage.

Chap. 13 Scan

Scan / prefix scan / prefix sum / parallel prefix sum is used as a building block for:

  • Radix sort
  • Quick sort
  • Stream compaction and stream splitting
  • Sparse matrix-vector multiplication
  • Minimum spanning tree construction
  • Computation of summed area tables

Overview

prefix dependency

Blelloch's two-pass algorithm

upsweep(a, N)
for d from 0 to (lg N) - 1
    in parallel for i from 0 to N - 1 by 2^(d+1)
        a[i + 2^(d+1) - 1] += a[i + 2^d - 1]
downsweep(a, N)
    a[N - 1] = 0
    for d from (lg N) - 1 downto 0
        in parallel for i from 0 to N - 1 by 2^(d+1)
            t := a[i + 2^d - 1]
            a[i + 2^d - 1] = a[i + 2^(d+1) - 1]
            a[i + 2^(d+1) - 1] += t

Scan and Circuit Design

  • Brent-Kung circuit
  • Sklansky (minimum-depth) circuit
  • Kogge-Stone circuit

CUDA Implementations

Scan-then-fan (recursive)

4N global memory operations: The initial scan performs one read and one write, and then the fan performs another read and write.

Reduce-then-scan (recursive)

3N global memory operations: The initial reduction pass performs one read per element, and then the scan performs another read and a write.

Two-level reduce-then-scan

// TODO

Warp Scans

warp scan modeled on the Kogge-Stone circuit (work-inefficient)

// Inclusive warp scan modeled on the Kogge-Stone circuit (work-inefficient).
// sPartials points at the calling thread's OWN element in shared memory;
// lanes to the left are read with negative offsets, so the caller must
// ensure those addresses are valid for every lane that passes a lane check.
// Warp-synchronous code: volatile keeps the compiler from caching or
// reordering the shared-memory traffic, and correctness assumes lockstep
// execution within the warp (pre-Volta; newer architectures would need
// __syncwarp() between steps — NOTE(review): confirm target SM versions).
// Returns the inclusive prefix sum for the calling lane.
template<class T>
inline __device__ T
scanWarp( volatile T *sPartials )
{
    const int tid = threadIdx.x;
    const int lane = tid & 31;  // lane index within the warp (tid % 32)
    // Each step folds in the partial sum from 2^k lanes to the left; the
    // lane check keeps low lanes from reading before the warp's region.
    if ( lane >=  1 ) sPartials[0] += sPartials[- 1];
    if ( lane >=  2 ) sPartials[0] += sPartials[- 2];
    if ( lane >=  4 ) sPartials[0] += sPartials[- 4];
    if ( lane >=  8 ) sPartials[0] += sPartials[- 8];
    if ( lane >= 16 ) sPartials[0] += sPartials[-16];
    return sPartials[0];
}

ZERO PADDING

Eliminates the per-lane conditional checks, at the cost of extra shared memory.

TEMPLATED FORMULATIONS

// Map a thread index to its slot in the shared-memory scan buffer.
// Without zero padding the mapping is the identity.  With zero padding,
// each 32-lane warp owns a 49-element region whose data starts 16 elements
// in, so the zero-padded warp scan can read sPartials[-1..-16] without any
// lane checks (those pad slots must have been zeroed by the caller).
template<bool bZeroPad>
inline __device__ int
scanSharedIndex( int tid )
{
    if ( ! bZeroPad ) {
        return tid;
    }
    const int warp = tid >> 5;  // warp number (tid / 32)
    const int lane = tid & 31;  // lane within the warp (tid % 32)
    return 49 * warp + 16 + lane;
}

// Bytes of shared memory a block of numThreads threads needs for the scan:
// one T per thread without padding, or one 49-element region per warp
// (plus trailing slack) with zero padding — matching scanSharedIndex.
// NOTE(review): the padded path assumes numThreads is a multiple of 32 —
// confirm callers.
template<typename T, bool bZeroPad>
inline __device__ __host__ int
scanSharedMemory( int numThreads )
{
    if ( ! bZeroPad ) {
        return numThreads * sizeof(T);
    }
    const int numWarps = numThreads >> 5;  // numThreads / 32
    return ( 49 * numWarps + 16 ) * sizeof(T);
}

// Inclusive warp scan (Kogge-Stone), operating in place on shared memory.
// sPartials points at the calling thread's OWN element; lanes to the left
// are addressed with negative offsets.  When bZeroPadded is true, the
// caller laid the data out with scanSharedIndex<true>, so 16 zeroed slots
// precede each warp's data and all five steps run unconditionally — no
// lane checks, no divergence.
// Warp-synchronous code: volatile prevents caching/reordering of the
// shared-memory traffic; correctness assumes lockstep warp execution
// (pre-Volta; NOTE(review): newer architectures need __syncwarp() between
// steps — confirm target SM versions).
// Returns the inclusive prefix sum for the calling lane.
template<class T, bool bZeroPadded>
inline __device__ T
scanWarp( volatile T *sPartials )
{
    T t = sPartials[0];
    if ( bZeroPadded ) {
        // Pad slots [-16..-1] are zero, so low lanes add 0 harmlessly.
        t += sPartials[- 1]; sPartials[0] = t;
        t += sPartials[- 2]; sPartials[0] = t;
        t += sPartials[- 4]; sPartials[0] = t;
        t += sPartials[- 8]; sPartials[0] = t;
        t += sPartials[-16]; sPartials[0] = t;
    }
    else {
        const int tid = threadIdx.x;
        const int lane = tid & 31;  // lane index within the warp
        // Lane checks keep low lanes from reading before the warp's region.
        if ( lane >=  1 ) { t += sPartials[- 1]; sPartials[0] = t; }
        if ( lane >=  2 ) { t += sPartials[- 2]; sPartials[0] = t; }
        if ( lane >=  4 ) { t += sPartials[- 4]; sPartials[0] = t; }
        if ( lane >=  8 ) { t += sPartials[- 8]; sPartials[0] = t; }
        if ( lane >= 16 ) { t += sPartials[-16]; sPartials[0] = t; }
    }
    return t;
}

WARP SHUFFLE

// One Kogge-Stone scan step using the shfl.up instruction.
// Implemented in inline PTX, because the compiler does not emit efficient
// code to deal with the predicate returned by the shuffle instruction:
// shfl.up sets predicate p only when a source lane exists (lanes below
// `offset` have none), so the add is predicated on p instead of branching.
// partial: this lane's running partial sum.
// offset:  number of lanes below to fetch the neighbor's partial from.
// Returns partial + (partial from lane - offset), or partial unchanged for
// lanes with no source lane.
// NOTE(review): this is the legacy, mask-less shfl — removed on Volta+
// (sm_70), where shfl.sync / __shfl_up_sync is required. Confirm targets.
__device__ __forceinline__
int
scanWarpShuffle_step(int partial, int offset)
{
    int result;
    asm(
            "{.reg .u32 r0;"
            ".reg .pred p;"
            "shfl.up.b32 r0|p, %1, %2, 0;"
            "@p add.u32 r0, r0, %3;"
            "mov.u32 %0, r0;}"
            : "=r"(result) : "r"(partial), "r"(offset), "r"(partial));
    return result;
}

// Inclusive scan across 2^levels lanes, built from shuffle-add steps with
// doubling offsets (1, 2, 4, ...).  Typically levels = 5 is passed, since
// 5 is the base-2 logarithm of the warp size of 32.
// Returns the inclusive prefix sum of mysum for the calling lane.
template <int levels>
__device__ __forceinline__
int
scanWarpShuffle(int mysum)
{
    for (int offset = 1; offset < (1 << levels); offset <<= 1)
        mysum = scanWarpShuffle_step(mysum, offset);
    return mysum;
}
// Inclusive block-wide scan of val using warp shuffles.
// Preconditions: blockDim.x == 2^logBlockSize, at most 1024 threads
// (32 warps of 32 lanes), and logBlockSize >= 5 so the second-stage scan
// has a nonnegative level count.
// idx: the thread's linear index within the block.
// Returns the inclusive prefix sum of val across the whole block.
template <int logBlockSize>
__device__ int
scanBlockShuffle(int val, const unsigned int idx)
{
    const unsigned int lane   = idx & 31;   // lane within the warp
    const unsigned int warpid = idx >> 5;   // warp number within the block
    __shared__ int sPartials[32];           // one total per warp (max 32 warps)

    // Intra-warp scan in each warp
    val = scanWarpShuffle<5>(val);

    // Collect per-warp results: after an inclusive scan, lane 31 holds the
    // warp's total.
    if (lane == 31) sPartials[warpid] = val;
    __syncthreads();

    // Use first warp to scan per-warp results.  Only logBlockSize-5 levels
    // are needed, because the block has 2^(logBlockSize-5) warps.
    // NOTE(review): lanes >= 2^(logBlockSize-5) read uninitialized slots;
    // the result stays correct because an inclusive scan's prefix k only
    // depends on slots 0..k and only initialized prefixes are consumed
    // below — but tools like initcheck may flag it. Confirm acceptable.
    if (warpid == 0) {
        int t = sPartials[lane];
        t = scanWarpShuffle<logBlockSize-5>( t );
        sPartials[lane] = t;
    }
    __syncthreads();

    // Add scanned base sum for final result: warp w adds the combined
    // total of warps 0..w-1.
    if (warpid > 0) {
        val += sPartials[warpid - 1];
    }
    return val;
}

INSTRUCTION COUNTS

  • Use cuobjdump to disassemble.
  • the SSY/.S instruction pairs push and pop the divergence stack
  • __syncthreads() compiles to BAR.SYNC instructions in SASS.

Stream Compaction

// TODO

Chap. 14 N-Body

⚠️ **GitHub.com Fallback** ⚠️