cuda-examples
nvcc options
- nvidia/cuda-compiler-driver-nvcc
CUDA toolkit archive
LAMMPS GPU Build options
- Supported CUDA Compute Capability versions for CUDA SDK version and Microarchitecture
CUDA shared-library-creation
- nvcc --ptxas-options=-v --compiler-options '-fPIC' -o mylib.so --shared mykernel.cu
- extern "C" cudamain() { ... }
Explicit Warp-Level Programing

Something like SIMD intrinstic (vector instruction on CPU)
- While the high performance obtained by warp execution happens behind the scene, many CUDA programs can achieve even higher performance by using explicit warp-level programming.
Using CUDA Warp-Level Primitives
n-Body

mini-nbody / cuda
- Stack::Ovrflow: n-body-cuda-optimization
OpenACC

GPU Gems 3

N-Body (Again)

Perhaps, as compared to what already published in the literature (Fast N-Body Simulation with CUDA) and what already available as codes (see the above answers and Mark Harris' GitHub N-body page, the last kernel is the only new thing. But I have played a bit with N-body, and found it useful to post this answer, potentially useful to next users.
From: Stack::Ovrflow: n-body-cuda-optimization
#include <stdio.h>

#define GRAVITATIONAL_CONST 6.67*1e−11
#define SOFTENING 1e-9f

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < N) {
        float invDist, invDist3;
        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {
            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;
            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;
            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;
            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;
            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }
        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/**********************************/
/* KERNEL FUNCTION: SHARED MEMORY */
/**********************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < N) {
        float invDist, invDist3;
        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];
        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {
            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];
            __syncthreads();
#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {
                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;
                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;
                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;
                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;
                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }
        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/***************************/
/* KERNEL FUNCTION: TILING */
/***************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;
    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {
        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {
            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;
            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;
            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;
            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;
            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }
        __syncthreads();
        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*******************************************/
/* KERNEL FUNCTION: TILING + SHARED MEMORY */
/*******************************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;
    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {
        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];
        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {
            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];
            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {
                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;
                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;
                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;
                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;
                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }
        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/************************************************/
/* KERNEL FUNCTION: TILING + SHUFFLE OPERATIONS */
/************************************************/
__global__
oid KernelcomputeForces_Tiling_Shuffle(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;
    const int laneid = threadIdx.x & 31;
    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {
        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=32) {
            // --- Loading data to shared memory
            float x_src = x_d[ib + laneid];
            float y_src = y_d[ib + laneid];
            float z_src = z_d[ib + laneid];
            float m_src = m_d[ib + laneid];

#pragma unroll 32
            for(unsigned int ic=0; ic<32; ic++) {
                // --- Compute relative distances
                float dx = (__shfl(x_src, ic) - x_reg);
                float dy = (__shfl(y_src, ic) - y_reg);
                float dz = (__shfl(z_src, ic) - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;
                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;
                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;
                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * __shfl(m_src, ic)) * invDist3;
                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }
        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int BLOCKSIZE>
float GPUcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;
    dim3 grid(iDivUp(N,BLOCKSIZE), 1, 1);       // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);                // --- Threads per block
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));
    KernelcomputeForces_Shared<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    return time;
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int GRIDSIZE, int BLOCKSIZE>
float GPUcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;
    dim3 grid(GRIDSIZE, 1, 1);      // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);    // --- Threads per block
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    KernelcomputeForces_Tiling_Shuffle<<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    return time;
}

/* CPU CALCULATIONS */
void CPUcomputeForces(float* m_h, float* x_h, float* y_h, float* z_h, float* fx_h, float* fy_h, float* fz_h, unsigned int N) {  
    for (unsigned int i=0; i<N; i++) {
        float invDist, invDist3;
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {
            // --- Compute relative distances
            float dx = (x_h[ib] - x_h[i]);
            float dy = (y_h[ib] - y_h[i]);
            float dz = (z_h[ib] - z_h[i]);
            float distanceSquared = dx*dx + dy*dy + dz*dz;
            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;
            float invDist = 1.f / sqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;
            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_h[i] * m_h[ib]) * invDist3;
            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }
        // --- Stores local memory to global memory
        fx_h[i] = fx_temp;
        fy_h[i] = fy_temp;
        fz_h[i] = fz_temp;
    }
}

/********/
/* MAIN */
/********/
int main(void){
    const int N = 16384;
    size_t gsize = sizeof(float) * size_t(N);
    float * g[10], * _g[7];
    for(int i=0; i<7; i++) gpuErrchk( cudaMalloc((void **)&_g[i], gsize)); 
    for(int i=0; i<10; i++) g[i] = (float *)malloc(gsize);
    for(int i=0; i<N; i++)
        for(int j=0; j<4; j++)
            *(g[j]+i) = (float)rand();

    for(int i=0; i<4; i++) gpuErrchk(cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice)); 

    // --- Warm up to take context establishment time out.
    GPUcomputeForces<512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
    //GPUcomputeForces_Tiling<32,512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);

    // --- Bench runs
    printf("1024: %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512:  %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256:  %f\n",    GPUcomputeForces<256>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128:  %f\n",    GPUcomputeForces<128>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64:   %f\n",    GPUcomputeForces<64>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32:   %f\n",    GPUcomputeForces<32>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 1024: %f\n",    GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  1024: %f\n",    GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("4,  1024: %f\n",    GPUcomputeForces_Tiling<4,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 512: %f\n",     GPUcomputeForces_Tiling<32,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 512: %f\n",     GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  512: %f\n",     GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 256:  %f\n",    GPUcomputeForces_Tiling<64,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 256:  %f\n",    GPUcomputeForces_Tiling<32,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 256:  %f\n",    GPUcomputeForces_Tiling<16,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,128:  %f\n",    GPUcomputeForces_Tiling<128,128>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 128:  %f\n",    GPUcomputeForces_Tiling<64,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 128:  %f\n",    GPUcomputeForces_Tiling<32,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,64:   %f\n",    GPUcomputeForces_Tiling<256,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,64:   %f\n",    GPUcomputeForces_Tiling<128,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 64:   %f\n",    GPUcomputeForces_Tiling<64,64>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    for(int i=8; i<10; i++) gpuErrchk(cudaMemcpy(g[i], _g[i-3], gsize, cudaMemcpyDeviceToHost)); 
    CPUcomputeForces(g[0],g[1],g[2],g[3],g[4],g[5],g[6],N);
    for(int i=0; i<N; i++)
        for(int j=8; j<10; j++) {
            if (abs(*(g[j]+i) - *(g[j-3]+i)) > 0.001f) {
                printf("Error at %i, %i; GPU = %f; CPU = %f\n",i, (j-8), *(g[j-3]+i), *(g[j]+i));
                return;
            }
        }
    printf("Test passed!\n");
    cudaDeviceReset();
    return 0;
}
CUDA - ProkopHapala/FireCore GitHub Wiki

Explicit Warp-Level Programing

n-Body

OpenACC

GPU Gems 3

N-Body (Again)

⚠️ GitHub.com Fallback ⚠️

CUDA - ProkopHapala/FireCore GitHub Wiki

Explicit Warp-Level Programing

n-Body

OpenACC

GPU Gems 3

N-Body (Again)

⚠️ **GitHub.com Fallback** ⚠️

⚠️ GitHub.com Fallback ⚠️