CUDA - BKJackson/BKJackson_Wiki GitHub Wiki
CUDA Indexing

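Kernel launch syntax: GPUkernelfunction<<<blocknum, threadnum>>>() (blocknum = number of blocks in the grid, threadnum = number of threads per block)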
nvcc -arch=sm_70 -o out some-CUDA.cu -run
Docs on the -arch flag
GPU feature list for the -arch flag
#include <stdio.h>
/*
 * Demo kernel: every launched thread prints one line containing its block
 * index and its thread index within that block (x dimension only).
 * Device-side printf is meant for debugging/demonstration.
 */
__global__ void loop()
{
  /* Grid and block x-indices of the calling thread. */
  const int block = blockIdx.x;
  const int thread = threadIdx.x;

  printf("This is iteration number %d %d\n", block, thread);
}
/*
 * Launches `loop` on 2 blocks of 10 threads each and waits for the device to
 * finish so the kernel's output is produced before the process exits.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution reported a
 * CUDA error (the error string is written to stderr).
 */
int main()
{
  /* Loop over 2 blocks with 10 threads each */
  loop<<<2, 10>>>();

  /* A kernel launch has no return value; launch-configuration errors are
     reported through cudaGetLastError(). */
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  /* Kernels run asynchronously: block until the device is done. Errors raised
     while the kernel was executing surface from this call. */
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  return 0;
}
// CPU-only: allocate a buffer of N ints with malloc and release it with free.
int N = 2<<20; // 2 * 2^20 = 2,097,152 elements
size_t size = N * sizeof(int); // buffer size in bytes
int *a;
a = (int *)malloc(size); // NOTE(review): the result is not checked for NULL
// Use `a` in CPU-only program.
free(a);
// Accelerated: the same N-int buffer, allocated with cudaMallocManaged and
// released with cudaFree instead of malloc/free.
int N = 2<<20; // 2 * 2^20 = 2,097,152 elements
size_t size = N * sizeof(int); // buffer size in bytes
int *a;
// Note the address of `a` is passed as first argument.
// NOTE(review): the cudaError_t returned by cudaMallocManaged is ignored here;
// real code should check it before using `a`.
cudaMallocManaged(&a, size);
// Use `a` on the CPU and/or on any GPU in the accelerated system.
cudaFree(a);
/*
 * Skeleton of a kernel that visits the N elements of `a` with a loop whose
 * step is the total number of threads in the grid, so any 1D launch
 * configuration covers all N elements.
 *
 * Parameters:
 *   a - pointer to the N ints to process; must be accessible from the device.
 *   N - number of elements; values <= 0 result in no iterations.
 *
 * Expects a 1D grid of 1D blocks (only the .x components are used).
 */
__global__ void kernel(int *a, int N)
{
  // Clamp a non-positive N to zero so the unsigned comparison below cannot
  // treat a negative count as a huge one.
  const size_t count = (N > 0) ? (size_t)N : 0;

  // Widen to size_t before multiplying: blockIdx.x * blockDim.x and
  // gridDim.x * blockDim.x are 32-bit unsigned products that can wrap on
  // large grids, which would give a wrong start index or a zero/negative
  // stride once narrowed to int.
  const size_t indexWithinTheGrid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t gridStride = (size_t)gridDim.x * blockDim.x;

  for (size_t i = indexWithinTheGrid; i < count; i += gridStride)
  {
    // do work on a[i];
  }
}
CUDA C++ Best Practices Guide
CUDA GPU Device Properties docs - for programmatically querying GPU device properties