Smart Cuda v0.2.0 (New Features and Updates) - markamo/Smart-Cuda GitHub Wiki
The main tutorial covers the initial release of Smart Cuda. This section is only about the new features introduced in version 0.2.0 of Smart Cuda.
Having trouble with Smart CUDA? Check out the documentation or contact [email protected] and we’ll help you sort it out.
- Bug fixes
- Only one header now needs to be included
#include "smartCuda_lib\smartCuda.h"
- Removed the requirement to pass a cudaError_t object to functions for error checking
cudaError_t cudaStatus;
int arr_size = 10;
int* k1 = smartArray<int,smartDevice>(arr_size , cudaStatus);
////cudaStatus no longer required but still available for backwards compatibility
int* k1 = smartArray<int,smartDevice>(arr_size ); /*, cudaStatus);*/
- Introduction of an Inline Array type for dynamic allocation of memory in kernels. Works on both Host and Device platforms by using the smartInlineArray memory type
__global__ void testKernel(int arr_size)
int* arr = smartArray<int,smartInlineArray>(arr_size); //create an inline array of size arr_size;
- Introduction of several inline functions and aliases for Kernel indexing
//// alias for __global__
#define __KERNEL__ __global__
////indexing along one dimension x,y,z = 0,1,2. default index is along the x dimension (0)
// alias for threadIdx.x;
__device__ inline int __local_index(int dim = 0);
// alias for blockIdx.x * blockDim.x + threadIdx.x;
__device__ inline int __global_index(int dim = 0);
// alias for blockIdx.x;
__device__ inline int __block_index(int dim = 0);
__device__ inline int __group_index(int dim = 0);
// alias for gridDim.x * blockDim.x;
__device__ inline int __stride(int dim = 0);
////alias functions for size of threads and blocks along the x, y, z and total size in all dimensions (i.e. 0,1,2,-1). The default is the total size along all dimensions (-1).
// alias for gridDim.x;
__device__ inline int __num_blocks(int dim = -1 );
__device__ inline int __num_groups(int dim = -1);
// alias for blockDim.x;
__device__ inline int __block_size(int dim = -1);
__device__ inline int __group_size(int dim = -1);
__device__ inline int __local_size(int dim = -1);
// alias for gridDim.x * blockDim.x;
__device__ inline int __launch_size(int dim = -1);
__device__ inline int __global_size(int dim = -1);
- New Kernel function appy_func_core: performs parallel element-wise operations on allocated device arrays
template <typename T, class Op> __global__
void appy_func_core(T* dev_Array, const int size, Op fn);
template <typename T, class Op> __global__
void appy_func_core(T* dest, T* src, const int size, Op fn);
template <typename T, class Op> __global__
void appy_func_core(T* dest, T* src1, T* src2, const int size, Op fn);
- New Kernel function transfrom_core: performs parallel element-wise transformations on allocated device arrays. Supports up to 10 allocated device arrays
template <typename T, class Op> __global__
void transfrom_core(T* arr, const int size, Op fn );
template <typename T, class Op> __global__
void transfrom_core(T* arr, T* arr1, T* arr2, const int size, Op fn );
template <typename T, class Op> __global__
void transfrom_core(T* arr, T* arr1, T* arr2, T* arr3, T* arr4, T* arr5, T* arr6, T* arr7, T* arr8, T* arr9, const int size, Op fn );
- New Kernel function transfrom_core_t: performs parallel element-wise transformations on allocated device arrays of different types. Supports up to 10 allocated device arrays and types
template <typename T, class Op> __global__
void transfrom_core_t(T* arr, const int size, Op fn );
template <typename T, typename T1, typename T2, class Op> __global__
void transfrom_core_t(T* arr, T1* arr1, T2* arr2, const int size, Op fn );
template <typename T, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, class Op> __global__
void transfrom_core_t(T* arr, T1* arr1, T2* arr2, T3* arr3, T4* arr4, T5* arr5, T6* arr6, T7* arr7, T8* arr8, T9* arr9, const int size, Op fn );
- Simple reduction function and function operators for reductions on the GPU. Custom implementations of smartOperator can be used with the reduction kernel.
////pre-defined operators
template <typename T> class smartOperator;
template <typename T> class smartPlus: public smartOperator<T>;
template <typename T> class smartMultiply: public smartOperator<T>;
template <typename T> class smartMax: public smartOperator<T>;
template <typename T> class smartMin: public smartOperator<T>;
template <typename T> class smartOR: public smartOperator<T>;
template <typename T> class smartAND: public smartOperator<T>;
////kernel launcher
template<typename T, class Op>
void smartReduce_kl(T *answer, T *partial, const T *in, size_t N, int numBlocks, int numThreads, Op fn );
- Smart Random library for random number generation in device kernels. It provides a default set of random number generator states (defaultStates) that can be used from any part of the code. The Smart Random library is a lightweight wrapper around the cuRand library.
__device__ curandState *defaultStates;
__host__ cudaError_t initDefaultRand(int size = 64 * 64, int seed = time(NULL));
__host__ cudaError_t releaseDefaultRand();
////use defaultStates in cuRand calls
releaseDefaultRand(); //// called when done using defaultStates to release memory allocated;
- Other Smart Random library functions and kernels
__global__ void setup_rand_kernel(curandState *state, unsigned int size, unsigned int seed);
template <typename T>
__global__ void generate_uniform_kernel(T* result, int size, curandState *state );
template <typename T>
__global__ void generate_uniform_range_kernel(T* result, T lower, T upper, int size, curandState *state );
template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int size, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, const int sizeZ, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, const int sizeZ, const int sizeW, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int size, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, const int sizeZ, curandState *state = defaultStates);
template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, const int sizeZ, const int sizeW, curandState *state = defaultStates);