Smart Cuda v0.2.0 (New Features and Updates)
The main tutorial for the initial release of Smart Cuda is at https://github.com/markamo/Smart-Cuda/wiki/Welcome-to-the-Smart-Cuda-v0.1.2-wiki! This section covers only the new features introduced in version 0.2.0 of Smart Cuda.
Having trouble with Smart CUDA? Check out the documentation at http://markamo.github.io/Smart-Cuda/ or https://github.com/markamo/Smart-Cuda or contact [email protected] and we’ll help you sort it out.
- Bug fixes
- Only a single header now needs to be included:

```cpp
#include "smartCuda_lib\smartCuda.h"
```
- Passing a cudaError_t object to Smart CUDA functions for error checking is no longer required:

```cpp
cudaError_t cudaStatus;
int arr_size = 10;
....
int* k1 = smartArray<int, smartDevice>(arr_size, cudaStatus);
....
//// cudaStatus is no longer required but still available for backwards compatibility
int* k1 = smartArray<int, smartDevice>(arr_size); /*, cudaStatus);*/
```
- Introduction of an inline array type for dynamic allocation of memory inside kernels. It works on both the host and the device, via smartInlineArray and smartInlineArrayFree. Example:
```cpp
__global__ void testKernel(int arr_size)
{
    int* arr = smartArray<int, smartInlineArray>(arr_size); // create an inline array of size arr_size
    ...
    smartInlineArrayFree(arr);
}
```
- Introduction of several inline functions and aliases for kernel indexing (a usage sketch follows the listing):
```cpp
//// alias for __global__
#define __KERNEL__ __global__

//// indexing along one dimension: x, y, z = 0, 1, 2. The default index is along the x dimension (0)
// alias for threadIdx.x;
__device__ inline int __local_index(int dim = 0);
// alias for blockIdx.x * blockDim.x + threadIdx.x;
__device__ inline int __global_index(int dim = 0);
// alias for blockIdx.x;
__device__ inline int __block_index(int dim = 0);
__device__ inline int __group_index(int dim = 0);
// alias for gridDim.x * blockDim.x;
__device__ inline int __stride(int dim = 0);

//// alias functions for the number of threads and blocks along the x, y, z dimensions (0, 1, 2) and the total size across all dimensions (-1). The default is the total size across all dimensions (-1).
// alias for gridDim.x;
__device__ inline int __num_blocks(int dim = -1);
__device__ inline int __num_groups(int dim = -1);
// alias for blockDim.x;
__device__ inline int __block_size(int dim = -1);
__device__ inline int __group_size(int dim = -1);
__device__ inline int __local_size(int dim = -1);
// alias for gridDim.x * blockDim.x;
__device__ inline int __launch_size(int dim = -1);
__device__ inline int __global_size(int dim = -1);
```
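As a minimal, hypothetical sketch (the kernel and its launch configuration are illustrative, not part of the library), these aliases compose naturally into a grid-stride loop:

```cpp
// Hypothetical kernel: scales every element of a device array in place.
// __global_index() stands for blockIdx.x * blockDim.x + threadIdx.x and
// __stride() for gridDim.x * blockDim.x, per the aliases above.
__KERNEL__ void scale_kernel(float* data, int n, float factor)
{
    for (int i = __global_index(); i < n; i += __stride())
    {
        data[i] *= factor;
    }
}
```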
- New kernel function appy_func_core, which performs parallel element-wise operations on allocated device arrays (a hypothetical launch sketch follows the signatures):
```cpp
template <typename T, class Op> __global__
void appy_func_core(T* dev_Array, const int size, Op fn);

template <typename T, class Op> __global__
void appy_func_core(T* dest, T* src, const int size, Op fn);

template <typename T, class Op> __global__
void appy_func_core(T* dest, T* src1, T* src2, const int size, Op fn);
```
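A minimal sketch of launching the single-array overload; the functor convention (fn applied element-wise, in place) and the launch configuration are assumptions, so consult smartCuda.h for the exact semantics:

```cpp
// Hypothetical functor: squares its argument element-wise.
struct Square
{
    __device__ int operator()(int x) const { return x * x; }
};

// dev_arr is a device array of length arr_size, e.g. from smartArray.
// The grid/block sizes are illustrative.
int arr_size = 1024;
int* dev_arr = smartArray<int, smartDevice>(arr_size);
appy_func_core<<<(arr_size + 255) / 256, 256>>>(dev_arr, arr_size, Square());
```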
- New kernel function transfrom_core, which performs parallel element-wise transformations on allocated device arrays. It supports up to 10 allocated device arrays (a hypothetical usage sketch follows the signatures):
```cpp
template <typename T, class Op> __global__
void transfrom_core(T* arr, const int size, Op fn);

template <typename T, class Op> __global__
void transfrom_core(T* arr, T* arr1, T* arr2, const int size, Op fn);

...

template <typename T, class Op> __global__
void transfrom_core(T* arr, T* arr1, T* arr2, T* arr3, T* arr4, T* arr5, T* arr6, T* arr7, T* arr8, T* arr9, const int size, Op fn);

...
```
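A hedged sketch of the three-array overload; the convention that the first array receives the result of fn applied to corresponding elements of the remaining arrays is an assumption:

```cpp
// Hypothetical binary functor combining two source arrays element-wise.
struct Add
{
    __device__ float operator()(float a, float b) const { return a + b; }
};

int n = 1024;
float* dev_out = smartArray<float, smartDevice>(n);
float* dev_a   = smartArray<float, smartDevice>(n);
float* dev_b   = smartArray<float, smartDevice>(n);
transfrom_core<<<(n + 255) / 256, 256>>>(dev_out, dev_a, dev_b, n, Add());
```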
- New kernel function transfrom_core_t, which performs parallel element-wise transformations on allocated device arrays of different types. It supports up to 10 allocated device arrays and types (a hypothetical usage sketch follows the signatures):
```cpp
template <typename T, class Op> __global__
void transfrom_core_t(T* arr, const int size, Op fn);

template <typename T, typename T1, typename T2, class Op> __global__
void transfrom_core_t(T* arr, T1* arr1, T2* arr2, const int size, Op fn);

...

template <typename T, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, class Op> __global__
void transfrom_core_t(T* arr, T1* arr1, T2* arr2, T3* arr3, T4* arr4, T5* arr5, T6* arr6, T7* arr7, T8* arr8, T9* arr9, const int size, Op fn);

...
```
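A hedged sketch of the mixed-type overload, under the same assumed convention (first array receives the result):

```cpp
// Hypothetical mixed-type functor: combines a float and an int element
// into a double. The result/argument convention is an assumption.
struct Mix
{
    __device__ double operator()(float a, int b) const { return (double)a * b; }
};

int n = 1024;
double* dev_d = smartArray<double, smartDevice>(n);
float*  dev_f = smartArray<float,  smartDevice>(n);
int*    dev_i = smartArray<int,    smartDevice>(n);
transfrom_core_t<<<(n + 255) / 256, 256>>>(dev_d, dev_f, dev_i, n, Mix());
```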
- Simple reduction function and function operators for reductions on the GPU. A custom implementation of smartOperator can be used with the reduction kernel (a usage sketch follows the listing):
```cpp
//// pre-defined operators
template <typename T> class smartOperator;
template <typename T> class smartPlus: public smartOperator<T>;
template <typename T> class smartMultiply: public smartOperator<T>;
template <typename T> class smartMax: public smartOperator<T>;
template <typename T> class smartMin: public smartOperator<T>;
template <typename T> class smartOR: public smartOperator<T>;
template <typename T> class smartAND: public smartOperator<T>;

//// kernel launcher
template<typename T, class Op>
void smartReduce_kl(T *answer, T *partial, const T *in, size_t N, int numBlocks, int numThreads, Op fn);
```
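A minimal sketch of a sum reduction with the built-in smartPlus operator; the assumptions that partial holds one partial result per block and that the operator classes are default-constructible follow from the signatures but are not confirmed here:

```cpp
// Sum-reduce the device array dev_in (length N) into *dev_answer.
size_t N = 1 << 20;
int numBlocks = 64, numThreads = 256;
float* dev_in      = smartArray<float, smartDevice>((int)N);
float* dev_answer  = smartArray<float, smartDevice>(1);          // final result
float* dev_partial = smartArray<float, smartDevice>(numBlocks);  // per-block partials (assumed)
smartReduce_kl(dev_answer, dev_partial, dev_in, N, numBlocks, numThreads, smartPlus<float>());
```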
- Smart Random library for random number generation in device kernels. It provides a default random state that can be used from any part of the code, and is a lightweight wrapper around the cuRAND library.
Functions:
```cpp
__device__ curandState *defaultStates;
...
__host__ cudaError_t initDefaultRand(int size = 64 * 64, int seed = time(NULL));
__host__ cudaError_t releaseDefaultRand();
```
Usage:
```cpp
initDefaultRand(256 * 256);
....
//// use defaultStates in cuRand calls
....
releaseDefaultRand(); //// called when done using defaultStates, to release the allocated memory
```
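A sketch of what using defaultStates in a cuRAND call might look like inside a kernel; the per-thread indexing (and the requirement that the thread index stay below the size passed to initDefaultRand) is an assumption:

```cpp
#include <curand_kernel.h>

// Hypothetical kernel: draws one uniform float per thread from the default states.
__global__ void draw_uniform_kernel(float* out, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        curandState localState = defaultStates[idx]; // copy state to local storage
        out[idx] = curand_uniform(&localState);      // standard cuRAND device API
        defaultStates[idx] = localState;             // save the advanced state back
    }
}
```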
- Other Smart Random library functions and kernels (a usage sketch follows the signatures):
```cpp
__global__ void setup_rand_kernel(curandState *state, unsigned int size, unsigned int seed);

template <typename T>
__global__ void generate_uniform_kernel(T* result, int size, curandState *state);

template <typename T>
__global__ void generate_uniform_range_kernel(T* result, T lower, T upper, int size, curandState *state);

template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int size, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, const int sizeZ, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, const int sizeZ, const int sizeW, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int size, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, const int sizeZ, curandState *state = defaultStates);

template <typename T>
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, const int sizeZ, const int sizeW, curandState *state = defaultStates);
```
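A short, hypothetical end-to-end sketch (array names and sizes are illustrative; the calls rely on the default states initialized above):

```cpp
int n = 1024;
initDefaultRand(256 * 256);                          // set up the default cuRAND states
float* dev_a = smartArray<float, smartDevice>(n);
float* dev_b = smartArray<float, smartDevice>(n);
smartRandu(dev_a, n);                                // uniform values, defaultStates implied
smartRandr(dev_b, -5.0f, 5.0f, n);                   // uniform values in the range [-5, 5)
releaseDefaultRand();                                // release the default states when done
```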