Smart Cuda v0.2.0 (New Features and Updates)

Smart CUDA

Welcome to Smart CUDA Library Wiki


Main Tutorial

The main tutorial for the initial release of Smart Cuda is at https://github.com/markamo/Smart-Cuda/wiki/Welcome-to-the-Smart-Cuda-v0.1.2-wiki! This section covers only the new features introduced in version 0.2.0 of Smart Cuda.

Support or Contact

Having trouble with Smart CUDA? Check out the documentation at http://markamo.github.io/Smart-Cuda/ or https://github.com/markamo/Smart-Cuda or contact [email protected] and we’ll help you sort it out.


Changelog (from version 0.1.2.1 to 0.2.0)

  • Bug fixes
  • Only one header now needs to be included:
#include "smartCuda_lib\smartCuda.h"
  • Passing a cudaError_t object to functions for error checking is no longer required
cudaError_t cudaStatus;
int arr_size = 10;
....
int* k1 = smartArray<int,smartDevice>(arr_size, cudaStatus);
....
//// cudaStatus is no longer required but is still accepted for backwards compatibility
int* k1 = smartArray<int,smartDevice>(arr_size); /*, cudaStatus);*/

  • Introduction of an inline array type for dynamic allocation of memory inside kernels. It works on both the host and the device through smartInlineArray and smartInlineArrayFree. Example:
__global__ void testKernel(int arr_size)
{
    int* arr = smartArray<int,smartInlineArray>(arr_size); // create an inline array of size arr_size
    ...                                                    // use arr as per-thread scratch storage
    smartInlineArrayFree(arr);                             // free the inline array before the kernel returns
}
  • Introduction of several inline functions and aliases for kernel indexing (a usage sketch follows the declarations below)
//// alias for __global__ 
#define __KERNEL__ __global__ 

//// indexing along one dimension: x, y, z = 0, 1, 2; the default index is along the x dimension (0)
 // alias for threadIdx.x;
__device__ inline int __local_index(int dim = 0);

 // alias for blockIdx.x * blockDim.x + threadIdx.x;
__device__ inline int __global_index(int dim = 0);

 // alias for blockIdx.x;
__device__ inline int __block_index(int dim = 0);
__device__ inline int __group_index(int dim = 0);

 // alias for gridDim.x * blockDim.x;
__device__ inline int __stride(int dim = 0);

//// alias functions for the sizes of threads and blocks along the x, y, z dimensions (0, 1, 2) and the total size across all dimensions (-1). The default is the total size across all dimensions (-1).

 // alias for gridDim.x;
__device__ inline int __num_blocks(int dim = -1 );
__device__ inline int __num_groups(int dim = -1);

 // alias for blockDim.x;
__device__ inline int __block_size(int dim = -1);
__device__ inline int __group_size(int dim = -1);
__device__ inline int __local_size(int dim = -1);

 // alias for gridDim.x * blockDim.x;
__device__ inline int __launch_size(int dim = -1);
__device__ inline int __global_size(int dim = -1);
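
For illustration, here is a minimal sketch (not part of the library) of how these aliases combine into a standard grid-stride loop; the kernel name and its parameters are hypothetical:

__KERNEL__ void scale_kernel(float* data, int n, float factor)
{
    //// __global_index() expands to blockIdx.x * blockDim.x + threadIdx.x,
    //// __stride() to gridDim.x * blockDim.x, per the aliases above
    for (int i = __global_index(); i < n; i += __stride())
    {
        data[i] *= factor;
    }
}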

  • New kernel function appy_func_core performs parallel element-wise operations on allocated device arrays
template <typename T, class Op> __global__  
void appy_func_core(T* dev_Array, const int size, Op fn);

template <typename T, class Op> __global__
 void appy_func_core(T* dest, T* src, const int size, Op fn);

template <typename T, class Op> __global__
 void appy_func_core(T* dest, T* src1, T* src2, const int size, Op fn);
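
A minimal usage sketch, assuming the one-array overload applies fn to each element in place; the Square functor, array, and launch configuration are illustrative, not part of the library:

struct Square
{
    __device__ int operator()(int x) const { return x * x; }
};
....
int n = 1024;
int* d_arr = smartArray<int,smartDevice>(n);
appy_func_core<<<(n + 255) / 256, 256>>>(d_arr, n, Square()); //// assumed semantics: d_arr[i] = fn(d_arr[i])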


  • New kernel function transfrom_core performs parallel element-wise transformations on allocated device arrays. Supports up to 10 allocated device arrays
template <typename T, class Op> __global__
 void transfrom_core(T* arr, const int size, Op fn );

template <typename T, class Op> __global__
 void transfrom_core(T* arr, T* arr1, T* arr2, const int size, Op fn );

...

template <typename T, class Op> __global__
 void transfrom_core(T* arr, T* arr1, T* arr2, T* arr3, T* arr4,  T* arr5,  T* arr6, T* arr7, T* arr8, T* arr9, const int size, Op fn );

...
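
A minimal usage sketch, assuming the three-array overload writes fn(arr1[i], arr2[i]) into arr[i]; the Add functor and buffers are illustrative:

struct Add
{
    __device__ float operator()(float a, float b) const { return a + b; }
};
....
int n = 1024;
float* d_a = smartArray<float,smartDevice>(n);
float* d_b = smartArray<float,smartDevice>(n);
float* d_out = smartArray<float,smartDevice>(n);
transfrom_core<<<(n + 255) / 256, 256>>>(d_out, d_a, d_b, n, Add()); //// assumed: d_out[i] = fn(d_a[i], d_b[i])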

  • New kernel function transfrom_core_t performs parallel element-wise transformations on allocated device arrays of different element types. Supports up to 10 allocated device arrays and types
template <typename T, class Op> __global__
 void transfrom_core_t(T* arr, const int size, Op fn );


template <typename T, typename T1, typename T2, class Op> __global__
 void transfrom_core_t(T* arr, T1* arr1, T2* arr2, const int size, Op fn );

...

template <typename T, typename T1, typename T2,  typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, class Op> __global__
 void transfrom_core_t(T* arr, T1* arr1, T2* arr2, T3* arr3, T4* arr4,  T5* arr5,  T6* arr6, T7* arr7, T8* arr8, T9* arr9, const int size, Op fn );

...
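
Under the same assumed semantics, a sketch of the mixed-type form combining a double array and an int array into a float result; all names here are illustrative:

struct Scale
{
    __device__ float operator()(double a, int b) const { return (float)(a * b); }
};
....
double* d_src1 = smartArray<double,smartDevice>(n);
int* d_src2 = smartArray<int,smartDevice>(n);
float* d_out = smartArray<float,smartDevice>(n);
transfrom_core_t<<<(n + 255) / 256, 256>>>(d_out, d_src1, d_src2, n, Scale()); //// assumed: d_out[i] = fn(d_src1[i], d_src2[i])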

  • Simple reduction function and function operators for reductions on the GPU. Custom implementations of smartOperator can be used with the reduction kernel.
////pre-defined operators
template <typename T> class smartOperator;
template <typename T> class smartPlus: public smartOperator<T>;
template <typename T> class smartMultiply: public smartOperator<T>;
template <typename T> class smartMax: public smartOperator<T>;
template <typename T> class smartMin: public smartOperator<T>;
template <typename T> class smartOR: public smartOperator<T>;
template <typename T> class smartAND: public smartOperator<T>;

////kernel launcher
template<typename T, class Op>
void smartReduce_kl(T *answer, T *partial, const T *in, size_t N, int numBlocks, int numThreads, Op fn );
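
A minimal usage sketch, assuming answer receives the final result, partial holds one intermediate value per block, and fn combines elements; the buffer sizes and launch parameters below are assumptions, not documented requirements:

const size_t N = 1 << 20;
int numBlocks = 64, numThreads = 256;
float* d_in = smartArray<float,smartDevice>(N);
float* d_partial = smartArray<float,smartDevice>(numBlocks); //// assumed: one partial result per block
float* d_answer = smartArray<float,smartDevice>(1);
....
smartReduce_kl(d_answer, d_partial, d_in, N, numBlocks, numThreads, smartPlus<float>());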


  • Smart Random library for random number generation in device kernels. It provides a default random-number generator state that can be used from any part of the code, and is a lightweight wrapper around the cuRAND library.

Functions:

 __device__ curandState *defaultStates;
...
__host__ cudaError_t initDefaultRand(int size = 64 * 64, int seed = time(NULL));
__host__ cudaError_t releaseDefaultRand();

Usage:

initDefaultRand(256*256);
....
////use defaultStates in cuRand calls
....

releaseDefaultRand(); //// call when done using defaultStates to release the allocated memory
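
For illustration, a minimal sketch (not from the library) of a kernel that draws one uniform value per thread from defaultStates; it assumes enough states were initialized by initDefaultRand to cover the launch:

#include <curand_kernel.h>

__global__ void noise_kernel(float* out, int n)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n)
    {
        curandState local = defaultStates[id]; //// assumes n <= number of initialized states
        out[id] = curand_uniform(&local);      //// uniform value in (0, 1]
        defaultStates[id] = local;             //// write the advanced state back
    }
}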

  • Other Smart Random library functions and kernels
__global__ void setup_rand_kernel(curandState *state, unsigned int size, unsigned int seed);

template <typename T>
__global__ void generate_uniform_kernel(T* result, int size, curandState *state );

template <typename T>
__global__ void generate_uniform_range_kernel(T* result, T lower, T upper, int size, curandState *state );

template <typename T> 
__host__ cudaError_t smartRandu(T *dev_Array, const int size, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, const int sizeZ, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandu(T *dev_Array, const int sizeX, const int sizeY, const int sizeZ, const int sizeW, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int size, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, const int sizeZ, curandState *state = defaultStates);

template <typename T> 
__host__ cudaError_t smartRandr(T *dev_Array, T min, T max, const int sizeX, const int sizeY, const int sizeZ, const int sizeW, curandState *state = defaultStates);
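
A minimal usage sketch of these host-side helpers, assuming they fill a device array using the default states; the sizes and bounds are illustrative:

initDefaultRand(256 * 256);
float* d_noise = smartArray<float,smartDevice>(1024);
smartRandu(d_noise, 1024);              //// assumed: uniform values in (0, 1]
smartRandr(d_noise, -1.0f, 1.0f, 1024); //// assumed: uniform values in [min, max)
releaseDefaultRand();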


