OpenCL - FrankBau/meta-marsboard-bsp GitHub Wiki
According to the i.MX6 Graphics User's Guide iMX6 Vivante GPGPU cores are compatible with OpenCL Embedded Profile version 1.1.
For OpenCL acceleration in some software like OpenCV on MarS Board, full profile is required, and therefore, OpenCL acceleration cannot be enabled for those.
See also https://www.toradex.com/de/blog/experimenting-with-opencl-on-apalis-imx6q-system-on-module.
hello_opencl.cpp:
// see https://community.nxp.com/docs/DOC-93984
// http://www.cmsoft.com.br/opencl-tutorial/case-study-high-performance-convolution-using-opencl-__local-memory/
// http://www.tankonyvtar.hu/hu/tartalom/tamop412A/2011-0063_08_opencl_en/ch08s05.html
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <CL/cl.h>
// time diff in nanoseconds
uint64_t diff_time( struct timespec *end_tv, struct timespec *start_tv )
{
uint64_t diff;
if ( (end_tv->tv_nsec - start_tv->tv_nsec) < 0 ) {
diff = 1000000000 * (end_tv->tv_sec - start_tv->tv_sec - 1);
diff += 1000000000 + end_tv->tv_nsec - start_tv->tv_nsec;
} else {
diff = 1000000000 * (end_tv->tv_sec - start_tv->tv_sec);
diff += end_tv->tv_nsec - start_tv->tv_nsec;
}
printf( "diff=%12.6f ms\n", diff/1000000.0 );
return diff;
}
struct timespec tick;
struct timespec tock;
const char *sourceCode =
" \
__kernel void Filter ( \
__read_only image2d_t input, \
__write_only image2d_t output) \
{ \
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | \
CLK_ADDRESS_CLAMP_TO_EDGE | \
CLK_FILTER_NEAREST; \
const int2 pos = {get_global_id(0), get_global_id(1)}; \
const uint4 pix = read_imageui( input, smp, pos ); \
write_imageui( output, pos, pix ); \
} \
";
const char *getErrorString(cl_int error)
{
switch(error){
// run-time and JIT compiler errors
case 0: return "CL_SUCCESS";
case -1: return "CL_DEVICE_NOT_FOUND";
case -2: return "CL_DEVICE_NOT_AVAILABLE";
case -3: return "CL_COMPILER_NOT_AVAILABLE";
case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
case -5: return "CL_OUT_OF_RESOURCES";
case -6: return "CL_OUT_OF_HOST_MEMORY";
case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
case -8: return "CL_MEM_COPY_OVERLAP";
case -9: return "CL_IMAGE_FORMAT_MISMATCH";
case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
case -11: return "CL_BUILD_PROGRAM_FAILURE";
case -12: return "CL_MAP_FAILURE";
case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
case -15: return "CL_COMPILE_PROGRAM_FAILURE";
case -16: return "CL_LINKER_NOT_AVAILABLE";
case -17: return "CL_LINK_PROGRAM_FAILURE";
case -18: return "CL_DEVICE_PARTITION_FAILED";
case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
// compile-time errors
case -30: return "CL_INVALID_VALUE";
case -31: return "CL_INVALID_DEVICE_TYPE";
case -32: return "CL_INVALID_PLATFORM";
case -33: return "CL_INVALID_DEVICE";
case -34: return "CL_INVALID_CONTEXT";
case -35: return "CL_INVALID_QUEUE_PROPERTIES";
case -36: return "CL_INVALID_COMMAND_QUEUE";
case -37: return "CL_INVALID_HOST_PTR";
case -38: return "CL_INVALID_MEM_OBJECT";
case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
case -40: return "CL_INVALID_IMAGE_SIZE";
case -41: return "CL_INVALID_SAMPLER";
case -42: return "CL_INVALID_BINARY";
case -43: return "CL_INVALID_BUILD_OPTIONS";
case -44: return "CL_INVALID_PROGRAM";
case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
case -46: return "CL_INVALID_KERNEL_NAME";
case -47: return "CL_INVALID_KERNEL_DEFINITION";
case -48: return "CL_INVALID_KERNEL";
case -49: return "CL_INVALID_ARG_INDEX";
case -50: return "CL_INVALID_ARG_VALUE";
case -51: return "CL_INVALID_ARG_SIZE";
case -52: return "CL_INVALID_KERNEL_ARGS";
case -53: return "CL_INVALID_WORK_DIMENSION";
case -54: return "CL_INVALID_WORK_GROUP_SIZE";
case -55: return "CL_INVALID_WORK_ITEM_SIZE";
case -56: return "CL_INVALID_GLOBAL_OFFSET";
case -57: return "CL_INVALID_EVENT_WAIT_LIST";
case -58: return "CL_INVALID_EVENT";
case -59: return "CL_INVALID_OPERATION";
case -60: return "CL_INVALID_GL_OBJECT";
case -61: return "CL_INVALID_BUFFER_SIZE";
case -62: return "CL_INVALID_MIP_LEVEL";
case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
case -64: return "CL_INVALID_PROPERTY";
case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
case -66: return "CL_INVALID_COMPILER_OPTIONS";
case -67: return "CL_INVALID_LINKER_OPTIONS";
case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
// extension errors
case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
default: return "Unknown OpenCL error";
}
}
int main()
{
cl_int status;
//Get an OpenCL platform
cl_platform_id cpPlatform;
clGetPlatformIDs(1, &cpPlatform, NULL);
// Get a GPU device
cl_device_id cdDevice;
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
char cBuffer[1024];
clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
printf("CL_DEVICE_NAME: %s\n", cBuffer);
clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
printf("CL_DRIVER_VERSION: %s\n", cBuffer);
cl_uint nComputeUnits;
clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(nComputeUnits), &nComputeUnits, NULL);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %u\n", nComputeUnits);
cl_ulong nLocalMemSize;
clGetDeviceInfo(cdDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(nLocalMemSize), &nLocalMemSize, NULL);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %lu\n", nLocalMemSize );
cl_ulong nGlobalMemSize;
clGetDeviceInfo(cdDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(nGlobalMemSize), &nGlobalMemSize, NULL);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %lu\n", nGlobalMemSize );
size_t nMaxWorkGroupSize;
clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nMaxWorkGroupSize), &nMaxWorkGroupSize, NULL);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %lu\n", nMaxWorkGroupSize );
cl_uint nMaxWorkItemDimesions;
clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(nMaxWorkItemDimesions), &nMaxWorkItemDimesions, NULL);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %lu\n", nMaxWorkItemDimesions );
size_t nMaxWorkItemSizes[3];
clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(nMaxWorkItemSizes), &nMaxWorkItemSizes, NULL);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: %lu %lu %lu\n", nMaxWorkItemSizes[0], nMaxWorkItemSizes[1], nMaxWorkItemSizes[2] );
cl_ulong nGlobalMemCacheSize;
clGetDeviceInfo(cdDevice, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(nGlobalMemCacheSize), &nGlobalMemCacheSize, NULL);
printf("CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: %lu\n", nGlobalMemCacheSize );
cl_uint nGlobalMemCacheLineSize;
clGetDeviceInfo(cdDevice, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(nGlobalMemCacheLineSize), &nGlobalMemCacheLineSize, NULL);
printf("CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: %lu\n", nGlobalMemCacheLineSize );
// Create a context to run OpenCL enabled GPU
cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
// Create a command-queue on the GPU device
cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE , &status );
if(status != CL_SUCCESS) {
printf("clCreateCommandQueue: %s\n", getErrorString(status) );
}
printf("clCreateProgramWithSource:\n");
cl_program program = clCreateProgramWithSource(
GPUContext,
1,
(const char**)&sourceCode,
NULL,
&status
);
if(status != CL_SUCCESS) {
printf("clCreateProgramWithSource: %s\n", getErrorString(status) );
}
status = clBuildProgram(
program,
1, //number of devices
&cdDevice, //list of devices
NULL, //options
NULL, //callback function
NULL //callback function arguments
);
if (status == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
free(log);
}
else if(status != CL_SUCCESS) {
printf("clBuildProgram: %s\n", getErrorString(status) );
}
cl_kernel kernel;
kernel = clCreateKernel(
program,
"Filter", //kernel function name
&status
);
if(status != CL_SUCCESS) {
printf("clCreateKernel: %s\n", getErrorString(status) );
}
cl_image_format imgFormat =
{
.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_UNSIGNED_INT8
};
printf("malloc image\n");
uint32_t* i_pixels = (uint32_t*)malloc( 1280*720*4 );
uint32_t* o_pixels = (uint32_t*)malloc( 1280*720*4 );
memset( i_pixels, 0xFB, sizeof(i_pixels) );
printf("clCreateImage2D\n");
cl_mem i_image = clCreateImage2D(
GPUContext,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, //memory flags
&imgFormat,
1280, 720, 0, //image width, height and pitch
i_pixels,
&status
);
if(status != CL_SUCCESS) {
printf("clCreateImage2D i: %s\n", getErrorString(status) );
}
cl_mem o_image = clCreateImage2D(
GPUContext,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, //memory flags
&imgFormat,
1280, 720, 0, //image width, height and pitch
o_pixels,
&status
);
if(status != CL_SUCCESS) {
printf("clCreateImage2D o: %s\n", getErrorString(status) );
}
// last component for 3D images only
size_t origin[] = { 0, 0, 0 }; //write from first pixel (0,0)
size_t region[] = { 1280, 720, 1 }; //write whole image
printf("clEnqueueWriteImage\n");
status = clEnqueueWriteImage(
cqCommandQueue,
i_image,
CL_TRUE,
origin,
region,
0, // row pitch
0, // slice pitch, must be 0 for 2D
i_pixels, //no mem object->use of host ptr
0,
NULL,
NULL
);
if(status != CL_SUCCESS) {
printf("clEnqueueWriteImage: %s\n", getErrorString(status) );
}
status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &i_image );
if(status != CL_SUCCESS) {
printf("clSetKernelArg 0: %s\n", getErrorString(status) );
}
status = clSetKernelArg( kernel, 1, sizeof(cl_mem), &o_image );
if(status != CL_SUCCESS) {
printf("clSetKernelArg 1: %s\n", getErrorString(status) );
}
size_t global[2] = { 1280, 720 };
size_t local[2] = { 16, 8 }; // this works, but {16,16} did not!?
// all measured timings are identical, even for {1,1}
clock_gettime( CLOCK_REALTIME, &tick );
cl_event event;
status = clEnqueueNDRangeKernel(
cqCommandQueue,
kernel,
2,
NULL,
global,
local,
0,
NULL,
&event
);
clWaitForEvents(1, &event);
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end-time_start;
printf("OpenCl Execution time is: %0.3f ms\n",total_time/1000000.0);
clock_gettime( CLOCK_REALTIME, &tock );
if(status != CL_SUCCESS) {
printf("clEnqueueNDRangeKernel: %s\n", getErrorString(status) );
}
diff_time(&tock,&tick);
status = clEnqueueReadImage(
cqCommandQueue,
o_image,
CL_TRUE,
origin,
region,
0, // row pitch
0, // slice pitch, must be 0 for 2D
o_pixels, //no mem object->use of host ptr
0,
NULL,
&event
);
if(status != CL_SUCCESS) {
printf("clEnqueueReadImage: %s\n", getErrorString(status) );
}
printf("done 0x%02x\n", o_pixels[0] );
return 0;
}