OpenCL - FrankBau/meta-marsboard-bsp GitHub Wiki

According to the i.MX6 Graphics User's Guide iMX6 Vivante GPGPU cores are compatible with OpenCL Embedded Profile version 1.1.

For OpenCL acceleration in some software like OpenCV on MarS Board, full profile is required, and therefore, OpenCL acceleration cannot be enabled for those.

See also https://www.toradex.com/de/blog/experimenting-with-opencl-on-apalis-imx6q-system-on-module.

hello_opencl.cpp:

// see https://community.nxp.com/docs/DOC-93984
// http://www.cmsoft.com.br/opencl-tutorial/case-study-high-performance-convolution-using-opencl-__local-memory/
// http://www.tankonyvtar.hu/hu/tartalom/tamop412A/2011-0063_08_opencl_en/ch08s05.html

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

#include <CL/cl.h>

// time diff in nanoseconds                                                                                                                                    
uint64_t diff_time( struct timespec *end_tv, struct timespec *start_tv )                                                                                       
{                                                                                                                                                              
  uint64_t diff;                                                                                                                                               
  if ( (end_tv->tv_nsec - start_tv->tv_nsec) < 0 ) {                                                                                                           
    diff  = 1000000000 * (end_tv->tv_sec - start_tv->tv_sec - 1);                                                                                              
    diff += 1000000000 + end_tv->tv_nsec - start_tv->tv_nsec;                                                                                                  
  } else {                                                                                                                                                     
    diff  = 1000000000 * (end_tv->tv_sec - start_tv->tv_sec);                                                                                                  
    diff += end_tv->tv_nsec - start_tv->tv_nsec;                                                                                                               
  }                                                                                                                                                            
  printf( "diff=%12.6f ms\n", diff/1000000.0 );                                                                                                                
  return diff;                                                                                                                                                 
}                                                                                                                                                              
                                                                                                                                                               
struct timespec tick;                                                                                                                                          
struct timespec tock;   

const char *sourceCode = 
" \
__kernel void Filter ( \
    __read_only image2d_t input, \
    __write_only image2d_t output) \
{ \
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |  \
                          CLK_ADDRESS_CLAMP_TO_EDGE |  \
                          CLK_FILTER_NEAREST;          \
  const int2 pos = {get_global_id(0), get_global_id(1)}; \
  const uint4 pix = read_imageui( input, smp, pos ); \
  write_imageui( output, pos, pix ); \
} \
";

const char *getErrorString(cl_int error)
{
switch(error){
    // run-time and JIT compiler errors
    case 0: return "CL_SUCCESS";
    case -1: return "CL_DEVICE_NOT_FOUND";
    case -2: return "CL_DEVICE_NOT_AVAILABLE";
    case -3: return "CL_COMPILER_NOT_AVAILABLE";
    case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    case -5: return "CL_OUT_OF_RESOURCES";
    case -6: return "CL_OUT_OF_HOST_MEMORY";
    case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
    case -8: return "CL_MEM_COPY_OVERLAP";
    case -9: return "CL_IMAGE_FORMAT_MISMATCH";
    case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
    case -11: return "CL_BUILD_PROGRAM_FAILURE";
    case -12: return "CL_MAP_FAILURE";
    case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
    case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
    case -15: return "CL_COMPILE_PROGRAM_FAILURE";
    case -16: return "CL_LINKER_NOT_AVAILABLE";
    case -17: return "CL_LINK_PROGRAM_FAILURE";
    case -18: return "CL_DEVICE_PARTITION_FAILED";
    case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";

    // compile-time errors
    case -30: return "CL_INVALID_VALUE";
    case -31: return "CL_INVALID_DEVICE_TYPE";
    case -32: return "CL_INVALID_PLATFORM";
    case -33: return "CL_INVALID_DEVICE";
    case -34: return "CL_INVALID_CONTEXT";
    case -35: return "CL_INVALID_QUEUE_PROPERTIES";
    case -36: return "CL_INVALID_COMMAND_QUEUE";
    case -37: return "CL_INVALID_HOST_PTR";
    case -38: return "CL_INVALID_MEM_OBJECT";
    case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
    case -40: return "CL_INVALID_IMAGE_SIZE";
    case -41: return "CL_INVALID_SAMPLER";
    case -42: return "CL_INVALID_BINARY";
    case -43: return "CL_INVALID_BUILD_OPTIONS";
    case -44: return "CL_INVALID_PROGRAM";
    case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
    case -46: return "CL_INVALID_KERNEL_NAME";
    case -47: return "CL_INVALID_KERNEL_DEFINITION";
    case -48: return "CL_INVALID_KERNEL";
    case -49: return "CL_INVALID_ARG_INDEX";
    case -50: return "CL_INVALID_ARG_VALUE";
    case -51: return "CL_INVALID_ARG_SIZE";
    case -52: return "CL_INVALID_KERNEL_ARGS";
    case -53: return "CL_INVALID_WORK_DIMENSION";
    case -54: return "CL_INVALID_WORK_GROUP_SIZE";
    case -55: return "CL_INVALID_WORK_ITEM_SIZE";
    case -56: return "CL_INVALID_GLOBAL_OFFSET";
    case -57: return "CL_INVALID_EVENT_WAIT_LIST";
    case -58: return "CL_INVALID_EVENT";
    case -59: return "CL_INVALID_OPERATION";
    case -60: return "CL_INVALID_GL_OBJECT";
    case -61: return "CL_INVALID_BUFFER_SIZE";
    case -62: return "CL_INVALID_MIP_LEVEL";
    case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
    case -64: return "CL_INVALID_PROPERTY";
    case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
    case -66: return "CL_INVALID_COMPILER_OPTIONS";
    case -67: return "CL_INVALID_LINKER_OPTIONS";
    case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";

    // extension errors
    case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
    case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
    case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
    case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
    case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
    case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
    default: return "Unknown OpenCL error";
    }
}

int main()
{
     cl_int status;

     //Get an OpenCL platform
     cl_platform_id cpPlatform;
     clGetPlatformIDs(1, &cpPlatform, NULL);

     // Get a GPU device
     cl_device_id cdDevice;
     clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

     char cBuffer[1024];
     clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
     printf("CL_DEVICE_NAME: %s\n", cBuffer);

     clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
     printf("CL_DRIVER_VERSION: %s\n", cBuffer);

     cl_uint nComputeUnits;
     clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(nComputeUnits), &nComputeUnits, NULL);
     printf("CL_DEVICE_MAX_COMPUTE_UNITS: %u\n", nComputeUnits);

     cl_ulong nLocalMemSize;
     clGetDeviceInfo(cdDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(nLocalMemSize), &nLocalMemSize, NULL);
     printf("CL_DEVICE_LOCAL_MEM_SIZE: %lu\n", nLocalMemSize );

     cl_ulong nGlobalMemSize;
     clGetDeviceInfo(cdDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(nGlobalMemSize), &nGlobalMemSize, NULL);
     printf("CL_DEVICE_GLOBAL_MEM_SIZE: %lu\n", nGlobalMemSize );

     size_t nMaxWorkGroupSize;
     clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nMaxWorkGroupSize), &nMaxWorkGroupSize, NULL);
     printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %lu\n", nMaxWorkGroupSize );

     cl_uint nMaxWorkItemDimesions;
     clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(nMaxWorkItemDimesions), &nMaxWorkItemDimesions, NULL);
     printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %lu\n", nMaxWorkItemDimesions );

     size_t nMaxWorkItemSizes[3];
     clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(nMaxWorkItemSizes), &nMaxWorkItemSizes, NULL);
     printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: %lu %lu %lu\n", nMaxWorkItemSizes[0], nMaxWorkItemSizes[1], nMaxWorkItemSizes[2] );

     cl_ulong nGlobalMemCacheSize;
     clGetDeviceInfo(cdDevice, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(nGlobalMemCacheSize), &nGlobalMemCacheSize, NULL);
     printf("CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: %lu\n", nGlobalMemCacheSize );

     cl_uint nGlobalMemCacheLineSize;
     clGetDeviceInfo(cdDevice, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(nGlobalMemCacheLineSize), &nGlobalMemCacheLineSize, NULL);
     printf("CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: %lu\n", nGlobalMemCacheLineSize );

     // Create a context to run OpenCL enabled GPU
     cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

     // Create a command-queue on the GPU device
     cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE , &status );
     if(status != CL_SUCCESS) {
       printf("clCreateCommandQueue: %s\n", getErrorString(status) );
     }

printf("clCreateProgramWithSource:\n");

     cl_program program = clCreateProgramWithSource(
       GPUContext,
       1,
       (const char**)&sourceCode,
       NULL,
       &status 
     );
     if(status != CL_SUCCESS) {
       printf("clCreateProgramWithSource: %s\n", getErrorString(status) );
     }

     status = clBuildProgram(
       program,
       1, //number of devices
       &cdDevice, //list of devices
       NULL, //options
       NULL, //callback function
       NULL //callback function arguments
     );
     if (status == CL_BUILD_PROGRAM_FAILURE) {
        // Determine the size of the log
        size_t log_size;
        clGetProgramBuildInfo(program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

       // Allocate memory for the log
        char *log = (char *) malloc(log_size);

        // Get the log
        clGetProgramBuildInfo(program, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

        // Print the log
        printf("%s\n", log);

        free(log);
     }
     else if(status != CL_SUCCESS) {
       printf("clBuildProgram: %s\n", getErrorString(status) );
     }

     cl_kernel kernel;

     kernel = clCreateKernel(
       program,
       "Filter", //kernel function name
       &status
     );
     if(status != CL_SUCCESS) {
       printf("clCreateKernel: %s\n", getErrorString(status) );
     }

     cl_image_format imgFormat =
     {
         .image_channel_order = CL_RGBA,
         .image_channel_data_type = CL_UNSIGNED_INT8
     };

printf("malloc image\n");

     uint32_t* i_pixels = (uint32_t*)malloc( 1280*720*4 );
     uint32_t* o_pixels = (uint32_t*)malloc( 1280*720*4 );

     memset( i_pixels, 0xFB, sizeof(i_pixels) );

printf("clCreateImage2D\n");

     cl_mem i_image = clCreateImage2D(
         GPUContext,
         CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, //memory flags
         &imgFormat,
         1280, 720, 0, //image width, height and pitch
         i_pixels,
         &status
     );
     if(status != CL_SUCCESS) {
       printf("clCreateImage2D i: %s\n", getErrorString(status) );
     }

     cl_mem o_image = clCreateImage2D(
         GPUContext,
         CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, //memory flags
         &imgFormat,
         1280, 720, 0, //image width, height and pitch
         o_pixels,
         &status
     );
     if(status != CL_SUCCESS) {
       printf("clCreateImage2D o: %s\n", getErrorString(status) );
     }

     // last component for 3D images only
     size_t origin[] = { 0, 0, 0 }; //write from first pixel (0,0)
     size_t region[] = { 1280, 720, 1 }; //write whole image

printf("clEnqueueWriteImage\n");

     status = clEnqueueWriteImage(
       cqCommandQueue,
       i_image,
       CL_TRUE,
       origin,
       region,
       0, // row pitch
       0, // slice pitch, must be 0 for 2D
       i_pixels, //no mem object->use of host ptr
       0,
       NULL,
       NULL
     );
     if(status != CL_SUCCESS) {
       printf("clEnqueueWriteImage: %s\n", getErrorString(status) );
     }

     status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &i_image );
     if(status != CL_SUCCESS) {
       printf("clSetKernelArg 0: %s\n", getErrorString(status) );
     }
     status = clSetKernelArg( kernel, 1, sizeof(cl_mem), &o_image );
     if(status != CL_SUCCESS) {
       printf("clSetKernelArg 1: %s\n", getErrorString(status) );
     }

     size_t global[2] = { 1280, 720 };
     size_t local[2]  = { 16, 8 }; // this works, but {16,16} did not!? 
     // all measured timings are identical, even for {1,1}
     clock_gettime( CLOCK_REALTIME, &tick );

     cl_event event;
     status = clEnqueueNDRangeKernel( 
       cqCommandQueue, 
       kernel, 
       2, 
       NULL, 
       global, 
       local, 
       0, 
       NULL, 
       &event
     );
     clWaitForEvents(1, &event);
cl_ulong time_start, time_end;
 double total_time;
 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
 total_time = time_end-time_start;
 printf("OpenCl Execution time is: %0.3f ms\n",total_time/1000000.0);

     clock_gettime( CLOCK_REALTIME, &tock );  
     if(status != CL_SUCCESS) {
       printf("clEnqueueNDRangeKernel: %s\n", getErrorString(status) );
     }
     diff_time(&tock,&tick);

     status = clEnqueueReadImage(
       cqCommandQueue,
       o_image,
       CL_TRUE,
       origin,
       region,
       0, // row pitch
       0, // slice pitch, must be 0 for 2D
       o_pixels, //no mem object->use of host ptr
       0,
       NULL,
       &event
     );
     if(status != CL_SUCCESS) {
       printf("clEnqueueReadImage: %s\n", getErrorString(status) );
     }

printf("done 0x%02x\n", o_pixels[0] );

     return 0;
}
⚠️ **GitHub.com Fallback** ⚠️