OpenCL - shawfdong/hyades GitHub Wiki

OpenCL (Open Computing Language) is a framework for writing programs that execute across heterogeneous platforms consisting of central processing units (CPUs), graphics processing units (GPUs), digital signal processors (DSPs), field-programmable gate arrays (FPGAs) and other processors. OpenCL includes a language (based on C99) for programming these devices, and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. OpenCL provides parallel computing using task-based and data-based parallelism. On Hyades, OpenCL is supported on all the Type II compute nodes: the Type IIa GPU nodes as well as the Type IIb MIC node Aesyle.

Table of Contents

OpenCL on GPU nodes

Each of the 8 GPU nodes in Hyades contains an Nvidia Tesla K20 GPU Accelerator. As of December 2014, the Nvidia driver only supports OpenCL 1.1 on the GPUs[1][2].

Nvidia places the OpenCL library (but not the headers) in a system location:

$ ls -l /usr/lib64/libOpenCL*
lrwxrwxrwx 1 root root    14 Dec 22 14:39 /usr/lib64/libOpenCL.so -> libOpenCL.so.1
lrwxrwxrwx 1 root root    16 Dec 22 14:39 /usr/lib64/libOpenCL.so.1 -> libOpenCL.so.1.0
lrwxrwxrwx 1 root root    18 Dec 22 14:39 /usr/lib64/libOpenCL.so.1.0 -> libOpenCL.so.1.0.0
-rwxr-xr-x 1 root root 21712 Dec 22 14:39 /usr/lib64/libOpenCL.so.1.0.0

OpenCL on Aesyle

The OpenCL drivers and runtimes from Intel provide OpenCL support for Xeon processors and Xeon Phi coprocessors, among others[3].

Install Intel OpenCL runtime 14.2 on Aesyle:

# wget http://registrationcenter.intel.com/irc_nas/4181/opencl_runtime_14.2_x64_4.5.0.8.tgz
# tar xvfz opencl_runtime_14.2_x64_4.5.0.8.tgz
# cd pset_opencl_runtime_14.1_x64_4.5.0.8
# ./install.sh
Note that the newer OpenCL Runtime 15.1 is CPU-only, whereas 14.2 supports both Xeon processors and Xeon Phi coprocessors. Intel uses alternatives to set the default OpenCL library[4]:
# ls -l /usr/lib64/libOpenCL*
lrwxrwxrwx 1 root root 37 Feb 26 10:06 /usr/lib64/libOpenCL.so -> /etc/alternatives/opencl-libOpenCL.so
lrwxrwxrwx 1 root root 39 Feb 26 10:06 /usr/lib64/libOpenCL.so.1 -> /etc/alternatives/opencl-libOpenCL.so.1
lrwxrwxrwx 1 root root 41 Feb 26 10:06 /usr/lib64/libOpenCL.so.1.2 -> /etc/alternatives/opencl-libOpenCL.so.1.2

Install Intel Code Builder for OpenCL API 2014 R3 on Aesyle[5]:

# wget http://registrationcenter.intel.com/irc_nas/5193/intel_code_builder_for_opencl_2014_4.6.0.178_x64.tgz
# tar xvfz intel_code_builder_for_opencl_2014_4.6.0.178_x64.tgz
# ./install.sh
Intel uses alternatives to set the default OpenCL headers too:
# ls -l /usr/include/CL
lrwxrwxrwx 1 root root 32 Feb 26 10:18 /usr/include/CL -> /etc/alternatives/opencl-headers

OpenCL device query

Here is a simple C program to query OpenCL device capabilities:

#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

int main()
{
  cl_platform_id platforms[100];
  cl_uint platforms_n = 0;
  clGetPlatformIDs(100, platforms, &platforms_n);
  printf("== %d OpenCL platform(s) found: ==\n", platforms_n);
  
  for (int i=0; i<platforms_n; i++)
  {
    char buffer[1024];
    printf("  -- platform %d --\n", i);
    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(buffer), buffer, NULL);
    printf("  PROFILE = %s\n", buffer);
    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(buffer), buffer, NULL);
    printf("  VERSION = %s\n", buffer);
    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(buffer), buffer, NULL);
    printf("  NAME = %s\n", buffer);
    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buffer), buffer, NULL);
    printf("  VENDOR = %s\n", buffer);
    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(buffer), buffer, NULL);
    printf("  EXTENSIONS = %s\n", buffer);

    cl_device_id devices[100];
    cl_uint devices_n = 0;
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 100, devices, &devices_n);
    printf("\n  === %d OpenCL device(s) found on platform %d: ===\n", devices_n, i);

    for (int j=0; j<devices_n; j++)
    {
      cl_uint buf_uint;
      cl_ulong buf_ulong;
      printf("    -- device %d on platform %d --\n", j, i);
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL);
      printf("    DEVICE_NAME = %s\n", buffer);
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL);
      printf("    DEVICE_VENDOR = %s\n", buffer);
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL);
      printf("    DEVICE_VERSION = %s\n", buffer);
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL);
      printf("    DRIVER_VERSION = %s\n", buffer);
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL);
      printf("    DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL);
      printf("    DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
      clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
      printf("    DEVICE_GLOBAL_MEM_SIZE = %lu\n\n", (unsigned long)buf_ulong);
    }

  }
  return 0;
}

iMac

To compile the code on my Mid 2010 27-inch iMac:

$ clang -framework OpenCL cl_DeviceQuery.c -o cl_DeviceQuery.x

OpenCL 1.2 is supported on my iMac:

$ ./cl_DeviceQuery.x 

== 1 OpenCL platform(s) found: ==

  -- platform 0 --
  PROFILE = FULL_PROFILE
  VERSION = OpenCL 1.2 (Jul 29 2014 21:24:39)
  NAME = Apple
  VENDOR = Apple
  EXTENSIONS = cl_APPLE_SetMemObjectDestructor cl_APPLE_ContextLoggingFunctions cl_APPLE_clut cl_APPLE_query_kernel_names cl_APPLE_gl_sharing cl_khr_gl_event

  === 2 OpenCL device(s) found on platform 0: ===
    -- device 0 on platform 0 --
    DEVICE_NAME = Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz
    DEVICE_VENDOR = Intel
    DEVICE_VERSION = OpenCL 1.2 
    DRIVER_VERSION = 1.1
    DEVICE_MAX_COMPUTE_UNITS = 8
    DEVICE_MAX_CLOCK_FREQUENCY = 2930
    DEVICE_GLOBAL_MEM_SIZE = 8589934592

    -- device 1 on platform 0 --
    DEVICE_NAME = ATI Radeon HD 5750
    DEVICE_VENDOR = AMD
    DEVICE_VERSION = OpenCL 1.2 
    DRIVER_VERSION = 1.2 (Aug 17 2014 20:28:00)
    DEVICE_MAX_COMPUTE_UNITS = 10
    DEVICE_MAX_CLOCK_FREQUENCY = 628
    DEVICE_GLOBAL_MEM_SIZE = 1073741824

GPU nodes

To compile the code on the GPU nodes:

$ gcc -std=c99 cl_DeviceQuery.c -o cl_DeviceQuery.gpu -I/pfs/sw/cuda/6.5/include -L/pfs/sw/cuda/6.5/lib64 -lOpenCL

Only OpenCL 1.1 is supported on the GPUs:

$ ./cl_DeviceQuery.gpu 

== 1 OpenCL platform(s) found: ==

  -- platform 0 --
  PROFILE = FULL_PROFILE
  VERSION = OpenCL 1.1 CUDA 6.5.20
  NAME = NVIDIA CUDA
  VENDOR = NVIDIA Corporation
  EXTENSIONS = cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll 

  === 1 OpenCL device(s) found on platform 0: ===
    -- device 0 on platform 0 --
    DEVICE_NAME = Tesla K20m
    DEVICE_VENDOR = NVIDIA Corporation
    DEVICE_VERSION = OpenCL 1.1 CUDA
    DRIVER_VERSION = 340.58
    DEVICE_MAX_COMPUTE_UNITS = 13
    DEVICE_MAX_CLOCK_FREQUENCY = 705
    DEVICE_GLOBAL_MEM_SIZE = 5032706048

Aesyle

To compile the code on the MIC node Aesyle:

$ gcc -std=c99 cl_DeviceQuery.c -o cl_DeviceQuery.x -lOpenCL

OpenCL 1.2 is supported on both the Xeon processors and the Xeon Phi coprocessors[6][7]:

$ ./cl_DeviceQuery.x

== 1 OpenCL platform(s) found: ==

  -- platform 0 --
  PROFILE = FULL_PROFILE
  VERSION = OpenCL 1.2 LINUX
  NAME = Intel(R) OpenCL
  VENDOR = Intel(R) Corporation
  EXTENSIONS = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_spir cl_khr_fp64 

  === 3 OpenCL device(s) found on platform 0: ===
    -- device 0 on platform 0 --
    DEVICE_NAME =       Intel(R) Xeon(R) CPU E5-2630L 0 @ 2.00GHz
    DEVICE_VENDOR = Intel(R) Corporation
    DEVICE_VERSION = OpenCL 1.2 (Build 8)
    DRIVER_VERSION = 1.2.0.8
    DEVICE_MAX_COMPUTE_UNITS = 12
    DEVICE_MAX_CLOCK_FREQUENCY = 2000
    DEVICE_GLOBAL_MEM_SIZE = 67646722048

    -- device 1 on platform 0 --
    DEVICE_NAME = Intel(R) Many Integrated Core Acceleration Card
    DEVICE_VENDOR = Intel(R) Corporation
    DEVICE_VERSION = OpenCL 1.2 (Build 8)
    DRIVER_VERSION = 1.2
    DEVICE_MAX_COMPUTE_UNITS = 236
    DEVICE_MAX_CLOCK_FREQUENCY = 1052
    DEVICE_GLOBAL_MEM_SIZE = 6053646336

    -- device 2 on platform 0 --
    DEVICE_NAME = Intel(R) Many Integrated Core Acceleration Card
    DEVICE_VENDOR = Intel(R) Corporation
    DEVICE_VERSION = OpenCL 1.2 (Build 8)
    DRIVER_VERSION = 1.2
    DEVICE_MAX_COMPUTE_UNITS = 236
    DEVICE_MAX_CLOCK_FREQUENCY = 1052
    DEVICE_GLOBAL_MEM_SIZE = 6053646336

References

  1. ^ OpenCL 1.1 Specification
  2. ^ OpenCL API 1.1 Quick Reference Card
  3. ^ OpenCL Drivers and Runtimes for Intel Architecture
  4. ^ alternatives
  5. ^ Intel Code Builder for OpenCL API
  6. ^ OpenCL 1.2 Specification
  7. ^ OpenCL API 1.2 Quick Reference Card
⚠️ **GitHub.com Fallback** ⚠️