OpenCL - shawfdong/hyades GitHub Wiki
OpenCL (Open Computing Language) is a framework for writing programs that execute across heterogeneous platforms consisting of central processing units (CPUs), graphics processing units (GPUs), digital signal processors (DSPs), field-programmable gate arrays (FPGAs) and other processors. OpenCL includes a language (based on C99) for programming these devices, and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. OpenCL provides parallel computing using task-based and data-based parallelism. On Hyades, OpenCL is supported on all the Type II compute nodes: the Type IIa GPU nodes as well as the Type IIb MIC node Aesyle.
Each of the 8 GPU nodes in Hyades contains an Nvidia Tesla K20 GPU Accelerator. As of December 2014, the Nvidia driver only supports OpenCL 1.1 on the GPUs[1][2].
Nvidia places the OpenCL library (but not the headers) in a system location:
$ ls -l /usr/lib64/libOpenCL* lrwxrwxrwx 1 root root 14 Dec 22 14:39 /usr/lib64/libOpenCL.so -> libOpenCL.so.1 lrwxrwxrwx 1 root root 16 Dec 22 14:39 /usr/lib64/libOpenCL.so.1 -> libOpenCL.so.1.0 lrwxrwxrwx 1 root root 18 Dec 22 14:39 /usr/lib64/libOpenCL.so.1.0 -> libOpenCL.so.1.0.0 -rwxr-xr-x 1 root root 21712 Dec 22 14:39 /usr/lib64/libOpenCL.so.1.0.0
The OpenCL drivers and runtimes from Intel provide OpenCL support for Xeon processors and Xeon Phi coprocessors, among others[3].
Install Intel OpenCL runtime 14.2 on Aesyle:
# wget http://registrationcenter.intel.com/irc_nas/4181/opencl_runtime_14.2_x64_4.5.0.8.tgz
# tar xvfz opencl_runtime_14.2_x64_4.5.0.8.tgz
# cd pset_opencl_runtime_14.1_x64_4.5.0.8
# ./install.sh
Note that the newer OpenCL Runtime 15.1 is CPU-only, whereas 14.2 supports both Xeon processors and Xeon Phi coprocessors. Intel uses alternatives to set the default OpenCL library[4]:
# ls -l /usr/lib64/libOpenCL* lrwxrwxrwx 1 root root 37 Feb 26 10:06 /usr/lib64/libOpenCL.so -> /etc/alternatives/opencl-libOpenCL.so lrwxrwxrwx 1 root root 39 Feb 26 10:06 /usr/lib64/libOpenCL.so.1 -> /etc/alternatives/opencl-libOpenCL.so.1 lrwxrwxrwx 1 root root 41 Feb 26 10:06 /usr/lib64/libOpenCL.so.1.2 -> /etc/alternatives/opencl-libOpenCL.so.1.2
Install Intel Code Builder for OpenCL API 2014 R3 on Aesyle[5]:
# wget http://registrationcenter.intel.com/irc_nas/5193/intel_code_builder_for_opencl_2014_4.6.0.178_x64.tgz
# tar xvfz intel_code_builder_for_opencl_2014_4.6.0.178_x64.tgz
# ./install.sh
Intel uses alternatives to set the default for the OpenCL headers too:
# ls -l /usr/include/CL lrwxrwxrwx 1 root root 32 Feb 26 10:18 /usr/include/CL -> /etc/alternatives/opencl-headers
Here is a simple C program to query OpenCL device capabilities:
#include <stdio.h> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif int main() { cl_platform_id platforms[100]; cl_uint platforms_n = 0; clGetPlatformIDs(100, platforms, &platforms_n); printf("== %d OpenCL platform(s) found: ==\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[1024]; printf(" -- platform %d --\n", i); clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(buffer), buffer, NULL); printf(" PROFILE = %s\n", buffer); clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(buffer), buffer, NULL); printf(" VERSION = %s\n", buffer); clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(buffer), buffer, NULL); printf(" NAME = %s\n", buffer); clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buffer), buffer, NULL); printf(" VENDOR = %s\n", buffer); clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(buffer), buffer, NULL); printf(" EXTENSIONS = %s\n", buffer); cl_device_id devices[100]; cl_uint devices_n = 0; clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 100, devices, &devices_n); printf("\n === %d OpenCL device(s) found on platform %d: ===\n", devices_n, i); for (int j=0; j<devices_n; j++) { cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- device %d on platform %d --\n", j, i); clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); printf(" DEVICE_NAME = %s\n", buffer); clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL); printf(" DEVICE_VENDOR = %s\n", buffer); clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL); printf(" DEVICE_VERSION = %s\n", buffer); clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL); printf(" DRIVER_VERSION = %s\n", buffer); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), 
&buf_uint, NULL); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); printf(" DEVICE_GLOBAL_MEM_SIZE = %lu\n\n", (unsigned long)buf_ulong); } } return 0; }
To compile the code on my Mid 2010 27-inch iMac:
$ clang -framework OpenCL cl_DeviceQuery.c -o cl_DeviceQuery.x
OpenCL 1.2 is supported on my iMac:
$ ./cl_DeviceQuery.x == 1 OpenCL platform(s) found: == -- platform 0 -- PROFILE = FULL_PROFILE VERSION = OpenCL 1.2 (Jul 29 2014 21:24:39) NAME = Apple VENDOR = Apple EXTENSIONS = cl_APPLE_SetMemObjectDestructor cl_APPLE_ContextLoggingFunctions cl_APPLE_clut cl_APPLE_query_kernel_names cl_APPLE_gl_sharing cl_khr_gl_event === 2 OpenCL device(s) found on platform 0: === -- device 0 on platform 0 -- DEVICE_NAME = Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz DEVICE_VENDOR = Intel DEVICE_VERSION = OpenCL 1.2 DRIVER_VERSION = 1.1 DEVICE_MAX_COMPUTE_UNITS = 8 DEVICE_MAX_CLOCK_FREQUENCY = 2930 DEVICE_GLOBAL_MEM_SIZE = 8589934592 -- device 1 on platform 0 -- DEVICE_NAME = ATI Radeon HD 5750 DEVICE_VENDOR = AMD DEVICE_VERSION = OpenCL 1.2 DRIVER_VERSION = 1.2 (Aug 17 2014 20:28:00) DEVICE_MAX_COMPUTE_UNITS = 10 DEVICE_MAX_CLOCK_FREQUENCY = 628 DEVICE_GLOBAL_MEM_SIZE = 1073741824
To compile the code on the GPU nodes:
$ gcc -std=c99 cl_DeviceQuery.c -o cl_DeviceQuery.gpu -I/pfs/sw/cuda/6.5/include -L/pfs/sw/cuda/6.5/lib64 -lOpenCL
Only OpenCL 1.1 is supported on the GPUs:
$ ./cl_DeviceQuery.gpu == 1 OpenCL platform(s) found: == -- platform 0 -- PROFILE = FULL_PROFILE VERSION = OpenCL 1.1 CUDA 6.5.20 NAME = NVIDIA CUDA VENDOR = NVIDIA Corporation EXTENSIONS = cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll === 1 OpenCL device(s) found on platform 0: === -- device 0 on platform 0 -- DEVICE_NAME = Tesla K20m DEVICE_VENDOR = NVIDIA Corporation DEVICE_VERSION = OpenCL 1.1 CUDA DRIVER_VERSION = 340.58 DEVICE_MAX_COMPUTE_UNITS = 13 DEVICE_MAX_CLOCK_FREQUENCY = 705 DEVICE_GLOBAL_MEM_SIZE = 5032706048
To compile the code on the MIC node Aesyle:
$ gcc -std=c99 cl_DeviceQuery.c -o cl_DeviceQuery.x -lOpenCL
OpenCL 1.2 is supported on both the Xeon processors and the Xeon Phi coprocessors[6][7]:
$ ./cl_DeviceQuery.x == 1 OpenCL platform(s) found: == -- platform 0 -- PROFILE = FULL_PROFILE VERSION = OpenCL 1.2 LINUX NAME = Intel(R) OpenCL VENDOR = Intel(R) Corporation EXTENSIONS = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_spir cl_khr_fp64 === 3 OpenCL device(s) found on platform 0: === -- device 0 on platform 0 -- DEVICE_NAME = Intel(R) Xeon(R) CPU E5-2630L 0 @ 2.00GHz DEVICE_VENDOR = Intel(R) Corporation DEVICE_VERSION = OpenCL 1.2 (Build 8) DRIVER_VERSION = 1.2.0.8 DEVICE_MAX_COMPUTE_UNITS = 12 DEVICE_MAX_CLOCK_FREQUENCY = 2000 DEVICE_GLOBAL_MEM_SIZE = 67646722048 -- device 1 on platform 0 -- DEVICE_NAME = Intel(R) Many Integrated Core Acceleration Card DEVICE_VENDOR = Intel(R) Corporation DEVICE_VERSION = OpenCL 1.2 (Build 8) DRIVER_VERSION = 1.2 DEVICE_MAX_COMPUTE_UNITS = 236 DEVICE_MAX_CLOCK_FREQUENCY = 1052 DEVICE_GLOBAL_MEM_SIZE = 6053646336 -- device 2 on platform 0 -- DEVICE_NAME = Intel(R) Many Integrated Core Acceleration Card DEVICE_VENDOR = Intel(R) Corporation DEVICE_VERSION = OpenCL 1.2 (Build 8) DRIVER_VERSION = 1.2 DEVICE_MAX_COMPUTE_UNITS = 236 DEVICE_MAX_CLOCK_FREQUENCY = 1052 DEVICE_GLOBAL_MEM_SIZE = 6053646336