This tutorial shows how you can use your own OpenCL contexts with ViennaCL.
We begin with including the necessary headers:
#include <iostream>
#include <string>
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
Defining a Compute Kernel
In the following we define a custom compute kernel which computes an elementwise product of two vectors.
Input: v1 ... vector
v2 ... vector
Output: result ... vector
Algorithm: set result[i] <- v1[i] * v2[i]
(in MATLAB notation this is 'result = v1 .* v2').
// OpenCL C source for an elementwise vector product:
//   result[i] = vec1[i] * vec2[i]  for all i in [0, size).
// The grid-stride style loop (stepping by get_global_size(0)) lets the kernel
// handle any 'size', even when fewer work-items than elements are launched.
// Note: the kernel closing brace carries no trailing ';' — a stray semicolon at
// program scope is not valid OpenCL C and is rejected by strict compilers.
static const char * my_compute_program =
"__kernel void elementwise_prod(\n"
"          __global const float * vec1,\n"
"          __global const float * vec2, \n"
"          __global float * result,\n"
"          unsigned int size) \n"
"{ \n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"    result[i] = vec1[i] * vec2[i];\n"
"}\n";
With this let us go right to main():
Part 1: Set up a custom context
The following is rather lengthy because OpenCL is a fairly low-level framework. For comparison, the subsequent code explicitly performs the OpenCL setup that is done in the background within the 'custom-kernels' tutorial.
// --- Part 1: manual OpenCL setup for a custom context ---
// NOTE(review): 'pf' (presumably a viennacl::ocl::platform) and 'ScalarType'
// (presumably 'float', to match the kernel signature) are declared outside this
// excerpt — confirm against the full tutorial source.
std::vector<cl_device_id> device_id_array;
std::cout <<
"Platform info: " << pf.
info() << std::endl;
// Query all default-type devices of the platform.
std::vector<viennacl::ocl::device> devices = pf.
devices(CL_DEVICE_TYPE_DEFAULT);
std::cout << devices[0].name() << std::endl;
std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
// Collect the raw cl_device_id handles; clCreateContext() expects a plain array.
for (std::size_t i=0; i<devices.size(); ++i)
{
device_id_array.push_back(devices[i].id());
}
std::cout << "Creating context..." << std::endl;
cl_int err;
// Create the raw OpenCL context over all collected devices.
// NOTE(review): 'err' is never checked after this or any of the following calls.
cl_context my_context = clCreateContext(0, cl_uint(device_id_array.size()), &(device_id_array[0]), NULL, NULL, &err);
// Host-side data: vec1 = [0..9], vec2 = [10..1], result initially zero.
unsigned int vector_size = 10;
std::vector<ScalarType> vec1(vector_size);
std::vector<ScalarType> vec2(vector_size);
std::vector<ScalarType> result(vector_size);
for (unsigned int i=0; i<vector_size; ++i)
{
vec1[i] = static_cast<ScalarType>(i);
vec2[i] = static_cast<ScalarType>(vector_size-i);
}
// Device buffers; CL_MEM_COPY_HOST_PTR copies the host data in at creation time.
cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
// One in-order command queue per device in the context.
std::vector<cl_command_queue> queues(devices.size());
for (std::size_t i=0; i<devices.size(); ++i)
{
queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
}
// Build the program from the kernel source defined above and extract the kernel.
std::size_t source_len = std::string(my_compute_program).length();
cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
const char * kernel_name = "elementwise_prod";
cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
// Bind the three buffers and the element count to the kernel arguments.
err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
// Launch with global size == local size == 10, i.e. a single work-group.
std::size_t global_size = vector_size;
std::size_t local_size = vector_size;
err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
// Blocking reads (CL_TRUE) copy the data back to the host before returning.
err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
// Print operands and the elementwise product.
std::cout << "vec1 : ";
for (std::size_t i=0; i<vec1.size(); ++i)
std::cout << vec1[i] << " ";
std::cout << std::endl;
std::cout << "vec2 : ";
for (std::size_t i=0; i<vec2.size(); ++i)
std::cout << vec2[i] << " ";
std::cout << std::endl;
std::cout << "result: ";
for (std::size_t i=0; i<result.size(); ++i)
std::cout << result[i] << " ";
std::cout << std::endl;
Part 2: Reuse Custom OpenCL Context with ViennaCL
To let ViennaCL reuse the previously created context, we need to make it known to ViennaCL before any ViennaCL objects are created. We inject the custom context as the context with default id '0' when using viennacl::ocl::switch_context().
Check that ViennaCL really uses the new context:
// Print the raw context handle so it can be compared with what ViennaCL reports.
std::cout << "Existing context: " << my_context << std::endl;
Wrap existing OpenCL objects into ViennaCL:
// NOTE(review): vcl_vec1/vcl_vec2/vcl_result/vcl_s are ViennaCL objects wrapping
// the raw OpenCL buffers; their construction is not shown in this excerpt.
std::cout << "Standard vector operations within ViennaCL:" << std::endl;
// result = s * vec1 + vec2, computed through ViennaCL's operator overloads.
vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
std::cout << "vec1 : ";
std::cout << vcl_vec1 << std::endl;
std::cout << "vec2 : ";
std::cout << vcl_vec2 << std::endl;
std::cout << "result: ";
std::cout << vcl_result << std::endl;
We can also reuse the existing elementwise_prod kernel. To do so, we first have to make the existing program known to ViennaCL. For more details on these three lines, see the tutorial 'custom-kernels'.
// Output after re-running the elementwise product through ViennaCL's backend.
// NOTE(review): the code that registers the program/kernel and enqueues it is
// not shown in this excerpt.
std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
std::cout << "vec1 : ";
std::cout << vcl_vec1 << std::endl;
std::cout << "vec2 : ";
std::cout << vcl_vec2 << std::endl;
std::cout << "result: ";
std::cout << vcl_result << std::endl;
Since a linear piece of memory can be interpreted in several ways, we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1. The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
// Shrink both wrapped vectors to length 3 so they are conformal with the
// 3x3 matrix view. Per the note below, resize() reallocates the underlying
// buffer, so the original wrapping of the raw cl_mem handles is lost here.
vcl_vec2.resize(3);
vcl_result.resize(3);
std::cout << "result of matrix-vector product: ";
std::cout << vcl_result << std::endl;
Any further operations can be carried out in the same way. Just keep in mind that any resizing of vectors or matrices leads to a reallocation of the underlying memory buffer, through which the 'wrapper' is lost.
// End of the tutorial's main(); the opening of the function is not visible in
// this excerpt.
std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
return EXIT_SUCCESS;
}
Full Example Code
#include <iostream>
#include <string>
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
// OpenCL C source for an elementwise vector product:
//   result[i] = vec1[i] * vec2[i]  for all i in [0, size).
// The stride of get_global_size(0) lets the kernel cover any 'size', even when
// fewer work-items than elements are launched.
// Note: the kernel closing brace carries no trailing ';' — a stray semicolon at
// program scope is not valid OpenCL C and is rejected by strict compilers.
static const char * my_compute_program =
"__kernel void elementwise_prod(\n"
"          __global const float * vec1,\n"
"          __global const float * vec2, \n"
"          __global float * result,\n"
"          unsigned int size) \n"
"{ \n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"    result[i] = vec1[i] * vec2[i];\n"
"}\n";
// NOTE(review): the enclosing function signature (presumably 'int main()') and
// the declarations of 'ScalarType', 'pf', and the vcl_* wrapper objects were
// lost in extraction of this full-example listing — confirm against the
// original tutorial file.
{
// Part 1: manual OpenCL setup for a custom context.
std::vector<cl_device_id> device_id_array;
std::cout <<
"Platform info: " << pf.
info() << std::endl;
// Query all default-type devices of the platform.
std::vector<viennacl::ocl::device> devices = pf.
devices(CL_DEVICE_TYPE_DEFAULT);
std::cout << devices[0].name() << std::endl;
std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
// Collect raw cl_device_id handles for clCreateContext().
for (std::size_t i=0; i<devices.size(); ++i)
{
device_id_array.push_back(devices[i].id());
}
std::cout << "Creating context..." << std::endl;
cl_int err;
// NOTE(review): 'err' is never checked after this or the following calls.
cl_context my_context = clCreateContext(0, cl_uint(device_id_array.size()), &(device_id_array[0]), NULL, NULL, &err);
// Host-side data: vec1 = [0..9], vec2 = [10..1].
unsigned int vector_size = 10;
std::vector<ScalarType> vec1(vector_size);
std::vector<ScalarType> vec2(vector_size);
std::vector<ScalarType> result(vector_size);
for (unsigned int i=0; i<vector_size; ++i)
{
vec1[i] = static_cast<ScalarType>(i);
vec2[i] = static_cast<ScalarType>(vector_size-i);
}
// Device buffers; CL_MEM_COPY_HOST_PTR copies the host data at creation time.
cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
// One command queue per device in the context.
std::vector<cl_command_queue> queues(devices.size());
for (std::size_t i=0; i<devices.size(); ++i)
{
queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
}
// Build the program from source and extract the 'elementwise_prod' kernel.
std::size_t source_len = std::string(my_compute_program).length();
cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
const char * kernel_name = "elementwise_prod";
cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
// Bind buffers and the element count to the kernel arguments.
err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
// Launch with global size == local size == 10, i.e. a single work-group.
std::size_t global_size = vector_size;
std::size_t local_size = vector_size;
err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
// Blocking reads (CL_TRUE) copy the results back to the host.
err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
// Print operands and the elementwise product.
std::cout << "vec1 : ";
for (std::size_t i=0; i<vec1.size(); ++i)
std::cout << vec1[i] << " ";
std::cout << std::endl;
std::cout << "vec2 : ";
for (std::size_t i=0; i<vec2.size(); ++i)
std::cout << vec2[i] << " ";
std::cout << std::endl;
std::cout << "result: ";
for (std::size_t i=0; i<result.size(); ++i)
std::cout << result[i] << " ";
std::cout << std::endl;
// Part 2: reuse the custom context with ViennaCL.
std::cout << "Existing context: " << my_context << std::endl;
// NOTE(review): vcl_vec1/vcl_vec2/vcl_result/vcl_s wrap the raw OpenCL buffers;
// their construction is not visible in this listing.
std::cout << "Standard vector operations within ViennaCL:" << std::endl;
// result = s * vec1 + vec2 through ViennaCL's operator overloads.
vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
std::cout << "vec1 : ";
std::cout << vcl_vec1 << std::endl;
std::cout << "vec2 : ";
std::cout << vcl_vec2 << std::endl;
std::cout << "result: ";
std::cout << vcl_result << std::endl;
std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
std::cout << "vec1 : ";
std::cout << vcl_vec1 << std::endl;
std::cout << "vec2 : ";
std::cout << vcl_vec2 << std::endl;
std::cout << "result: ";
std::cout << vcl_result << std::endl;
// NOTE(review): the narrated version of this tutorial also resizes vcl_result
// to 3 here; that line appears to be missing from this listing — confirm.
vcl_vec2.resize(3);
std::cout << "result of matrix-vector product: ";
std::cout << vcl_result << std::endl;
std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
return EXIT_SUCCESS;
}