Running OpenCL using Visual Studio 2015 - c

I'm a newbie in OpenCL, so far I referred Dr. Dobbs tutorials for OpenCL and few others and ran it on Ubuntu which worked very well but those same codes won't/refuse to work on windows using Visual Studio with all required Environment Variables set correctly.
I'm using 980M with CUDA SDK 8 on vs 2015. I have two files, one in C and another is a kernel(cl) file. Whenever I add both .c & .cl files, the program refuses to run by throwing errors like Can't find program files and things like that. However, if I write the kernel file within C file, it works sometimes say 1 out of 3. The same program works fine on my PC running Ubuntu 16 and another PC with AMD card running on Ubuntu 16.
The program I'm trying to run is vector addition written in C.
I've attached the link to the code.
OpenCL vector addition
add_numbers.c
#define PROGRAM_FILE "add_numbers.cl"
#define KERNEL_FUNC "vecAdd"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __linux
#include <unistd.h>
#include <fcntl.h>
#endif // __linux
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if(err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_device_type dev_type; //new
// cl_int i, j, err;
size_t local_size, global_size;
// vector add pgm
// Length of vectors
unsigned int n = 1000000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double *)malloc(bytes);
h_b = (double *)malloc(bytes);
h_c = (double *)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
/*h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);*/
h_a[i] = i+1;
h_b[i] = i+2;
}
// size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
local_size = 64;
// Number of total work items - localSize must be devisor
global_size = (n/local_size)*local_size;
/* Create device and context */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
input_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), sum, &err);
if(err < 0) {
perror("Couldn't create a buffer");
exit(1);
};*/
//input buffers
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
/* Create a command queue */
queue = clCreateCommandQueue(context, device, 0, &err);
if(err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
/* Create a kernel */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
// Newer args under test
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
if(err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size,
&local_size, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Read the kernel's output */
err = clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
if(err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
//cl_device_type dev_type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
if (dev_type == CL_DEVICE_TYPE_GPU) {
puts("I'm 100% sure this device is a GPU");
}
else
puts("Device is CPU\n");
// Sum calc.
double sum = 0;
for(i=0; i<n; i++)
//sum += h_c[i];
sum = h_a[i] + h_b[i];
printf("final result: %lf\n",(sum/n));
/* Deallocate resources */
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
add_number.cl kernel file
__kernel void vecAdd( __global double *a,
__global double *b,
__global double *c,
const unsigned int n)
{
#pragma OPENCL EXTENSION cl_khr_fp64 :enable
//Get our global thread ID
int id = get_global_id(0);
//Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}

Related

In OpenCl, multiple gpu is slower than single gpu. How can I make faster?

I made vector addition kernel and run it in the single gpu and multiple gpu.
However in Multi gpu case is much slower than single gpu in the same length of vector addition.
The structure of my code is one context one kernel and multi queues which has same number of devices..
How can I modify for faster in multi gpu case?
The code is below
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <CL/cl.h>
#include <math.h>
//#define VECTOR_SIZE 640000
//#define LOCAL_SIZE 64
#define CHECK_ERROR(err) \
if (err != CL_SUCCESS) { \
printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, err); \
exit(EXIT_FAILURE); \
}
double get_time() {
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)1e-6 * tv.tv_usec;
}
char *get_source_code(const char *file_name, size_t *len) {
char *source_code;
size_t length;
FILE *file = fopen(file_name, "r");
if (file == NULL) {
printf("[%s:%d] Failed to open %s\n", __FILE__, __LINE__, file_name);
exit(EXIT_FAILURE);
}
fseek(file, 0, SEEK_END);
length = (size_t)ftell(file);
rewind(file);
source_code = (char *)malloc(length + 1);
fread(source_code, length, 1, file);
source_code[length] = '\0';
fclose(file);
*len = length;
return source_code;
}
int main() {
// OpenCl variables
cl_platform_id platform;
//cl_device_id device;
cl_device_id *devices;
cl_device_id device_temp;
cl_context context;
//cl_command_queue queue;
cl_command_queue *queues;
cl_mem bufferA, bufferB, bufferC;
cl_program program;
char *kernel_source;
size_t kernel_source_size;
cl_kernel kernel;
//cl_kernel *kernels;
cl_int err;
//
size_t VECTOR_SIZE = 64000000 ;
int num_devices = 4;
size_t LOCAL_SIZE = 64;
// Time variables
double start;
double end;
// Get platform
err = clGetPlatformIDs(1, &platform, NULL);
CHECK_ERROR(err);
// Get GPU device
devices = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
//err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
CHECK_ERROR(err);
// Create context
context = clCreateContext(NULL,num_devices, devices , NULL, NULL , &err);
//context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CHECK_ERROR(err);
// Get kernel code
kernel_source = get_source_code("kernel.cl", &kernel_source_size);
// Create program
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source,
&kernel_source_size, &err);
CHECK_ERROR(err);
// Build program
err = clBuildProgram(program, num_devices, devices, "", NULL, NULL);
if(err == CL_BUILD_PROGRAM_FAILURE) {
size_t log_size;
char *log;
// Get program build
//err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
// 0, NULL, &log_size);
err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
CHECK_ERROR(err);
// Get build log
log = (char*)malloc(log_size + 1);
//err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
// log_size, log, NULL);
err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,log,NULL);
CHECK_ERROR(err);
log[log_size] = '\0';
printf("Compiler error : \n%s\n", log);
free(log);
exit(0);
}
CHECK_ERROR(err);
// Create Vector A, B, C
float *A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
float *B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
float *C = (float*)malloc(sizeof(float) * VECTOR_SIZE);
// Initial Vector A, B
//cl_ushort idx;
/*for(idx = 0; idx < VECTOR_SIZE; idx++) {
A[idx] = rand() % 100;
B[idx] = rand() % 100;
}*/
printf("start\n");
start = get_time();
for(int i = 0; i <VECTOR_SIZE; i++){
A[i] = sinf(i)*sinf(i);
B[i] = cosf(i)*cosf(i);
}
end = get_time();
printf("Initialization time : %f seconds elapsed\n", end-start);
// Create kernel
/*kernels = (cl_kernel *) malloc(sizeof(cl_kernel)*num_devices);
for(int i=0; i<num_devices; i++){
kernels[i] = clCreateKernel(program,"vec_add", &err);
CHECK_ERROR(err);
}*/
kernel = clCreateKernel(program, "vec_add", &err);
CHECK_ERROR(err);
// Create Buffer
bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
CHECK_ERROR(err);
bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
CHECK_ERROR(err);
bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
CHECK_ERROR(err);
printf("error hi\n");
// Create command-queue
queues = (cl_command_queue *) malloc(sizeof(cl_command_queue)*num_devices);
for(int i=0; i<num_devices; i++){
if (i==0){
queues[i] = clCreateCommandQueue(context,devices[i],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
CHECK_ERROR(err);
}
else{
queues[i] = clCreateCommandQueue(context,devices[i], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
CHECK_ERROR(err);
}
}
printf("error bye\n");
//queue = clCreateCommandQueue(context, device, 0, &err);
//CHECK_ERROR(err);
// Write Buffer
for (int i = 0; i<num_devices; i++){
err = clEnqueueWriteBuffer(queues[i],bufferA,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,A,0,NULL,NULL);
CHECK_ERROR(err);
err = clEnqueueWriteBuffer(queues[i],bufferB,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,B,0,NULL,NULL);
CHECK_ERROR(err);
}
//err = clEnqueueWriteBuffer(queue, bufferA, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, A, 0, NULL, NULL);
//CHECK_ERROR(err);
//err = clEnqueueWriteBuffer(queue, bufferB, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, B, 0, NULL, NULL);
//CHECK_ERROR(err);
for(int i=0; i<num_devices; i++){
err=clFinish(queues[i]);
CHECK_ERROR(err);
}
// Set Kernel arguments
start = get_time();
/*for(int i=0; i<num_devices; i++){
err=clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &bufferA);
CHECK_ERROR(err);
err=clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &bufferB);
CHECK_ERROR(err);
err=clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &bufferC);
CHECK_ERROR(err);
err=clSetKernelArg(kernels[i], 3, sizeof(unsigned int), &VECTOR_SIZE);
CHECK_ERROR(err);
}*/
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &VECTOR_SIZE);
CHECK_ERROR(err);
end = get_time();
printf("Send Vector A, B to GPU : %f seconds elapsed\n", end - start);
for(int i=0; i<num_devices; i++){
err=clFinish(queues[i]);
CHECK_ERROR(err);
}
cl_event ooo_events[num_devices];
start = get_time();
// Execute Kernel
size_t global_size = VECTOR_SIZE;
size_t local_size = LOCAL_SIZE;
for(int i=0; i<num_devices; i++){
//start=get_time();
err= clEnqueueNDRangeKernel(queues[i],kernel,1,NULL,&global_size,&local_size,0,NULL,NULL);
CHECK_ERROR(err);
//err = clEnqueueNDRangeKernel(queues[i],kernels[i],1,NULL,&global_size, &local_size,0,NULL,NULL);
//CHECK_ERROR(err);
//end=get_time();
//printf("Calculate C : %f seconds elapsed\n", end-start);
}
//err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,&global_size, &local_size, 0, NULL, NULL);
//CHECK_ERROR(err);
for(int i=0; i<num_devices; i++){
err=clFinish(queues[i]);
CHECK_ERROR(err);
}
end = get_time();
printf("Calculate C : %f seconds elapsed\n", end - start);
// Read Buffer
start = get_time();
for(int i=0; i<num_devices; i++){
err = clEnqueueReadBuffer(queues[i],bufferC,CL_TRUE,0,sizeof(float)*VECTOR_SIZE,C,0,NULL,NULL);
CHECK_ERROR(err);
}
//err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(float) * VECTOR_SIZE, C, 0, NULL, NULL);
//CHECK_ERROR(err);
end = get_time();
printf("Receive C from GPU : %f seconds elapsed\n", end - start);
// Evaluate Vector C
start = get_time();
double sum = 0;
for(int i = 0; i < VECTOR_SIZE; i++) {
sum += C[i];
}
end = get_time();
printf("Verification time : %f seconds elapsed\n", end-start);
printf("%lf, %ld \n", sum,VECTOR_SIZE);
if (abs(VECTOR_SIZE - sum) < 1) {
printf("Verification success!\n");
}
printf("Sum : %f\n", sum);
// Release OpenCL object
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
free(A);
free(B);
free(C);
clReleaseKernel(kernel);
//clReleaseKernel(kernels[0]);
//clReleaseKernel(kernels[1]);
clReleaseProgram(program);
clReleaseCommandQueue(queues[0]);
clReleaseCommandQueue(queues[1]);
//clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
Using multiple GPUs is only beneficial in terms of performance if the amount of computational work that each GPU performs takes more time then the communication, scheduling and synchronization overhead. This is true for a single GPU as well.
In your case, each GPU performs a simple vector addition. but that rarely takes more time then transferring the data to the GPU, waiting for the kernel to actually get scheduled for execution, etc.
Your code is not measuring the total kernel execution time but also the scheduling overhead.
I would advise you to use proper GPU profiling tools (depending on your GPU vendor) instead of manual CPU timings to properly examine what is going on. You can also try measuring kernel execution time via events.

OpenCL clCreateBuffer() crashes the program

I have taken up OpenCL programming as part of a university project and I have a bit of a problem when I try to input data to a buffer object during the clCreateBuffer() routine.
The program is a simple two-dimensional matrix addition. The code is as follows:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "add_kernel.cl"
#define ADD_FUNC "add_matrix"
#define MATRIX_DIM 256
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a GPU */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if(err < 0) {
perror("Couldn't access any GPU type");
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if(err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main(){
/* Host/device data structures */
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel add_kernel;
size_t global_size;
cl_ulong mem_size;
cl_int i, j, err, check;
/* Data and buffers */
cl_uint matrix_dim;
float a_mat[MATRIX_DIM][MATRIX_DIM], b_mat[MATRIX_DIM][MATRIX_DIM],
c_mat[MATRIX_DIM][MATRIX_DIM], check_mat[MATRIX_DIM][MATRIX_DIM];
cl_mem a_buffer, b_buffer, c_buffer;
/* Initialize A, B, and check matrices */
srand((unsigned int)time(0));
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
a_mat[i][j] = (float)rand()/RAND_MAX;
}
}
srand((unsigned int)time(0));
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
b_mat[i][j] = (float)rand()/RAND_MAX;
check_mat[i][j] = 0.0f;
}
}
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
check_mat[i][j] += a_mat[i][j] + b_mat[i][j];
}
}
/* Create a device and context */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build the program */
program = build_program(context, device, PROGRAM_FILE);
add_kernel = clCreateKernel(program, ADD_FUNC, &err);
if(err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create buffers */
a_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(a_mat), a_mat, &err);
if(err < 0) {
perror("Couldn't create buffer A");
exit(1);
};
b_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(b_mat), b_mat, &err);
if(err < 0) {
perror("Couldn't create buffer B");
exit(1);
};
c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(c_mat), NULL, &err);
if(err < 0) {
perror("Couldn't create buffer C");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device, 0, &err);
if(err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
/* Create arguments for multiplication kernel */
err = clSetKernelArg(add_kernel, 0, sizeof(a_buffer), &a_buffer);
err |= clSetKernelArg(add_kernel, 1, sizeof(b_buffer), &b_buffer);
err |= clSetKernelArg(add_kernel, 2, sizeof(c_buffer), &c_buffer);
global_size = MATRIX_DIM * MATRIX_DIM;
//printf("%lu\n", global_size);
err = clEnqueueNDRangeKernel(queue, add_kernel, 1, NULL, &global_size,
NULL, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't enqueue the addition kernel");
exit(1);
}
/* Read output buffer */
err = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0,
sizeof(c_mat), c_mat, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
/* Check result */
check = 1;
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
if(c_mat[i][j] != check_mat[i][j]){
check = 0;
break;
}
}
}
if(check)
printf("Addition check succeeded.\n");
else
printf("Addition check failed.\n");
/* Deallocate resources */
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseMemObject(c_buffer);
clReleaseKernel(add_kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
The kernel code is the following:
__kernel void add_matrix(__global float* matrix_a,
__global float* matrix_b,
__global float* result) {
int i = get_global_id(0);
result[i] = matrix_a[i] + matrix_b[i];
}
Now, it works great for dimensions up to 358x358, but as soon as I put 359 in the MATRIX_DIM it crashes. It shows the usual "foo.exe has stopped working". I know it has to do something with the clCreateBuffer() command because if I remove the code from the first clCreateBuffer() and below, it runs and terminates fine, but as soon as I add even one it crashes.
The CL_DEVICE_MAX_MEM_ALLOC_SIZE option shows a number of 512MB of available memory and the data I am trying to pass is much less than that.
Is there anything I can do to increase the ammount of data I can process?
My GPU is a Radeon Sapphire HD5770
EDIT: After a suggestion in the comments I ran the debugger which yielded the following message:
Program received signal SIGSEGV, Segmentation fault.
In amdocl!_aclHsaLoader () (C:\WINDOWS\SysWOW64\amdocl.dll)
#15 0x00401355 in create_device () at C:\test\testcl.c:26
C:\test\testcl.c:26:503:beg:0x401355
I am really not sure what this means though. Any ideas?
The main problem is, that you allocate to much memory on the stack at these code lines, so that, you got a stack overflow:
float a_mat[MATRIX_DIM][MATRIX_DIM], b_mat[MATRIX_DIM][MATRIX_DIM],
c_mat[MATRIX_DIM][MATRIX_DIM], check_mat[MATRIX_DIM][MATRIX_DIM];
In my test here, the execution didn't even entered the main method. You have to allocate these matrices on the heap with:
float *a_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*a_mat));
float *b_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*b_mat));
float *c_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*c_mat));
float *check_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*check_mat));
But now, you have only a 1-dimensional (1D) data-buffer for each matrix, so that, you have to change every 2D index [i][j] into the corresponding 1D index [i*MATRIX_DIM][j], e.g.:
a_mat[i*MATRIX_DIM+j] = (float)rand()/RAND_MAX;
EDIT: You have to also update the calls to clCreateBuffer und clEnqueueReadBuffer. The matrix size cannot be determined with sizeof(matrix_name) anymore (where matrix_name is one of a_mat, b_mat, ...). You have to replace every such sizeof (there are 4 of some) with MATRIX_DIM*MATRIX_DIM*sizeof(*matrix_name). Don't forget the derefence before the matrix_name, e.g.:
a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
MATRIX_DIM*MATRIX_DIM*sizeof(*a_mat), a_mat, &err);
(End of Edit).
Don't forget to release the data-buffers at the end:
free(a_mat);
free(b_mat);
free(c_mat);
free(check_mat);
To get even the kernel to run, I had even to fix the reading of the kernel program. The return value of ftell was always a little bit too large. The actual number of bytes is instead returned by fread. Thus, change these lines
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
to
program_size = fread(program_buffer, sizeof(char), program_size, program_handle); // changed
program_buffer[program_size] = '\0'; // moved here

Opencl Reduction is not as expected

I'm pretty a novice about opencl. I have tried about "get the summation of all cubes of every element in an array". Here's my kernel code:
kernel void cubeSum(global float *input,
local float *prods,
global float *output )
{
int gid = get_global_id( 0 );
int tnum = get_local_id( 0 ); // thread number
int wgNum = get_group_id( 0 ); // work-group number
int numItems = get_local_size( 0 );
prods[ tnum ] = input[ gid ] * input[ gid ] * input[gid]; // cube
for (int offset = 1; offset < numItems; offset *= 2) {
int mask = 2 * offset - 1;
barrier(CLK_LOCAL_MEM_FENCE);
if ( (tnum & mask) == 0 ) {
prods[tnum] += prods[tnum + offset];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if ( tnum == 0 )
output[wgNum] = prods[0];
}
I can't figure out why my result is not the same with sequential result. When the array is from 0 to 511, my result is sequential result minus 2048; when the array is from 0 to 1023, my result is sequential result plus 16384.
I will try to figure it out myself while I'm waiting for you answers.
Another question is I found it is hard to debug kernel code since the dataset is quite big and it runs concurrently. Any advice for debugging?
All the advices are appreciated =).
By the way, here's my host code:
#include <stdio.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define NUM_ELEMENTS (512)
#define LOCAL_SIZE (512)
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, const char * argv[])
{
float data[NUM_ELEMENTS]; //hA
float sum;
float sumTest;
size_t global;
size_t local;
size_t numWorkGroups;
size_t dataSize;
size_t resultsSize;
cl_device_id device;
cl_context context;
cl_command_queue cmdQueue;
cl_program program;
cl_kernel kernel;
cl_mem input;
cl_mem output;
FILE *fp;
//failed to use relative path here. permission problem?
char fileName[] = "/Users/sure/USC/590/cubeSum/cubeSum/cubeSum.cl";
char *source_str;
size_t source_size;
/* カーネルを含むソースコードをロード */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//allocate the host memory buffers:
int i = 0;
unsigned int count = NUM_ELEMENTS;
for (i = 0; i < count; i++) {
data[i] = i;
}
//array size in bytes (will need this later):
dataSize = NUM_ELEMENTS * sizeof(float);
//opencl function status
cl_int status;
// Connect to a compute device
//
int gpu = 1;
status = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
//create an Opencl context
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
//create a command queue
cmdQueue = clCreateCommandQueue( context, device, 0, &status );
//allocate memory buffers on the device
input = clCreateBuffer( context, CL_MEM_READ_ONLY, dataSize, NULL, &status ); //dA
//TODO: at this line, I don't have the value of local which is calculated by clGetKernelWorkGroupInfo
//need to figure out a way to avoid hardcode it.
output = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(float) * NUM_ELEMENTS / LOCAL_SIZE, NULL, &status ); //dC
// enqueue the 2 commands to write data into the device buffers:
status = clEnqueueWriteBuffer( cmdQueue, input, CL_FALSE, 0, dataSize, data, 0, NULL, NULL );
// create the kernel program on the device:
program = clCreateProgramWithSource(context, 1, (const char **) & source_str, (const size_t *)&source_size, &status);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (status != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
//create compute kernel
kernel = clCreateKernel( program, "cubeSum", &status );
// Get the maximum work group size for executing the kernel on the device
//
status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", status);
exit(1);
}
global = count;
numWorkGroups = global / local;
float results[numWorkGroups]; //hC
resultsSize = numWorkGroups * sizeof(float);
//set kernel parameter
status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &input );
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
status = clSetKernelArg( kernel, 2, sizeof(cl_mem), &output );
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (status)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
clFinish(cmdQueue);
status = clEnqueueReadBuffer( cmdQueue, output, CL_TRUE, 0, resultsSize, results, 0, NULL, NULL );
// Validate our results
//
sum = 0;
for (int i=0; i<numWorkGroups; i++) {
sum += results[i];
}
sumTest = 0;
for(i = 0; i < count; i++)
{
sumTest += data[i] * data[i] * data[i];
}
// Print a brief summary detailing the results
//
printf("Computed '%f/%f'!\n", sum, sumTest);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);
return 0;
}
EDIT: Just found another thing. My code is correct if I just sum all element without cube/square. Thus, I'm gonna figure out how cube affect to my program.
You appear to only be allocating 4-bytes of local memory:
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
This should be the total amount of local memory required for that argument by the entire work-group. In the case of your kernel, this is (work-group-size * sizeof(float)).
So, you should instead have something like this:
status = clSetKernelArg( kernel, 1, local*sizeof(float), NULL );
The discrepancies you are seeing are likely coming from the limitations of floating point, since you are summing some very large numbers. If you initialise your inputs with smaller numbers (e.g. data[i] = i*0.01;), you should get results equal to your sequential implementation (I've verified this on my own system). This is why you don't see the errors when you remove the cube.

Implementing matrix-vector multiplication in AMD openCL/C produces system freezes when I try to increase matrix size

For a project at university, i'm implementing matrix-vector multiplication using AMD OpenCL.
The machine i'm using is a brand new desktop running Ubuntu 12.04, with a Radeon HD 7970 and an AMD FX-4100 quad-core processor. I AMD APP 1.2 and the latest ATI Catalyst drivers for the Radeon.
Here is the kernel I am trying to use.
__kernel void mvKernel(__global float* a, const __global float* x, __global float* y, int m, int n)
{
float sum = 0.0f;
__global float* A;
int i;
int j = 0;
int indx = get_global_id(0);
__local float xs[2048];
for(i = get_local_id(0); i < n; i+= get_local_size(0)) {
xs[i] = x[i];
}
mem_fence(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
A = &a[indx];
for(i = 0; i < n; i++) {
sum += xs[i] * A[j];
j += m;
}
y[indx] = sum;
}
When this is run on the GPU for matrix sizes 256 x 256, the results produced are correct and no problems occur. However when I try to increase the matrix size, given as command line arguments, the system will hang, requiring a reboot.
However when I run the code using AMD's CodeXL debugger/profiler, the code will run most of the time, with no errors.
Here's the host code I run
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
#include <string.h>
char* readSource(const char* sourceFilename);
void randomInit(float* data, int size)
{
int i =0;
for(i; i < size; i++)
data[i] = (rand()/(float)RAND_MAX) * 10;
}
void cpuMV (float* y, float* A, float* X, int M, int N)
{
for(int i = 0; i< M; i++) {
double sum = 0;
y[i] = 0;
for(int k = 0; k < N; k++) {
double a = A[i + k* M];
double x = X[k];
sum += a * x;
}
y[i] = (float) sum;
}
}
int main( int argc, char ** argv) {
int M = atoi(argv[1]);//1024;
int N = atoi(argv[2]);//1024;
float *A, *x;
float *y;
A = (float *)malloc(sizeof(float) * M * N);
x = (float *)malloc(sizeof(float) * N);
y = (float *)malloc(sizeof(float) * M);
randomInit(A, M * N);
randomInit(x, N);
int wrong;
wrong = 0;
cl_int err;
cl_uint numPlatforms;
cl_platform_id *platforms;
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS) {
printf("clGetPlatformIDs failed\n");
exit(-1);
}
if(numPlatforms == 0) {
printf("No platforms detected.\n");
exit(-1);
}
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
clGetPlatformIDs(numPlatforms, platforms, NULL);
printf("%u platforms found\n", numPlatforms);
for(int i =0; i < numPlatforms; i++) {
char buff[100];
printf("Platform %u:\n", i);
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buff), buff, NULL);
printf("\tVendor: %s\n", buff);
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(buff), buff, NULL);
printf("\tName: %s\n", buff);
if (err != CL_SUCCESS) {
printf("clGetPlatformInfo failed\n");
exit(-1);
}
}
printf("\n");
cl_uint numDevices = 0;
cl_device_id *devices;
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if(err != CL_SUCCESS) {
printf("clGetDeviceIDs failed\n");
exit(-1);
}
if (numDevices == 0){
printf("No devices found\n");
exit(-1);
}
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
printf("%u devices found\n", numDevices);
for(int i =0; i < numDevices; i++) {
char buff[100];
printf("Device %u:\n", i);
err = clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buff), buff, NULL);
printf("\tVendor: %s\n", buff);
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buff), buff, NULL);
printf("\tName: %s\n", buff);
if (err != CL_SUCCESS) {
printf("clGetDeviceInfo failed\n");
exit(-1);
}
}
cl_context context;
context = clCreateContext(NULL, numDevices,devices, NULL, NULL, &err);
if(err != CL_SUCCESS){
printf("clCreateContext failed\n");
exit(-1);
}
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &err);
if(err != CL_SUCCESS) {
printf("clCreateCommandQueue failed\n");
exit(-1);
}
cl_mem d_A, d_x;
cl_mem d_y;
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, M * N * sizeof(float), A, &err);
if (err != CL_SUCCESS) {
printf("clCreateBuffer for A failed\n");
exit(-1);
}
d_x = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, N * sizeof(float), x, &err);
if (err != CL_SUCCESS) {
printf("clCreateBuffer for x failed\n");
exit(-1);
}
d_y = clCreateBuffer(context, CL_MEM_READ_WRITE, M * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("clCreateBuffer for y failed\n");
exit(-1);
}
cl_program program;
char* source;
const char *sourceFile = "MVM_Kernel2.cl";
source = readSource(sourceFile);
program = clCreateProgramWithSource(context, 1, (const char**) &source, NULL, &err);
if (err != CL_SUCCESS) {
printf("clCreateProgramFailed");
exit(-1);
}
cl_int buildErr;
buildErr = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if (buildErr != CL_SUCCESS) {
printf("Program failed to build,\n");
cl_build_status buildStatus;
for(int i = 0; i < numDevices; i++) {
clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
if(buildStatus == CL_SUCCESS) {
continue;
}
char *buildLog;
size_t buildLogSize;
clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize);
buildLog = (char *)malloc(buildLogSize);
clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,buildLogSize, buildLog, NULL);
buildLog[buildLogSize -1] = '\0';
printf("Device %u Build Log:\n%s\n", i, buildLog);
free(buildLog);
}
exit(0);
}
else {
printf("No build errors\n");
}
cl_kernel kernel;
kernel = clCreateKernel(program, "mvKernel", &err);
if(err != CL_SUCCESS) {
printf("clCreateKernel failed\n");
exit(-1);
}
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_A);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_x);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_y);
err |= clSetKernelArg(kernel, 3, sizeof(int), &M);
err |= clSetKernelArg(kernel, 4, sizeof(int), &N);
size_t globalWorkSize[1];
globalWorkSize[0] = M * N;
size_t localWorkSize[1];
localWorkSize[0] = 256;
err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clEnqueueReadBuffer(cmdQueue, d_y, CL_TRUE, 0, M * sizeof(float), y, 0, NULL, NULL);
clFlush(cmdQueue);
err = clFinish(cmdQueue);
if(err != CL_SUCCESS) {
printf("ERROR!!");
exit(-1);
}
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(d_A);
clReleaseMemObject(d_x);
clReleaseMemObject(d_y);
clReleaseContext(context);
for(int i=0; i < (M <10 ? M : 10); i++)
printf("vector y = %f\n", y[i]);
float* refY;
refY = (float*)malloc(M*sizeof(float));
cpuMV(refY, A, x, M, N);
for (int i = 0; i < M; ++i) {
float diff = refY[i] - y[i];
if (fabsf(diff)/ refY[i] > 1e-4)
wrong++;
}
printf("There were %d errors!!\n", wrong);
free(A);
free(y);
free(x);
free(source);
free(platforms);
free(devices);
}
char* readSource(const char *sourceFilename) {
FILE *fp;
int errs;
int size;
char *source;
fp = fopen(sourceFilename, "rb");
errs = fseek(fp, 0, SEEK_END);
if(errs != 0) {
printf("Error seeking to end of file");
exit(-1);
}
size = ftell(fp);
if(size<0) {
printf("Errror getting file position");
exit(-1);
}
errs = fseek(fp, 0, SEEK_SET);
if(errs != 0){
printf("Error seeking to start of file\n");
exit(-1);
}
source = (char*)malloc(size +1);
errs = fread(source, 1, size, fp);
if(errs != size) {
printf("only read %d bytes\n", errs);
exit(0);
}
source[size]= '\0';
return source;
}
Eventually this needs to work on matrices of order ~10000
EDIT
I've also tried the same code on my laptop which has an Nvidia GT525m, and the program runs fine for matrices upto 352 * 352, any bigger and the answer will just be zero, but it doesn't crash.
The problem was with globalWorkSize being far too big (M * N) when it should have been just M. This must have been overloading the GPU and causing the system freeze. I now have the code running reliably on both Nvidia and AMD GPUs as well as the AMD CPU

Simple OpenCL program compiles and runs but output is incorrect

I wrote a simply OpenCL program based off the SDK and it compiles and runs, however the output is wrong. Is there something I'm doing wrong?
Any suggestions for learning to debug C and OpenCL is much appreciated. I'm quite new to the platform.
Code is below.
The output in array c is all zeros.
Thanks.
test_opencl.h
#ifndef _TEST_OPENCL_H_
#define _TEST_OPENCL_H_
int main( int argc, const char** argv);
int runTest( int argc, const char** argv);
#endif
test_opencl.cl
// simple test of adding a[i] to b[i] to get c[i]
__kernel void add_array(__global float *a, __global float *b, __global float *c)
{
int xid = get_global_id(0);
c[xid] = a[xid] + b[xid];
}
test_opencl.cpp
// standard utility and system includes
#include <oclUtils.h>
#include "test_opencl.h"
// OpenCL error catcher
cl_int err = 0;
// Main Program
// *********************************************************************
int main( int argc, const char** argv)
{
// set logfile name and start logs
shrSetLogFileName ("test_opencl.txt");
shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]);
// run the main test
int result = runTest(argc, argv);
shrCheckError(result, 0);
// finish
shrEXIT(argc, argv);
}
//! Run a simple test for OPENCL
// *********************************************************************
int runTest( int argc, const char** argv)
{
cl_context gpu_context;
cl_command_queue cmd_queue;
cl_program program;
cl_kernel test_kernel;
const size_t szGlobalWorkSize = 10;
const size_t szLocalWorkSize = 10;
// size of memory required to store the array
const unsigned int mem_size = sizeof(int) * 10;
// create the OpenCL context on a GPU device
gpu_context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
shrCheckError(err, CL_SUCCESS);
// get devices
cl_device_id device;
if( shrCheckCmdLineFlag(argc, argv, "device") ) {
int device_nr = 0;
shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
device = oclGetDev(gpu_context, device_nr);
} else {
device = oclGetMaxFlopsDev(gpu_context);
}
// create a command-queue
cmd_queue = clCreateCommandQueue(gpu_context, device, 0, &err);
shrCheckError(err, CL_SUCCESS);
// allocate and initalize host memory
int a[10], b[10], c[10];
for (int i = 0; i < 10; i++) {
a[i] = i;
b[i] = i * i;
}
// create buffers on device
cl_mem vol_a = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, a, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_b = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, b, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_c = clCreateBuffer(gpu_context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, c, &err);
shrCheckError(err, CL_SUCCESS);
// copy data from host to device
err = clEnqueueWriteBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, vol_b, CL_TRUE, 0, mem_size, b, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
// Program Setup
size_t program_length;
char* source_path = shrFindFilePath("test_opencl.cl", argv[0]);
shrCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
shrCheckError(source != NULL, shrTRUE);
// create the program
program = clCreateProgramWithSource(gpu_context, 1, (const char **)&source, &program_length, &err);
shrCheckError(err, CL_SUCCESS);
// build the program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLog(LOGBOTH | ERRORMSG, err, STDERROR);
return(EXIT_FAILURE);
}
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Starting kernel operation...\n\n", argv[0]);
// create the test kernel
test_kernel = clCreateKernel(program, "add_array", &err);
shrCheckError(err, CL_SUCCESS);
// set the args values for the kernel
err = clSetKernelArg(test_kernel, 0, sizeof(cl_mem), (void *) &vol_a);
err |= clSetKernelArg(test_kernel, 1, sizeof(cl_mem), (void *) &vol_b);
err |= clSetKernelArg(test_kernel, 2, sizeof(cl_mem), (void *) &vol_c);
shrCheckError(err, CL_SUCCESS);
err = clEnqueueNDRangeKernel(cmd_queue, test_kernel, 1, NULL, &szGlobalWorkSize, NULL, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
// copy result from device to host
err = clEnqueueReadBuffer(cmd_queue, vol_c, CL_TRUE, 0, mem_size, c, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
int d[10];
err = clEnqueueReadBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, d, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Finished kernel operation...\n\n", argv[0]);
bool passed = true;
for (int i = 0; i < 10; i++) {
if (c[i] != i + i * i)
passed = false;
shrLog(LOGBOTH, 0, "c = %d d = %d\n", c[i], d[i]);
}
if (passed)
shrLog(LOGBOTH, 0, "%s Test Passed\n\n", argv[0]);
else
shrLog(LOGBOTH, 0, "%s Test Failed\n\n", argv[0]);
// cleanup OpenCL
clReleaseMemObject(vol_a);
clReleaseMemObject(vol_b);
clReleaseMemObject(vol_c);
clReleaseKernel(test_kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(gpu_context);
return 0;
}
The problems in the code and the solution can be found here.

Resources