I have to implement the Floyd algorithm using OpenCL. It works fine, but only for n < 268. When n >= 268, I get an "Access violation reading location" error when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).
Here is my code:
graphe is an adjacency matrix, and distances is the distances matrix
// Read the matrix dimension n from standard input.
int n;
printf("enter n value: ");
scanf("%d", &n);
printf("\n");
// Number of cells in an n x n matrix stored as one flat array.
int n2 = n * n;
// Byte count used for the device buffers and the host<->device copies.
// NOTE(review): this multiplies by sizeof(int*), while the two host arrays
// below are allocated with sizeof(int) per cell; the two sizes differ on any
// platform where sizeof(int*) != sizeof(int).
int matSize = n2 * sizeof(int*);
// Host-side matrices, n2 ints each.
int* graphe = malloc(sizeof(int) * n2);
int* distances = malloc(sizeof(int) * n2);
//mat[i,j] => mat[i*n + j]
// Only `graphe` is checked, and execution continues after the message.
if (graphe == NULL)
printf("malloc failed\n");
// init_graphe and copy are defined outside this fragment; presumably they
// fill `graphe` and duplicate it into `distances` (TODO confirm).
init_graphe(graphe, n);
copy(graphe, distances, n);
initialization of opencl variables:
// Kernel source text; load_kernel is defined outside this fragment.
char* programSource = load_kernel("kernel.cl");
// Receives the result code of each OpenCL call below.
// NOTE(review): it is overwritten by every call and never inspected in this
// fragment, so a failing call goes unnoticed.
cl_int status;
// STEP 1: Discover and initialize the platforms
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
// First call only asks how many platforms exist.
status = clGetPlatformIDs(0, NULL, &numPlatforms);
printf("Number of platforms = %d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
// Second call fills the array that was just sized.
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Scratch buffer reused for the platform name and every device name.
char Name[1000];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
printf("Name of platform : %s\n", Name);
fflush(stdout);
// STEP 2: Discover and initialize the devices
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
// Same count-then-fill pattern, for all device types of the first platform.
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("Number of devices = %d\n", (int)numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Print the name of every device found.
for (int i = 0; i < numDevices; i++) {
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
printf("Name of device %d: %s\n\n", i, Name);
}
// STEP 3: Create a context
fflush(stdout);
cl_context context = NULL;
// The context spans every device that was found.
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// STEP 4: Create a command queue
fflush(stdout);
cl_command_queue cmdQueue;
// Only the first device gets a queue.
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
// STEP 5: Create device buffers
fflush(stdout);
cl_mem buffer_graphe;
cl_mem buffer_n;
cl_mem buffer_distances;
cl_mem buffer_k;
// The two matrix buffers are matSize bytes; n and k each get a one-int buffer.
buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
fflush(stdout);
// STEP 6: Create and compile the program
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
printf("Compilation\n");
fflush(stdout);
// Build for every device in the context; the build log is not retrieved.
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// STEP 7: Create the kernel
cl_kernel kernel = NULL;
fflush(stdout);
kernel = clCreateKernel(program, "floyd", &status);
// One work-item per matrix cell.
size_t globalWorkSize[2] = { n, n };
// Declared with three elements, of which two are given values.
size_t localWorkSize[3] = { 20,20 };
Execution of the kernel:
// CPU-clock timestamp taken before the loop.
clock_t start = clock();
int k;
// One kernel launch per value of k in [0, n).
for (k = 0; k < n; k++) {
// Blocking uploads (CL_TRUE): both matrices and the scalars n and k are
// sent to the device again on every iteration.
status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
// NOTE(review): copies matSize bytes out of `distances`; confirm matSize is
// the same byte count the host array was allocated with.
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);
// Bind the four buffers to the kernel parameters, in declaration order.
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);
// 2-D launch over globalWorkSize; NULL is passed as the local work size.
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
clFinish(cmdQueue);
// Blocking download of the updated matrix back into `distances`.
status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
clFinish(cmdQueue);
}
and the kernel:
// One relaxation step for a single matrix cell.
// The work-item at global id (row, col) replaces distances[row][col] with
// distances[row][*k] + distances[*k][col] when that sum is smaller.
// The matrix is flat and row-major with *n entries per row; `graphe` is
// accepted but not read here.
kernel void floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int size = *n;
    const int pivot = *k;

    const int direct = row * size + col;
    const int toPivot = row * size + pivot;
    const int fromPivot = pivot * size + col;

    const int detour = distances[toPivot] + distances[fromPivot];
    if (detour < distances[direct]) {
        distances[direct] = detour;
    }
}
You have:
int matSize = n2 * sizeof(int*);
…
int* distances = malloc(sizeof(int) * n2);
…
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
Say n2 is 100.
matSize will be 800 on a 64-bit system. (sizeof(int*) = 8)
You allocate 400 bytes of memory for your distances array. (sizeof(int) = 4, typically)
You then copy 800 bytes (matSize) from distances into your OpenCL buffer. This overflows the end of the array. Whoops.
The bug is of course the use of sizeof(int*): you've got an array of ints, not an array of pointers, so this should be sizeof(int), which is what you're correctly doing in the malloc call. (I can't quite fathom why you're not using matSize there.) Although what you should probably be using is cl_int, or one of the explicitly-sized types (int32_t in this case), because types in OpenCL kernels have very specific definitions which may or may not match those in host C code.
Additional Notes:
I'm not 100% convinced your data dependencies are safe here. No work-item should be reading an array entry that another is writing in the same kernel enqueueing batch. It seems to me that ij (written) for one of the work-items will be equal to ik (read) for the others in the row? Similar deal with ij and kj.
There's no need to read and re-write the distances buffer between iterations, if you're not modifying it on the host. Neither does graphe need re-writing every time if it's not changing.
You can pass scalar arguments such as k and n without a buffer. status = clSetKernelArg(kernel, 1, sizeof(n), &n); works fine if you change your kernel signature's argument to int n. (no dereference needed in the kernel then.)
A local work size of 20x20 is likely not optimal. If you're not using local memory or barriers, don't bother setting a local size at all.
You should be able to remove the clFinish calls, and you can change the buffer writes to be non-blocking once you've moved them outside the loop. This might give you an additional small speedup.
I think I found the solution: I replaced malloc with calloc, and now it works.
Related
I wrote a vector addition kernel and ran it on a single GPU and on multiple GPUs.
However, the multi-GPU case is much slower than the single-GPU case for the same vector length.
My code uses one context, one kernel, and multiple queues (one queue per device).
How can I modify it to run faster in the multi-GPU case?
The code is below
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <CL/cl.h>
#include <math.h>
//#define VECTOR_SIZE 640000
//#define LOCAL_SIZE 64
/*
 * Abort the program when an OpenCL call did not return CL_SUCCESS.
 *
 * Prints the source file, line and numeric error code, then exits with
 * EXIT_FAILURE. The body is wrapped in do { } while (0) so the macro acts as
 * a single statement and stays safe inside an unbraced if/else. The argument
 * is parenthesised so that any expression can be passed.
 */
#define CHECK_ERROR(err) \
    do { \
        if ((err) != CL_SUCCESS) { \
            printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, (err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
/* Current wall-clock time in seconds, with microsecond resolution. */
double get_time() {
    struct timeval now;
    gettimeofday(&now, NULL);

    double whole_seconds = (double)now.tv_sec;
    double fraction = (double)1e-6 * now.tv_usec;
    return whole_seconds + fraction;
}
/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * file_name: path of the file to read.
 * len:       receives the number of bytes actually read (terminator excluded).
 *
 * Returns the buffer; the caller owns it and must free() it.
 * On any failure (open, size query, allocation, read) a diagnostic is printed
 * and the process exits with EXIT_FAILURE.
 */
char *get_source_code(const char *file_name, size_t *len) {
    /* Binary mode, so the size reported by ftell matches what fread delivers. */
    FILE *file = fopen(file_name, "rb");
    if (file == NULL) {
        printf("[%s:%d] Failed to open %s\n", __FILE__, __LINE__, file_name);
        exit(EXIT_FAILURE);
    }

    long file_size = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        file_size = ftell(file);
    }
    if (file_size < 0) {
        /* ftell reports failure as -1; never feed that to malloc. */
        printf("[%s:%d] Failed to determine the size of %s\n", __FILE__, __LINE__, file_name);
        fclose(file);
        exit(EXIT_FAILURE);
    }
    rewind(file);

    char *source_code = malloc((size_t)file_size + 1);
    if (source_code == NULL) {
        printf("[%s:%d] Failed to allocate memory for %s\n", __FILE__, __LINE__, file_name);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    /* Item size 1 makes the return value the exact number of bytes read. */
    size_t length = fread(source_code, 1, (size_t)file_size, file);
    if (length != (size_t)file_size && ferror(file)) {
        printf("[%s:%d] Failed to read %s\n", __FILE__, __LINE__, file_name);
        free(source_code);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    /* Terminate after the bytes really read, so no uninitialised tail is exposed. */
    source_code[length] = '\0';
    fclose(file);

    *len = length;
    return source_code;
}
int main() {
// OpenCl variables
cl_platform_id platform;
//cl_device_id device;
cl_device_id *devices;
cl_device_id device_temp;
cl_context context;
//cl_command_queue queue;
cl_command_queue *queues;
cl_mem bufferA, bufferB, bufferC;
cl_program program;
char *kernel_source;
size_t kernel_source_size;
cl_kernel kernel;
//cl_kernel *kernels;
cl_int err;
//
size_t VECTOR_SIZE = 64000000 ;
int num_devices = 4;
size_t LOCAL_SIZE = 64;
// Time variables
double start;
double end;
// Get platform
err = clGetPlatformIDs(1, &platform, NULL);
CHECK_ERROR(err);
// Get GPU device
devices = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
//err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
CHECK_ERROR(err);
// Create context
context = clCreateContext(NULL,num_devices, devices , NULL, NULL , &err);
//context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CHECK_ERROR(err);
// Get kernel code
kernel_source = get_source_code("kernel.cl", &kernel_source_size);
// Create program
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source,
&kernel_source_size, &err);
CHECK_ERROR(err);
// Build program
err = clBuildProgram(program, num_devices, devices, "", NULL, NULL);
if(err == CL_BUILD_PROGRAM_FAILURE) {
size_t log_size;
char *log;
// Get program build
//err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
// 0, NULL, &log_size);
err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
CHECK_ERROR(err);
// Get build log
log = (char*)malloc(log_size + 1);
//err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
// log_size, log, NULL);
err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,log,NULL);
CHECK_ERROR(err);
log[log_size] = '\0';
printf("Compiler error : \n%s\n", log);
free(log);
exit(0);
}
CHECK_ERROR(err);
// Create Vector A, B, C
float *A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
float *B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
float *C = (float*)malloc(sizeof(float) * VECTOR_SIZE);
// Initial Vector A, B
//cl_ushort idx;
/*for(idx = 0; idx < VECTOR_SIZE; idx++) {
A[idx] = rand() % 100;
B[idx] = rand() % 100;
}*/
printf("start\n");
start = get_time();
for(int i = 0; i <VECTOR_SIZE; i++){
A[i] = sinf(i)*sinf(i);
B[i] = cosf(i)*cosf(i);
}
end = get_time();
printf("Initialization time : %f seconds elapsed\n", end-start);
// Create kernel
/*kernels = (cl_kernel *) malloc(sizeof(cl_kernel)*num_devices);
for(int i=0; i<num_devices; i++){
kernels[i] = clCreateKernel(program,"vec_add", &err);
CHECK_ERROR(err);
}*/
kernel = clCreateKernel(program, "vec_add", &err);
CHECK_ERROR(err);
// Create Buffer
bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
CHECK_ERROR(err);
bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
CHECK_ERROR(err);
bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
CHECK_ERROR(err);
printf("error hi\n");
// Create command-queue
queues = (cl_command_queue *) malloc(sizeof(cl_command_queue)*num_devices);
for(int i=0; i<num_devices; i++){
if (i==0){
queues[i] = clCreateCommandQueue(context,devices[i],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
CHECK_ERROR(err);
}
else{
queues[i] = clCreateCommandQueue(context,devices[i], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
CHECK_ERROR(err);
}
}
printf("error bye\n");
//queue = clCreateCommandQueue(context, device, 0, &err);
//CHECK_ERROR(err);
// Write Buffer
for (int i = 0; i<num_devices; i++){
err = clEnqueueWriteBuffer(queues[i],bufferA,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,A,0,NULL,NULL);
CHECK_ERROR(err);
err = clEnqueueWriteBuffer(queues[i],bufferB,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,B,0,NULL,NULL);
CHECK_ERROR(err);
}
//err = clEnqueueWriteBuffer(queue, bufferA, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, A, 0, NULL, NULL);
//CHECK_ERROR(err);
//err = clEnqueueWriteBuffer(queue, bufferB, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, B, 0, NULL, NULL);
//CHECK_ERROR(err);
for(int i=0; i<num_devices; i++){
err=clFinish(queues[i]);
CHECK_ERROR(err);
}
// Set Kernel arguments
start = get_time();
/*for(int i=0; i<num_devices; i++){
err=clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &bufferA);
CHECK_ERROR(err);
err=clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &bufferB);
CHECK_ERROR(err);
err=clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &bufferC);
CHECK_ERROR(err);
err=clSetKernelArg(kernels[i], 3, sizeof(unsigned int), &VECTOR_SIZE);
CHECK_ERROR(err);
}*/
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &VECTOR_SIZE);
CHECK_ERROR(err);
end = get_time();
printf("Send Vector A, B to GPU : %f seconds elapsed\n", end - start);
for(int i=0; i<num_devices; i++){
err=clFinish(queues[i]);
CHECK_ERROR(err);
}
cl_event ooo_events[num_devices];
start = get_time();
// Execute Kernel
size_t global_size = VECTOR_SIZE;
size_t local_size = LOCAL_SIZE;
for(int i=0; i<num_devices; i++){
//start=get_time();
err= clEnqueueNDRangeKernel(queues[i],kernel,1,NULL,&global_size,&local_size,0,NULL,NULL);
CHECK_ERROR(err);
//err = clEnqueueNDRangeKernel(queues[i],kernels[i],1,NULL,&global_size, &local_size,0,NULL,NULL);
//CHECK_ERROR(err);
//end=get_time();
//printf("Calculate C : %f seconds elapsed\n", end-start);
}
//err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,&global_size, &local_size, 0, NULL, NULL);
//CHECK_ERROR(err);
for(int i=0; i<num_devices; i++){
err=clFinish(queues[i]);
CHECK_ERROR(err);
}
end = get_time();
printf("Calculate C : %f seconds elapsed\n", end - start);
// Read Buffer
start = get_time();
for(int i=0; i<num_devices; i++){
err = clEnqueueReadBuffer(queues[i],bufferC,CL_TRUE,0,sizeof(float)*VECTOR_SIZE,C,0,NULL,NULL);
CHECK_ERROR(err);
}
//err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(float) * VECTOR_SIZE, C, 0, NULL, NULL);
//CHECK_ERROR(err);
end = get_time();
printf("Receive C from GPU : %f seconds elapsed\n", end - start);
// Evaluate Vector C
start = get_time();
double sum = 0;
for(int i = 0; i < VECTOR_SIZE; i++) {
sum += C[i];
}
end = get_time();
printf("Verification time : %f seconds elapsed\n", end-start);
printf("%lf, %ld \n", sum,VECTOR_SIZE);
if (abs(VECTOR_SIZE - sum) < 1) {
printf("Verification success!\n");
}
printf("Sum : %f\n", sum);
// Release OpenCL object
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
free(A);
free(B);
free(C);
clReleaseKernel(kernel);
//clReleaseKernel(kernels[0]);
//clReleaseKernel(kernels[1]);
clReleaseProgram(program);
clReleaseCommandQueue(queues[0]);
clReleaseCommandQueue(queues[1]);
//clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
Using multiple GPUs is only beneficial in terms of performance if the computational work that each GPU performs takes more time than the communication, scheduling and synchronization overhead. This is true for a single GPU as well.
In your case, each GPU performs a simple vector addition, but that rarely takes more time than transferring the data to the GPU, waiting for the kernel to actually get scheduled for execution, etc.
Your code is not measuring only the kernel execution time; it is also measuring the scheduling overhead.
I would advise you to use proper GPU profiling tools (depending on your GPU vendor) instead of manual CPU timings to properly examine what is going on. You can also try measuring kernel execution time via events.
So I have written some code and it works. Now what I want to do is call clEnqueueNDRangeKernel() multiple times and, after each execution, update a buffer (buffer Y) with that output. I have written the following code and I want to know whether it is correct for that purpose.
I did not write a separate clSetKernelArg() call for it.
for (int a = 0; a < 100; a++)
{
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalws, NULL, 0, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("Failed to enqueueNDRangeKernel.\n");
exit(1);
}
clEnqueueReadBuffer(command_queue, bufferC, CL_TRUE, 0, M*N * sizeof(float), (void *)C, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, bufferY, CL_TRUE, 0, 1 * N * sizeof(float), (void *)C, 0, NULL, NULL);
for (int i = 0; i < N; i++) {
printf("%f, ", C[i]);
}
}
You should wait for every OpenCL API call to complete. Create an event for every call, so that you can be sure each operation has finished before you start the next one. For example, it is possible that the kernel is still computing on the GPU when you start clEnqueueReadBuffer, so that you read the output buffer before the kernel has finished writing to it. It is also possible that you write to the GPU before clEnqueueReadBuffer has finished.
An OpenCL API call starts the execution on the GPU, but the host program also carries on running.
With events, your program could look like this:
// Same loop as in the question, with every command tracked by an event.
// Each event is released as soon as it has been waited on, otherwise three
// event objects would be leaked per iteration. Every enqueue is checked
// before its event is used, because a failed enqueue leaves the event unset.
cl_event evKernel, evReadBuf, evWriteBuf;
for(int a = 0; a < 100; a++)
{
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalws, NULL, 0, NULL, &evKernel);
    if (ret != CL_SUCCESS) {
        printf("Failed to enqueueNDRangeKernel.\n");
        exit(1);
    }
    clWaitForEvents(1, &evKernel);
    clReleaseEvent(evKernel);

    ret = clEnqueueReadBuffer(command_queue, bufferC, CL_TRUE, 0, M*N * sizeof(float), (void *)C, 0, NULL, &evReadBuf);
    if (ret != CL_SUCCESS) {
        printf("Failed to enqueueReadBuffer.\n");
        exit(1);
    }
    clWaitForEvents(1, &evReadBuf);
    clReleaseEvent(evReadBuf);

    ret = clEnqueueWriteBuffer(command_queue, bufferY, CL_TRUE, 0, 1 * N * sizeof(float), (void *)C, 0, NULL, &evWriteBuf);
    if (ret != CL_SUCCESS) {
        printf("Failed to enqueueWriteBuffer.\n");
        exit(1);
    }
    clWaitForEvents(1, &evWriteBuf);
    clReleaseEvent(evWriteBuf);

    for (int i = 0; i < N; i++) {
        printf("%f, ", C[i]);
    }
}
With the events, the execution time of the loop will grow.
In the clEnqueueWriteBuffer call, the host memory you write from (M*N*sizeof(float)) is bigger than the amount you write to the device buffer (1*N*sizeof(float)). (Maybe you meant M*N*sizeof(float)?) A program on the host side would crash (invalid memory access), but OpenCL does not complain about it and copies the data. I'm not sure, but this could cause problems in the future.
I don't know what your kernel does, but it would be better if the kernel only wrote the data to the output buffer that you need on the host side. You copy bufferC to C, but only a part of C to bufferY, which seems to be the input for the next kernel. Maybe you can make changes in the kernel.
Copying data from host to device or from device to host is an expensive operation. So, for performance reasons, you should not copy data that you don't need for further computations.
I'm pretty much a novice at OpenCL. I have tried to "get the sum of the cubes of every element in an array". Here's my kernel code:
// Per-work-group sum of cubes.
// Each work-item cubes one element of `input` into the local scratch array
// `prods`, the work-group then folds `prods` pairwise in place, and work-item
// 0 stores the group's total in output[group id].
//   input  - source values, indexed by global id
//   prods  - local scratch, indexed by local id (one slot per work-item)
//   output - one result per work-group, indexed by group id
kernel void cubeSum(global float *input,
local float *prods,
global float *output )
{
int gid = get_global_id( 0 );
int tnum = get_local_id( 0 ); // thread number
int wgNum = get_group_id( 0 ); // work-group number
int numItems = get_local_size( 0 );
prods[ tnum ] = input[ gid ] * input[ gid ] * input[gid]; // cube
// offset doubles each pass: 1, 2, 4, ... while it is below the group size.
for (int offset = 1; offset < numItems; offset *= 2) {
int mask = 2 * offset - 1;
// Wait until every work-item has finished the previous pass (or the initial
// store) before anyone reads a neighbour's slot.
barrier(CLK_LOCAL_MEM_FENCE);
// Only work-items whose local id is a multiple of 2*offset accumulate.
// NOTE(review): prods[tnum + offset] stays in range only if tnum + offset is
// below the group size, which presumably relies on a power-of-two local
// size (TODO confirm).
if ( (tnum & mask) == 0 ) {
prods[tnum] += prods[tnum + offset];
}
}
// Make the final accumulation visible before work-item 0 reads prods[0].
barrier(CLK_LOCAL_MEM_FENCE);
if ( tnum == 0 )
output[wgNum] = prods[0];
}
I can't figure out why my result is not the same as the sequential result. When the array is from 0 to 511, my result is the sequential result minus 2048; when the array is from 0 to 1023, my result is the sequential result plus 16384.
I will try to figure it out myself while I'm waiting for your answers.
Another question: I have found it hard to debug kernel code, since the dataset is quite big and the kernel runs concurrently. Any advice for debugging?
All advice is appreciated =).
By the way, here's my host code:
#include <stdio.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define NUM_ELEMENTS (512)
#define LOCAL_SIZE (512)
#define MAX_SOURCE_SIZE (0x100000)
// Host program for the cubeSum kernel: fills an array with 0..NUM_ELEMENTS-1,
// lets the GPU produce one partial sum of cubes per work-group, adds the
// partial sums on the host and prints them next to a sequential reference.
int main(int argc, const char * argv[])
{
float data[NUM_ELEMENTS]; //hA
float sum;
float sumTest;
size_t global;
size_t local;
size_t numWorkGroups;
size_t dataSize;
size_t resultsSize;
cl_device_id device;
cl_context context;
cl_command_queue cmdQueue;
cl_program program;
cl_kernel kernel;
cl_mem input;
cl_mem output;
FILE *fp;
//failed to use relative path here. permission problem?
char fileName[] = "/Users/sure/USC/590/cubeSum/cubeSum/cubeSum.cl";
char *source_str;
size_t source_size;
/* Load the source code containing the kernel */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
// Read at most MAX_SOURCE_SIZE bytes; source_size is the count actually read.
// source_str is not freed anywhere in this function.
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//allocate the host memory buffers:
int i = 0;
unsigned int count = NUM_ELEMENTS;
for (i = 0; i < count; i++) {
data[i] = i;
}
//array size in bytes (will need this later):
dataSize = NUM_ELEMENTS * sizeof(float);
//opencl function status
cl_int status;
// Connect to a compute device
//
int gpu = 1;
// NULL is passed as the platform argument.
status = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
//create an Opencl context
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
//create a command queue
cmdQueue = clCreateCommandQueue( context, device, 0, &status );
//allocate memory buffers on the device
input = clCreateBuffer( context, CL_MEM_READ_ONLY, dataSize, NULL, &status ); //dA
//TODO: at this line, I don't have the value of local which is calculated by clGetKernelWorkGroupInfo
//need to figure out a way to avoid hardcode it.
// Sized from the compile-time LOCAL_SIZE, not from the `local` value that is
// queried further down.
output = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(float) * NUM_ELEMENTS / LOCAL_SIZE, NULL, &status ); //dC
// enqueue the 2 commands to write data into the device buffers:
// Non-blocking (CL_FALSE) upload of the input array.
status = clEnqueueWriteBuffer( cmdQueue, input, CL_FALSE, 0, dataSize, data, 0, NULL, NULL );
// create the kernel program on the device:
program = clCreateProgramWithSource(context, 1, (const char **) & source_str, (const size_t *)&source_size, &status);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (status != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
//create compute kernel
kernel = clCreateKernel( program, "cubeSum", &status );
// Get the maximum work group size for executing the kernel on the device
//
status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", status);
exit(1);
}
global = count;
// Integer division: any remainder of global / local is dropped.
numWorkGroups = global / local;
float results[numWorkGroups]; //hC
resultsSize = numWorkGroups * sizeof(float);
//set kernel parameter
status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &input );
// Argument 1 has no host data (NULL), only a size.
// NOTE(review): the size passed is sizeof(float), i.e. a single float; confirm
// it covers every slot the kernel parameter bound to argument 1 indexes.
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
status = clSetKernelArg( kernel, 2, sizeof(cl_mem), &output );
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (status)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
clFinish(cmdQueue);
// Blocking read of one float per work-group.
// NOTE(review): resultsSize derives from the queried `local`, while the
// `output` buffer was sized from LOCAL_SIZE; the two agree only when
// local == LOCAL_SIZE.
status = clEnqueueReadBuffer( cmdQueue, output, CL_TRUE, 0, resultsSize, results, 0, NULL, NULL );
// Validate our results
//
// Add up the per-group partial sums returned by the device.
sum = 0;
for (int i=0; i<numWorkGroups; i++) {
sum += results[i];
}
// Sequential reference, accumulated in single-precision float.
sumTest = 0;
for(i = 0; i < count; i++)
{
sumTest += data[i] * data[i] * data[i];
}
// Print a brief summary detailing the results
//
printf("Computed '%f/%f'!\n", sum, sumTest);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);
return 0;
}
EDIT: Just found another thing. My code is correct if I just sum all element without cube/square. Thus, I'm gonna figure out how cube affect to my program.
You appear to only be allocating 4-bytes of local memory:
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
This should be the total amount of local memory required for that argument by the entire work-group. In the case of your kernel, this is (work-group-size * sizeof(float)).
So, you should instead have something like this:
status = clSetKernelArg( kernel, 1, local*sizeof(float), NULL );
The discrepancies you are seeing are likely coming from the limitations of floating point, since you are summing some very large numbers. If you initialise your inputs with smaller numbers (e.g. data[i] = i*0.01;), you should get results equal to your sequential implementation (I've verified this on my own system). This is why you don't see the errors when you remove the cube.
I have a problem with kernel execution: the kernel doesn't write values to the correct locations when I'm using large arrays (1000 x 10000). For small arrays there is no problem and I get the correct results. For the kernel execution, I use the GPU of an ATI Mobility RADEON HD 4300 Series.
C code sample is :
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
#define MAX_SIZE 108
#define NCOLS 1000
#define NROWS 10000
// Runs "mykernel" from mykernel.cl over an NCOLS x NROWS grid on the first GPU
// of the first platform, reads the resulting buffer back and prints every
// element next to its index.
// NOTE(review): errcode is assigned by most OpenCL calls below but is never
// inspected, and the return value of clEnqueueNDRangeKernel is discarded, so
// a failing call goes unnoticed.
int main(void) {
char* source_name = "mykernel.cl";
char* source_code;
size_t source_size;
cl_platform_id platformId = NULL;
cl_uint nbplatforms;
cl_device_id deviceId = NULL;
cl_uint nbdevices;
cl_context context = NULL;
cl_int errcode;
cl_command_queue commandQueue = NULL;
cl_program program;
size_t global_work_size[2];
size_t local_work_size[2];
FILE* fh;
//Retrieving platform information
errcode = clGetPlatformIDs(1, &platformId, &nbplatforms);
//Retrieving device (GPU) information
errcode = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_GPU, 1, &deviceId, &nbdevices);
//Creation of a working context
context = clCreateContext(NULL, 1, &deviceId, NULL, NULL, &errcode);
commandQueue = clCreateCommandQueue(context, deviceId, 0, &errcode);
//Opening and reading the kernel source file
if((fh = fopen(source_name, "r")) == NULL){
fprintf(stderr, "Failed to open the file containing the kernel source !\n");
exit(EXIT_FAILURE);
}
// Read at most MAX_SOURCE_SIZE bytes; source_size is the count actually read.
source_code = (char*) malloc (MAX_SOURCE_SIZE * sizeof(char));
source_size = fread(source_code, sizeof(char), MAX_SOURCE_SIZE, fh);
fclose(fh);
program = clCreateProgramWithSource(context, 1, (const char**) &source_code, (const size_t*) &source_size, &errcode);
//Building kernel
errcode = clBuildProgram(program, 1, &deviceId, NULL, NULL, NULL);
//Creation of the kernel program
cl_kernel kernel = clCreateKernel(program, "mykernel", &errcode);
// Host array and device buffer, NCOLS * NROWS unsigned ints each.
unsigned int *op1 = (unsigned int*) malloc (NCOLS * NROWS * sizeof(unsigned int));
cl_mem op1buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NCOLS * NROWS * sizeof(unsigned int), NULL, &errcode);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &op1buff);
// 2-D launch: NCOLS work-items in dimension 0, NROWS in dimension 1.
global_work_size[0] = NCOLS;
global_work_size[1] = NROWS;
// Each work-group is NCOLS x 1 work-items, i.e. NCOLS work-items per group.
// NOTE(review): nothing here checks that the device accepts a work-group of
// that size.
local_work_size[0] = NCOLS;
local_work_size[1] = 1;
clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
// Blocking read of the whole buffer into op1.
errcode = clEnqueueReadBuffer(commandQueue, op1buff, CL_TRUE, 0, NCOLS * NROWS * sizeof(unsigned int), (void*)op1, 0, NULL, NULL);
// Print every element with its flat index, one row per output line.
for(int i = 0; i < NROWS; i++){
for(int j = 0; j < NCOLS; j++)
printf("[index:%d - %u] ", i*NCOLS+j, op1[i*NCOLS+j]);
printf("\n");
}
// No OpenCL objects are released and no host allocations are freed before
// returning.
return EXIT_SUCCESS;
}
The kernel source code is placed on a file named mykernel.cl and is presented as follows:
// Each work-item stores its own flat index into op1buf.
// The index is (group id in dimension 1) * (global size in dimension 0)
// + (local id in dimension 0).
__kernel void mykernel(__global unsigned int* op1buf){
    const unsigned int row = get_group_id(1);
    const unsigned int width = get_global_size(0);
    const unsigned int col = get_local_id(0);

    const unsigned int cell = row * width + col;
    op1buf[cell] = cell;
}
The execution of this program returns unexpected values read from the arrays when I'm using large arrays. For example :
[index:0 - 16777215] [index:1 - 16777215] [index:2 - 16777215] [index:3 - 16777215] ...
[index:1000 - 3438339071] [index:1001 - 3941660159] [index:1002 - 1650092117] [index:1003 - 2529976771] ...
[index:1000 - 3438339071] [index:1001 - 3941660159] [index:1002 - 1650092117] [index:1003 - 2529976771] ...
[index:3000 - 16777215] [index:3001 - 16777215] [index:3002 - 16777215] [index:3003 - 16777215] ...
[index:4000 - 3438339071] [index:4001 - 3941660159] [index:4002 - 1650092117] [index:4003 - 2529976771] ...
....
What could be wrong with my code, or is there something about using the GPU that I'm not taking into consideration?
Thanks in advance.
1000 is evidently too large for your device. Use clGetDeviceInfo with CL_DEVICE_MAX_WORK_GROUP_SIZE to determine the largest value you can use.
I wrote a simple OpenCL program based on the SDK. It compiles and runs, but the output is wrong. Is there something I'm doing wrong?
Any suggestions for learning to debug C and OpenCL are much appreciated. I'm quite new to the platform.
Code is below.
The output in array c is all zeros.
Thanks.
test_opencl.h
// Declarations for the OpenCL array-addition test program.
#ifndef _TEST_OPENCL_H_
#define _TEST_OPENCL_H_
// Program entry point.
int main( int argc, const char** argv);
// Runs the test with the given command-line arguments and returns its result code.
int runTest( int argc, const char** argv);
#endif
test_opencl.cl
// Element-wise sum of two float arrays: the work-item with global id i
// stores a[i] + b[i] into c[i].
__kernel void add_array(__global float *a, __global float *b, __global float *c)
{
    const int i = get_global_id(0);
    const float total = a[i] + b[i];
    c[i] = total;
}
test_opencl.cpp
// standard utility and system includes
#include <oclUtils.h>
#include "test_opencl.h"
// File-scope OpenCL status code; receives the result of the OpenCL calls made
// in this file.
cl_int err = 0;
// Program entry point
// *********************************************************************
// Opens the log file, runs the test, checks that it returned 0, then hands
// control to shrEXIT.
int main( int argc, const char** argv)
{
    // direct log output to test_opencl.txt and announce the start
    shrSetLogFileName ("test_opencl.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]);

    // execute the test and compare its return code against 0
    const int testStatus = runTest(argc, argv);
    shrCheckError(testStatus, 0);

    // shut down through the SDK helper
    shrEXIT(argc, argv);
}
//! Run a simple test for OPENCL
// *********************************************************************
// Adds two 10-element host arrays on a GPU with the "add_array" kernel from
// test_opencl.cl, reads the result back and logs "Test Passed" when
// c[i] == i + i*i for every i.
// Returns 0 after the run (whether or not the check passed) and EXIT_FAILURE
// when the program fails to build.
// NOTE(review): the host arrays a, b and c are declared as int and the buffers
// are sized with sizeof(int); confirm this matches the element type the
// kernel in test_opencl.cl declares for its arguments.
int runTest( int argc, const char** argv)
{
cl_context gpu_context;
cl_command_queue cmd_queue;
cl_program program;
cl_kernel test_kernel;
const size_t szGlobalWorkSize = 10;
// Not used below: NULL is passed as the local work size at launch.
const size_t szLocalWorkSize = 10;
// size of memory required to store the array
const unsigned int mem_size = sizeof(int) * 10;
// create the OpenCL context on a GPU device
gpu_context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
shrCheckError(err, CL_SUCCESS);
// get devices
// Use the device index given by the "device" command-line flag when present,
// otherwise let the SDK helper pick one.
cl_device_id device;
if( shrCheckCmdLineFlag(argc, argv, "device") ) {
int device_nr = 0;
shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
device = oclGetDev(gpu_context, device_nr);
} else {
device = oclGetMaxFlopsDev(gpu_context);
}
// create a command-queue
cmd_queue = clCreateCommandQueue(gpu_context, device, 0, &err);
shrCheckError(err, CL_SUCCESS);
// allocate and initalize host memory
// a[i] = i and b[i] = i*i; c is left uninitialised here.
int a[10], b[10], c[10];
for (int i = 0; i < 10; i++) {
a[i] = i;
b[i] = i * i;
}
// create buffers on device
cl_mem vol_a = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, a, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_b = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, b, &err);
shrCheckError(err, CL_SUCCESS);
// The result buffer is also created with CL_MEM_COPY_HOST_PTR, with the
// still-uninitialised array c as its host pointer.
cl_mem vol_c = clCreateBuffer(gpu_context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, c, &err);
shrCheckError(err, CL_SUCCESS);
// copy data from host to device
// Blocking writes of a and b into the buffers that were just created from them.
err = clEnqueueWriteBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, vol_b, CL_TRUE, 0, mem_size, b, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
// Program Setup
size_t program_length;
char* source_path = shrFindFilePath("test_opencl.cl", argv[0]);
shrCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
shrCheckError(source != NULL, shrTRUE);
// create the program
program = clCreateProgramWithSource(gpu_context, 1, (const char **)&source, &program_length, &err);
shrCheckError(err, CL_SUCCESS);
// build the program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLog(LOGBOTH | ERRORMSG, err, STDERROR);
return(EXIT_FAILURE);
}
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Starting kernel operation...\n\n", argv[0]);
// create the test kernel
test_kernel = clCreateKernel(program, "add_array", &err);
shrCheckError(err, CL_SUCCESS);
// set the args values for the kernel
err = clSetKernelArg(test_kernel, 0, sizeof(cl_mem), (void *) &vol_a);
err |= clSetKernelArg(test_kernel, 1, sizeof(cl_mem), (void *) &vol_b);
err |= clSetKernelArg(test_kernel, 2, sizeof(cl_mem), (void *) &vol_c);
shrCheckError(err, CL_SUCCESS);
// Launch 10 work-items in one dimension.
err = clEnqueueNDRangeKernel(cmd_queue, test_kernel, 1, NULL, &szGlobalWorkSize, NULL, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
// copy result from device to host
err = clEnqueueReadBuffer(cmd_queue, vol_c, CL_TRUE, 0, mem_size, c, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
// Also read vol_a back into d so the input can be logged next to the output.
int d[10];
err = clEnqueueReadBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, d, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Finished kernel operation...\n\n", argv[0]);
// Compare every element with the expected i + i*i and log each c/d pair.
bool passed = true;
for (int i = 0; i < 10; i++) {
if (c[i] != i + i * i)
passed = false;
shrLog(LOGBOTH, 0, "c = %d d = %d\n", c[i], d[i]);
}
if (passed)
shrLog(LOGBOTH, 0, "%s Test Passed\n\n", argv[0]);
else
shrLog(LOGBOTH, 0, "%s Test Failed\n\n", argv[0]);
// cleanup OpenCL
clReleaseMemObject(vol_a);
clReleaseMemObject(vol_b);
clReleaseMemObject(vol_c);
clReleaseKernel(test_kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(gpu_context);
return 0;
}
The problems in the code and the solution can be found here.