Recursive polynomial - c
I'm working on the calculation of Legendre polynomials on the GPU.
Briefly, the recursive Legendre polynomial computes the n-th order from the (n-1)-th and (n-2)-th orders. We divide each x sample into k (say, k = 23) sub-parts, compute the polynomial on each, and then sum them, which is more precise.
So my kernel goes below.
First, we create a k * width array.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float delta = 2. / width;
if ((row < d_k) && (col < width))
kXList[row * width + col] = -1.f + (col * d_k + row + 1.f) * delta / (float)d_k;
Then the 1st and 2nd orders: kXList_2 holds the first (P0) and kXList_1 holds the second (P1).
kXList_1[row * width + col] = kXList[row * width + col];
kXList_2[row * width + col] = 1.f;
Next we sum over the k rows of each column and save the result into d_xLegendreP.
if (row == 0) {
float row_0 = 0.f;
float row_1 = 0.f;
for (int h = 0; h < d_k; ++h) {
row_0 += kXList_2[h * width + col];
row_1 += kXList_1[h * width + col];
}
d_xLegendreP[0 * width + col] = row_0;
d_xLegendreP[1 * width + col] = row_1;
}
Recursive calculation of the remaining orders:
float kX_2 = kXList_2[row * width + col];
float kX_1 = kXList_1[row * width + col];
float kX = kXList[row * width + col];
float row_n;
for (int n = 2; n <= order; n++) {
kXList_temp[row * width + col] = ((2.f * n - 1.f) * kX * kX_1) / (float)n - (((n - 1.f) * kX_2) / (float)n);
if ((row == 0)) {
row_n = 0.f;
for (int h = 0; h < d_k; h++) {
row_n += kXList_temp[h * width + col];
}
d_xLegendreP[n * width + col] = row_n;
}
kX_2 = kX_1;
kX_1 = kXList_temp[row * width + col];
}
As has been pointed out, CUDA makes no statements about the order of thread execution. However you have a number of points in your calculation sequence where you expect a previous line of code has been completed in its entirety, across the entire grid, in order for the next section of your code to be correct.
Generally the nature of CUDA parallel thread execution means that such dependencies lead to incorrect/broken code.
I haven't tried to fully realize your algorithm in an optimal way, but to demonstrate the proof of this, I have broken up your kernel code in such a way that such dependencies are made "correct" through the use of the kernel-call boundary, which is effectively a global sync. This is probably one way to sort out your problem, as indicated in the comments.
Here's an example. I'm not going to try to detail each change, but by breaking it up this way I believe I have satisfied the dependencies expected using your approach. I have not fully verified anything, but a quick check suggests the output seems to match your matlab output:
$ cat t1820.cu
#include <stdio.h>
#include <math.h>
#include<iostream>
#include <stdlib.h>
#define BLOCKDIM_32 32
#define k 23
#define Mmax 40
#define IMG_SIZE 1024
static const long DEVICE = 0;
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Report a CUDA runtime error (with source location) to stderr and, unless
// told otherwise, terminate the process using the error code as exit status.
// Intended to be invoked through the CUDA_CHECK_RETURN macro above, which
// supplies __FILE__ and __LINE__.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Stage 1: initialize the k x width sample grid and the first two orders.
//   kXList[r][c]   : x sample for sub-interval r of pixel column c
//   kXList_1[r][c] : x  (first-order Legendre value P1(x))
//   kXList_2[r][c] : 1  (zeroth-order Legendre value P0(x))
// Expects a 2D launch whose grid covers at least d_k rows by width columns.
__global__ void LegendreMoment1(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Float literal: the original `2. / width` silently promoted to double.
    float delta = 2.0f / (float)width;
    if ((row < d_k) && (col < width)) {
        // Compute the sample once in a register instead of writing it to
        // global memory and immediately reading it back.
        float x = -1.f + (col * d_k + row + 1.f) * delta / (float)d_k;
        kXList[row * width + col] = x;
        kXList_1[row * width + col] = x;
        kXList_2[row * width + col] = 1.f;
    }
}
// Stage 2: for every pixel column, sum the k partial values of orders 0 and 1
// (kXList_2 and kXList_1) and store the sums into rows 0 and 1 of
// d_xLegendreP. Only one thread per column (row == 0) does the reduction.
// Must run after LegendreMoment1 has completed (kernel boundary = global sync).
__global__ void LegendreMoment2(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= d_k || col >= width)
        return;
    if (row != 0)
        return;
    float sum0 = 0.f;
    float sum1 = 0.f;
    for (int h = 0; h < d_k; ++h) {
        sum0 += kXList_2[h * width + col];
        sum1 += kXList_1[h * width + col];
    }
    d_xLegendreP[0 * width + col] = sum0;
    d_xLegendreP[1 * width + col] = sum1;
}
// Stage 3 (run once per order n, host loop n = 2..order): compute the n-th
// order values element-wise via the Bonnet recurrence
//   n*P_n(x) = (2n-1)*x*P_{n-1}(x) - (n-1)*P_{n-2}(x)
// and store them into kXList_temp. Each thread handles its own element, so
// the read-then-overwrite of kXList_temp below is safe within one launch;
// ordering BETWEEN orders is guaranteed by the kernel-call boundary.
// kXList_prev carries P_{n-1} forward so it becomes P_{n-2} of the next call.
__global__ void LegendreMoment3(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
float* d_xLegendreP, int width, int d_k, int order, int n, float *kXList_prev) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if ((row < d_k) && (col < width)) {
// kX = x sample; kX_1/kX_2 are P_{n-1}/P_{n-2}, chosen per n below.
float kX_2, kX_1, kX = kXList[row * width + col];
// First step: P0 and P1 live in kXList_2 / kXList_1 from stage 1.
if (n == 2){
kX_2 = kXList_2[row * width + col];
kX_1 = kXList_1[row * width + col];}
// Second step: P1 is still in kXList_1; P2 is in kXList_temp from n == 2.
if (n == 3){
kX_2 = kXList_1[row * width + col];
kX_1 = kXList_temp[row*width+col];}
// Steady state: P_{n-2} was saved into kXList_prev by the previous call,
// P_{n-1} is the previous call's kXList_temp output.
if (n > 3){
kX_2 = kXList_prev[row * width + col];
kX_1 = kXList_temp[row*width+col];}
// Save P_{n-1} BEFORE overwriting kXList_temp, so the next call can use it.
kXList_prev[row*width+col] = kX_1;
kXList_temp[row * width + col] = ((2.f * n - 1.f) * kX * kX_1) / (float)n - (((n - 1.f) * kX_2) / (float)n);
}
}
// Stage 4 (run once per order n, after LegendreMoment3 for the same n): sum
// the k partial n-th order values in kXList_temp down each pixel column and
// write the total into row n of d_xLegendreP. One thread per column does the
// reduction (row == 0); the kernel boundary guarantees kXList_temp is ready.
__global__ void LegendreMoment4(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= d_k || col >= width)
        return;
    if (row != 0)
        return;
    float acc = 0.f;
    for (int h = 0; h < d_k; h++)
        acc += kXList_temp[h * width + col];
    d_xLegendreP[n * width + col] = acc;
}
// Reference values for the first 4 pixel columns of d_xLegendreP, one row per
// polynomial order 0..Mmax (41 rows total), used by main() to validate the
// GPU result. Presumably produced by the asker's MATLAB implementation —
// TODO confirm provenance.
float matlab_result[][4] = {
{23., 23., 23., 23.},
{-22.9766, -22.9316, -22.8867, -22.8418},
{22.9297, 22.7952, 22.661, 22.527},
{-22.8596, -22.5914, -22.3245, -22.059},
{22.7663, 22.3211, 21.8799, 21.4425},
{-22.6501, -21.9856, -21.3303, -20.6839},
{22.5111, 21.5864, 20.6798, 19.7912},
{-22.3496, -21.1254, -19.9335, -18.7734},
{22.166, 20.6046, 19.0967, 17.6411},
{-21.9606, -20.0265, -18.1756, -16.4058},
{21.7339, 19.3937, 17.1772, 15.0802},
{-21.4862, -18.7091, -16.1086, -13.6777},
{21.2181, 17.9757, 14.9778, 12.2124},
{-20.9301, -17.1971, -13.7931, -10.6992},
{20.6228, 16.3766, 12.563, 9.15308},
{-20.2967, -15.5179, -11.2963, -7.5893},
{19.9525, 14.625, 10.0023, 6.02321},
{-19.5909, -13.7018, -8.69016, -4.46998},
{19.2126, 12.7524, 7.36912, 2.94447},
{-18.8183, -11.781, -6.04847, -1.46107},
{18.4087, 10.792, 4.73739, 0.0335239},
{-17.9847, -9.78953, -3.44488, 1.32519},
{17.5472, 8.77808, 2.17971, -2.60304},
{-17.0968, -7.76199, -0.950332, 3.78904},
{16.6345, 6.74559, -0.235176, -4.87336},
{-16.1611, -5.7332, 1.36917, 5.84745},
{15.6776, 4.72908, -2.44452, -6.70411},
{-15.1848, -3.73739, 3.45463, 7.43756},
{14.6836, 2.7622, -4.39351, -8.04346},
{-14.1751, -1.80747, 5.25583, 8.51902},
{13.66, 0.877003, -6.03692, -8.86292},
{-13.1395, 0.0255473, 6.73284, 9.07537},
{12.6143, -0.896704, -7.34039, -9.15805},
{-12.0855, 1.73318, 7.85712, 9.11411},
{11.554, -2.53191, -8.28135, -8.94808},
{-11.0207, 3.29003, 8.61218, 8.6658},
{10.4866, -4.00492, -8.84949, -8.27433},
{-9.95254, 4.67419, 8.99391, 7.78188},
{9.41953, -5.29574, -9.04682, -7.19767},
{-8.88843, 5.8677, 9.01035, 6.53179},
{8.36015, -6.38847, -8.88731, -5.79509}
};
#define TOL 0.0001f
// Host driver: allocates device buffers, launches the four Legendre stages
// (kernel-call boundaries provide the grid-wide sync between dependent
// stages), copies the result back, validates it against matlab_result, and
// reports the elapsed GPU time.
int main()
{
    float* kXList;
    float* kXList_1;
    float* kXList_2;
    float* kXList_temp;
    float* kXList_prev;
    float* d_xLegendreP;
    float* xLegendreP;
    int width = IMG_SIZE;
    cudaEvent_t d_total_begin, d_total_end;
    xLegendreP = new float[(Mmax + 1) * width];
    CUDA_CHECK_RETURN(cudaSetDevice(DEVICE));
    CUDA_CHECK_RETURN(cudaEventCreate(&d_total_begin));
    CUDA_CHECK_RETURN(cudaEventCreate(&d_total_end));
    printf("Time kernel launch...\n");
    CUDA_CHECK_RETURN(cudaEventRecord(d_total_begin, 0));
    printf("Allocating space on device...\n");
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_temp, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_prev, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_1, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_2, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&d_xLegendreP, width * (Mmax + 1) * sizeof(float)));
    printf("Copying data from host to device...\n");
    // BUG FIX: the grid's x dimension must cover `width` columns and its y
    // dimension must cover `k` rows (the kernels index col from x and row
    // from y). The original `ceil(Mmax / 32)` used integer division (no
    // rounding) and sized x from the polynomial order, so only the first 32
    // of 1024 columns were ever computed — the check passed only because it
    // inspects columns 0..3.
    dim3 grid((width + BLOCKDIM_32 - 1) / BLOCKDIM_32,
              (k + BLOCKDIM_32 - 1) / BLOCKDIM_32, 1);
    dim3 block(BLOCKDIM_32, BLOCKDIM_32, 1);
    printf("Launching kernel...\n");
    LegendreMoment1 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
        d_xLegendreP, IMG_SIZE, k, Mmax);
    CUDA_CHECK_RETURN(cudaGetLastError());  // catch launch-configuration errors
    LegendreMoment2 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
        d_xLegendreP, IMG_SIZE, k, Mmax);
    CUDA_CHECK_RETURN(cudaGetLastError());
    for (int n = 2; n <= Mmax; n++) {
        LegendreMoment3 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
            d_xLegendreP, IMG_SIZE, k, Mmax, n, kXList_prev);
        LegendreMoment4 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
            d_xLegendreP, IMG_SIZE, k, Mmax, n);
    }
    CUDA_CHECK_RETURN(cudaGetLastError());
    // Blocking copy; also synchronizes with all preceding kernel launches.
    CUDA_CHECK_RETURN(
        cudaMemcpy(xLegendreP, d_xLegendreP, width * (Mmax + 1) * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK_RETURN(cudaEventRecord(d_total_end, 0));
    printf("\n");
    for (int n = 0; n <= Mmax; n++)
        printf("row %2d:%8.4f %8.4f %8.4f %8.4f\n", n, xLegendreP[n * width + 0],xLegendreP[n * width + 1],xLegendreP[n * width + 2],xLegendreP[n * width + 3]);
    // BUG FIX: matlab_result holds Mmax + 1 rows (orders 0..Mmax); the
    // original loop bound `i < Mmax` never checked the last row.
    for (int i = 0; i <= Mmax; i++)
        for (int j = 0; j < 4; j++)
            if (fabsf(xLegendreP[i*width+j] - matlab_result[i][j]) > TOL) {printf("mismatch at %d, %d\n", i, j); return 0;}
    CUDA_CHECK_RETURN(cudaEventSynchronize(d_total_end));
    float gpuTime = 0.0;
    CUDA_CHECK_RETURN(cudaEventElapsedTime(&gpuTime, d_total_begin, d_total_end));
    printf(">>>Elapsed GPU Time is : %f ms\n", gpuTime);
    printf("Freeing memory on device...\n");
    CUDA_CHECK_RETURN(cudaEventDestroy(d_total_begin));
    CUDA_CHECK_RETURN(cudaEventDestroy(d_total_end));
    CUDA_CHECK_RETURN(cudaFree(kXList));
    CUDA_CHECK_RETURN(cudaFree(kXList_temp));
    // BUG FIX: kXList_prev was allocated but never freed (device memory leak).
    CUDA_CHECK_RETURN(cudaFree(kXList_prev));
    CUDA_CHECK_RETURN(cudaFree(kXList_1));
    CUDA_CHECK_RETURN(cudaFree(kXList_2));
    CUDA_CHECK_RETURN(cudaFree(d_xLegendreP));
    delete[] xLegendreP;  // BUG FIX: host buffer was never released
    printf("Exiting program...\n");
    return 0;
}
$ nvcc -o t1820 t1820.cu
$ ./t1820
Time kernel launch...
Allocating space on device...
Copying data from host to device...
Launching kernel...
row 0: 23.0000 23.0000 23.0000 23.0000
row 1:-22.9766 -22.9316 -22.8867 -22.8418
row 2: 22.9297 22.7952 22.6610 22.5270
row 3:-22.8596 -22.5914 -22.3245 -22.0590
row 4: 22.7663 22.3211 21.8799 21.4425
row 5:-22.6501 -21.9856 -21.3303 -20.6839
row 6: 22.5111 21.5864 20.6798 19.7912
row 7:-22.3496 -21.1254 -19.9335 -18.7734
row 8: 22.1660 20.6046 19.0967 17.6411
row 9:-21.9606 -20.0265 -18.1756 -16.4058
row 10: 21.7339 19.3937 17.1772 15.0802
row 11:-21.4862 -18.7090 -16.1086 -13.6777
row 12: 21.2181 17.9757 14.9778 12.2124
row 13:-20.9301 -17.1971 -13.7931 -10.6992
row 14: 20.6228 16.3766 12.5630 9.1531
row 15:-20.2967 -15.5179 -11.2963 -7.5893
row 16: 19.9525 14.6250 10.0023 6.0232
row 17:-19.5909 -13.7018 -8.6902 -4.4700
row 18: 19.2126 12.7524 7.3691 2.9445
row 19:-18.8183 -11.7810 -6.0485 -1.4611
row 20: 18.4087 10.7920 4.7374 0.0335
row 21:-17.9848 -9.7895 -3.4449 1.3252
row 22: 17.5472 8.7781 2.1797 -2.6030
row 23:-17.0968 -7.7620 -0.9503 3.7890
row 24: 16.6345 6.7456 -0.2352 -4.8734
row 25:-16.1611 -5.7332 1.3692 5.8475
row 26: 15.6776 4.7291 -2.4445 -6.7041
row 27:-15.1848 -3.7374 3.4546 7.4376
row 28: 14.6836 2.7622 -4.3935 -8.0435
row 29:-14.1751 -1.8075 5.2558 8.5190
row 30: 13.6600 0.8770 -6.0369 -8.8629
row 31:-13.1395 0.0255 6.7328 9.0754
row 32: 12.6143 -0.8967 -7.3404 -9.1581
row 33:-12.0855 1.7332 7.8571 9.1141
row 34: 11.5540 -2.5319 -8.2813 -8.9481
row 35:-11.0207 3.2900 8.6122 8.6658
row 36: 10.4866 -4.0049 -8.8495 -8.2743
row 37: -9.9525 4.6742 8.9939 7.7819
row 38: 9.4195 -5.2957 -9.0468 -7.1977
row 39: -8.8884 5.8677 9.0103 6.5318
row 40: 8.3601 -6.3885 -8.8873 -5.7951
>>>Elapsed GPU Time is : 1.223776 ms
Freeing memory on device...
Exiting program...
$
I'm not suggesting the above code is defect-free or suitable for any particular purpose. It is mostly your code. I've made some changes to demonstrate the need for global sync that is inherent in your approach.
Related
Edge detection in 24-bit bmp using C
how are you? I have a question about an algorithm that I've been learning. It is the relatively simple algorithm, which helps to detect the edges of the image. In summary, the algorithm works like this: it takes a 24-bit .bmp image of arbitrary dimensions and applies the Sobel operator to detect the edges in the image. I almost managed to get satisfactory results with the code below. #include <stdio.h> #include <string.h> #include <stdlib.h> #include <stdint.h> #include <math.h> [![#pragma pack(push, 1) typedef struct { char bitmapSignatureBytes\[2\]; uint32_t sizeOfBitmapImageBytes; uint16_t reserved1; uint16_t reserved2; uint32_t pixelOffset;][1]][1] }bmpFileHeader; #pragma pack(pop) #pragma pack(push, 1) typedef struct { uint32_t dib_header_size; // DIB Header size in bytes (40 bytes) int32_t width; // Width of the image int32_t height; // Height of image uint16_t num_planes; // Number of color planes uint16_t bits_per_pixel; // Bits per pixel uint32_t compression; // Compression type uint32_t image_size_bytes; // Image size in bytes int32_t x_resolution_ppm; // Pixels per meter int32_t y_resolution_ppm; // Pixels per meter uint32_t num_colors; // Number of colors uint32_t important_colors; // Important colors }bmpInfoHeader; #pragma pack(pop) #pragma pack(push,1) typedef struct { uint8_t blue; uint8_t green; uint8_t red; }pixel; #pragma pack(pop) int randNum(void); int main(void){ bmpFileHeader myBmpFileHeader; bmpInfoHeader myBmpInfoHeader; FILE *bmpImage = fopen("work.bmp", "rb"); FILE *newBmpImage = fopen("border_work.bmp", "wb"); if (bmpImage == NULL) { printf("Error occured when opening file\n"); } fread(&myBmpFileHeader, sizeof(myBmpFileHeader), 1, bmpImage); fread(&myBmpInfoHeader, sizeof(myBmpInfoHeader), 1, bmpImage); if (myBmpFileHeader.bitmapSignatureBytes[0]==0x42 && myBmpFileHeader.bitmapSignatureBytes[1]==0x4D && myBmpInfoHeader.dib_header_size == 40 && myBmpInfoHeader.bits_per_pixel == 24 && myBmpInfoHeader.compression ==0 ) { printf(" File is 
probably BMP\n"); }else{ printf("Error\n"); } int width = myBmpInfoHeader.width; //printf("Width %i\n", width ); int height = abs(myBmpInfoHeader.height); //printf("Height: %i\n", height ); pixel(*image)[width] = calloc(height, width * sizeof(pixel)); pixel(*image_blur)[width] = calloc(height, width * sizeof(pixel)); int padding = (4 - (width * sizeof(pixel)) % 4) % 4; for (int i = 0; i < height; ++i) { fread(image[i], sizeof(pixel), width, bmpImage); fseek(bmpImage, padding, SEEK_CUR); } int gx[3][3]; int gy[3][3]; gx[0][0] = -1; gx[0][1] = 0; gx[0][2] = 1; gx[1][0] = -2; gx[1][1] = 0; gx[1][2] = 2; gx[2][0] = -1; gx[2][1] = 0; gx[2][2] = 1; gy[0][0] = -1; gy[0][1] = -2; gy[0][2] = -1; gy[1][0] = 0; gy[1][1] = 0; gy[1][2] = 0; gy[2][0] = 1; gy[2][1] = 2; gy[2][2] = 1; int gxValBlue; int gyValBlue; int gxValGreen; int gyValGreen; int gxValRed; int gyValRed; int squaredBlue; int squaredGreen; int squaredRed; for (int lin = 0; lin < height; ++lin) { for (int col = 0; col < width; ++col) { if (lin !=0 && lin != height && col != 0 && col != width)// tem todos os vizinhos { gxValBlue = (image[lin-1][col-1].blue * gx[0][0] + image[lin-1][col].blue * gx[0][1] + image[lin-1][col+1].blue * gx[0][2] + image[lin][col-1].blue * gx[1][0] + image[lin][col].blue * gx[1][1] + image[lin][col+1].blue * gx[1][2] + image[lin-1][col-1].blue * gx[2][0] + image[lin+1][col].blue * gx[2][1] + image[lin+1][col+1].blue * gx[2][2]); gyValBlue = (image[lin-1][col-1].blue * gy[0][0] + image[lin-1][col].blue * gy[0][1] + image[lin-1][col+1].blue * gy[0][2] + image[lin][col-1].blue * gy[1][0] + image[lin][col].blue * gy[1][1] + image[lin][col+1].blue * gy[1][2] + image[lin-1][col-1].blue * gy[2][0] + image[lin+1][col].blue * gy[2][1] + image[lin+1][col+1].blue * gy[2][2]); squaredBlue = (int)sqrt(gxValBlue*gxValBlue + gyValBlue*gyValBlue); gxValGreen = (image[lin-1][col-1].green * gx[0][0] + image[lin-1][col].green * gx[0][1] + image[lin-1][col+1].green * gx[0][2] + image[lin][col-1].green * 
gx[1][0] + image[lin][col].green * gx[1][1] + image[lin][col+1].green * gx[1][2] + image[lin-1][col-1].green * gx[2][0] + image[lin+1][col].green * gx[2][1] + image[lin+1][col+1].green * gx[2][2]); gyValGreen = (image[lin-1][col-1].green * gy[0][0] + image[lin-1][col].green * gy[0][1] + image[lin-1][col+1].green * gy[0][2] + image[lin][col-1].green * gy[1][0] + image[lin][col].green * gy[1][1] + image[lin][col+1].green * gy[1][2] + image[lin-1][col-1].green * gy[2][0] + image[lin+1][col].green * gy[2][1] + image[lin+1][col+1].green * gy[2][2]); squaredGreen = (int)sqrt(gxValGreen*gxValGreen + gyValGreen*gyValGreen); gxValRed = (image[lin-1][col-1].red * gx[0][0] + image[lin-1][col].red * gx[0][1] + image[lin-1][col+1].red * gx[0][2] + image[lin][col-1].red * gx[1][0] + image[lin][col].red * gx[1][1] + image[lin][col+1].red * gx[1][2] + image[lin-1][col-1].red * gx[2][0] + image[lin+1][col].red * gx[2][1] + image[lin+1][col+1].red * gx[2][2]); gyValRed = (image[lin-1][col-1].red * gy[0][0] + image[lin-1][col].red * gy[0][1] + image[lin-1][col+1].red * gy[0][2] + image[lin][col-1].red * gy[1][0] + image[lin][col].red * gy[1][1] + image[lin][col+1].red * gy[1][2] + image[lin-1][col-1].red * gy[2][0] + image[lin+1][col].red * gy[2][1] + image[lin+1][col+1].red * gy[2][2]); squaredRed = (int)sqrt(gxValRed*gxValRed + gyValRed*gyValRed); if (squaredBlue > 255) { image_blur[lin][col].blue = 255; }else{ image_blur[lin][col].blue = squaredBlue; } if (squaredGreen > 255) { image_blur[lin][col].green = 255; }else{ image_blur[lin][col].green = squaredGreen; } if (squaredRed > 255) { image_blur[lin][col].red = 255; }else{ image_blur[lin][col].red = squaredRed; } }else { // bottom image_blur[lin][col].blue = 0; image_blur[lin][col].green = 0; image_blur[lin][col].red = 0; } } } fwrite(&myBmpFileHeader, sizeof(myBmpFileHeader),1, newBmpImage); fwrite(&myBmpInfoHeader, sizeof(myBmpInfoHeader), 1, newBmpImage); for (int i = 0; i < width; ++i) { for (int k = 0; k < padding; ++k) { 
fputc(0x00, newBmpImage); } fwrite(image_blur[i], sizeof(pixel), width, newBmpImage); } fclose(newBmpImage); fclose(bmpImage); free(image); free(image_blur); return 0; } I am also sending an example of the original image and modified image. As you can see, the modified image is cropped. The two images have the same dimensions, but the modified image appears cropped. My assumptions that it may be happening: misuse of calloc () that is not providing enough memory to store the modified image problem with padding I have had this problem for a long time and I would like to ask the community for help to resolve this issue and raise my level of proficiency in C.
The reason for the cropped output is what user3386109 said: when writing the output BMP, the outer loop for (int i = 0; i < width; ++i) should iterate up to height, not width. BMPs are stored starting from the bottom row, this is why a portion of the top of the image went missing. A minor comment about the filtering: there is a check that appears to intend to exclude a one-pixel margin for boundary handling, if (lin !=0 && lin != height && col != 0 && col != width) Beware there is an off-by-one error on the right and bottom edges. Since lin iterates for lin < height, the bottom row is lin == height - 1, not height. Similarly col == width - 1 is the rightmost column.
OpenCL Kernel implementing im2col with batch
I am trying to adapt a secuential function writen for CPU to an OpenCL kernel for GPU. The function is the well known im2col used in many deep learning applications. I have found some code on the OpenCV repository implementing this im2col function written in OpenCL but the one that I have to adapt uses a batch that confuses me and seems to be a bit different. What should I change on the OpenCL kernel to make it work the same on GPU as it does on the CPU function? CPU code int fn_im2col_cpu(int I, int WI, int HI, int B, int KW, int KH, int WO, int HO, int PW, int PH, int SW, int SH, type *in_ptr, type *out_ptr) { PROFILING_HEADER_EXTERN(im2col); PROFILING_DEVICE(im2col, DEV_CPU); int i; // scrolls input channels int w; // scrolls channel columns (width) int h; // scrolls channel rows (height) int kw; // scrolls filter columns (width) int kh; // scrolls filter rows (height) // we sweep all output pixels, and for each pixel we compute the associated input pixel #pragma omp parallel for private (kh, kw, h, w) for (i = 0; i < I; i++) { size_t out_addr = ((size_t)B * (size_t)WO * (size_t)HO * (size_t)KW * (size_t)KH * (size_t)i); size_t in_addr1 = (size_t)i * (size_t)B * (size_t)WI * (size_t)HI; for (kh = 0; kh < KH; kh++) { for (kw = 0; kw < KW; kw++) { for (h = 0; h < HO; h++) { int hi = h * SH - PH + kh; size_t in_addr2 = in_addr1 + ((size_t)hi * (size_t)B * (size_t)WI); for (w = 0; w < WO; w++) { int wi = w * SW - PW + kw; int force_padding = (wi < 0) || (wi >= WI) || (hi < 0) || (hi >= HI); if (force_padding) { bzero(&out_ptr[out_addr], B*sizeof(type)); } else { int in_addr = in_addr2 + (wi * B); memcpy(&out_ptr[out_addr], &in_ptr[in_addr], B*sizeof(type)); } out_addr+=B; } } } } } return 1; } OpenCL kernel from https://github.com/opencv/opencv/blob/master/modules/dnn/src/opencl/im2col.cl __kernel void im2col(__global const float *im_src, int im_src_offset, int channels, int height_inp, int width_inp, int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, 
int stride_w, int height_out, int width_out, __global float *im_col, int im_col_offset ) { int index = get_global_id(0); if (index >= height_out * width_out * channels) return; int j_out = index % width_out; int i_out = (index / width_out) % height_out; int c_inp = (index / width_out) / height_out; int c_out = c_inp * kernel_h * kernel_w; int i_inp = i_out * stride_h - pad_h; int j_inp = j_out * stride_w - pad_w; im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset; im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset; for (int ki = 0; ki < kernel_h; ++ki) for (int kj = 0; kj < kernel_w; ++kj) { int i = i_inp + ki; int j = j_inp + kj; *im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[ki * width_inp + kj] : 0; im_col += height_out * width_out; } }
Your C version folds the batch into the lowest dimension. The opencl version isn't even using batch. You need to pass in the batch size "B", and change this copy to a block copy (or just do a loop over) by the batch size: for (int b=0; b<B; b++) *(im_col*B+b) = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[(ki * width_inp + kj)*B + b] : 0; to emulate the memcpy(..., B*sizeof(type)). And then just stride B times more: im_col += height_out * width_out * B;
Cuda program for matrix batch multiplication
I am a novice in the field of CUDA program and I am trying to repeat the function of cublasSgemmBatched, which means that I want to perform the matrix-matrix multiplication of a batch of matrices. I try to implement my idea as the following code. #include <stdio.h> __global__ void BatchMulCUDA(float* array1, float* array2, int narray1, int dim, float* result) { int tx = blockIdx.x * blockDim.x + threadIdx.x; if (tx < narray1 * dim) { float temp = 0; int index = tx / dim; #pragma for (int i = 0; i < dim; i++) { temp += array1[tx * dim + i] * array2[index * dim + i]; } result[tx] = temp; } } void BatchMulGPU(float* array1, float* array2, int narray1, int dim, float* result) { dim3 threads(1024, 1); dim3 grid(narray1 / 1024 + 1, 1); int threadsPerBlock = threads.x * threads.y; int blocksPerGrid = grid.x * grid.y; printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); BatchMulCUDA<<<grid, threads>>>(array1, array2, narray1, dim, result); } However, strangely, I found that I can get the right output before the index 19730. After the element of 19730, the output of GPU is always 0. I do not know what the problem is. The CPU version of my code and test function are as the following. Is there any hardware limitation that I do not realize? 
#include "kernel.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
#include <math.h>

// Abort with a readable message when a CUDA API call fails. The original code
// assigned every return value to a local `err` but never inspected it, so any
// allocation or copy failure went unnoticed.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                 \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Wall-clock time in seconds (microsecond resolution), used for coarse timing
// of the GPU and CPU paths below.
double cpuSecond() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1e-6);
}

// CPU reference implementation mirroring the GPU kernel: for each of the
// narray1*dim output rows, accumulate a dot product of length `dim` between a
// row of array1 and the vector of the owning batch element (index = i / dim).
void BatchMulCPU(float* array1, float* array2, int narray1, int dim, float* result) {
    for (int i = 0; i < narray1 * dim; i++) {
        float temp = 0;
        int index = i / dim;
        for (int j = 0; j < dim; j++) {
            temp += array1[i * dim + j] * array2[index * dim + j];
        }
        result[i] = temp;
    }
}

// Driver: fills random inputs, runs the batched multiply on GPU and CPU, and
// reports timing plus the accumulated absolute difference between the two.
int main(int argc, char** argv) {
    int narray1 = 6980;   // number of batch elements
    int dim = 4;          // side dimension of each matrix / vector length

    // Host buffers: narray1 matrices of dim x dim, narray1 vectors of dim,
    // and two result buffers of narray1 * dim.
    float* array1 = new float[narray1 * dim * dim];
    float* array2 = new float[narray1 * dim];
    float* resultGPU = new float[narray1 * dim];
    float* resultCPU = new float[narray1 * dim];

    float* d_array1;
    float* d_array2;
    float* d_result;

    // Random values in roughly [0, 10).
    for (int i = 0; i < narray1 * dim * dim; i++) {
        array1[i] = static_cast<float>(rand() / (static_cast<float>(RAND_MAX / 10)));
    }
    for (int i = 0; i < narray1 * dim; i++) {
        array2[i] = static_cast<float>(rand() / (static_cast<float>(RAND_MAX / 10)));
    }

    double iStart = cpuSecond();
    CUDA_CHECK(cudaMalloc((void**) &d_array1, narray1 * dim * dim * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**) &d_array2, narray1 * dim * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**) &d_result, narray1 * dim * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_array1, array1, narray1 * dim * dim * sizeof(float),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_array2, array2, narray1 * dim * sizeof(float),
                          cudaMemcpyHostToDevice));

    BatchMulGPU(d_array1, d_array2, narray1, dim, d_result);
    // Kernel launches do not return errors directly; pick up launch-config
    // failures here. The blocking cudaMemcpy below synchronizes execution,
    // so no explicit cudaDeviceSynchronize is needed before reading results.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(resultGPU, d_result, narray1 * dim * sizeof(float),
                          cudaMemcpyDeviceToHost));
    double iElaps = cpuSecond() - iStart;
    printf("Total GPU computation time is %lf \n", iElaps);

    iStart = cpuSecond();
    BatchMulCPU(array1, array2, narray1, dim, resultCPU);
    iElaps = cpuSecond() - iStart;
    printf("Total CPU computation time is %lf \n", iElaps);

    // Accumulate absolute differences. fabsf fixes the original's unqualified
    // abs(), which binds to the integer overload and truncated every
    // sub-1.0 difference to zero, hiding real mismatches.
    float error = 0;
    for (long i = 0; i < narray1 * dim; i++) {
        error += fabsf(resultCPU[i] - resultGPU[i]);
    }
    printf("Error is %f \n", error);

    CUDA_CHECK(cudaFree(d_array1));
    CUDA_CHECK(cudaFree(d_array2));
    CUDA_CHECK(cudaFree(d_result));

    // Fix host-side leaks: the original never released the new[] buffers.
    delete[] array1;
    delete[] array2;
    delete[] resultGPU;
    delete[] resultCPU;
    return 0;
}
Apart from the possibility of a WDDM TDR timeout as discussed in the comments, the code has an error. It's evident that the kernel design expects that a total grid size (total number of threads) will be launched that is equal to or greater than the number of arrays times the side dimension: int tx = blockIdx.x * blockDim.x + threadIdx.x; if (tx < narray1 * dim) i.e. narray1*dim is the needed number of threads. However the number being launched is only narray1: dim3 threads(1024, 1); dim3 grid(narray1 / 1024 + 1, 1); If we change the last line above to: dim3 grid((narray1*dim) / 1024 + 1, 1); this code design error will be addressed. The reason the code works correctly for a small number of matrices (anything up to 256) is because of the rounding-up effect in the grid sizing to a minimum of 1024 threads, which is 256*4 (narray1 * dim). As an aside, this code is not functionally similar to cublasSgemmBatched from what I can see. I don't recognize this code as being any matrix multiplication (matrix dot product) that I am familiar with.
Don't understand how this code is scaling a bmp image
The following code was given to me by my instructor. I just don't understand how this is scaling a bmp image. I know the basics about bmp images (the info on wikipedia). I know that this method is supposed to multiply the rows and cols of the new image by whatever scale is. I tried to run the code by hand but it confused me even more. Any help will be much appreciated. Thanks! int enlarge(PIXEL* original, int rows, int cols, int scale, PIXEL** new, int* newrows, int* newcols) { //scaling the new rows & cols *newcols = cols * scale; *newrows = rows * scale; //memory allocated for enlaged bmp *new = (PIXEL*)malloc(*newrows * *newcols * sizeof(PIXEL)); int row, col, sx, sy; //transverse through every row for (row = 0; row < rows; row++ ) //transvere through every col for (col = 0; col < cols; col++ ){ //im unsure what this is for PIXEL* o = original + (row * cols) + col; for(sy = 0; sy < scale; sy++ ) for(sx = 0; sx < scale; sx++ ) { //im unsure what this is for PIXEL* n = *new + (scale * row) * *newcols + (scale * col) + (sy * *newcols) + sx; *n = *o; } } return 0; } Here is the struct for PIXEL. typedef struct { unsigned char r; unsigned char g; unsigned char b; } PIXEL; There is additional code but I do not think that is needed for this question.
PIXEL* o = original + (row * cols) + col; Here he is retrieving a pointer to the source pixel in the original image; it's just trivial pointer arithmetic, based on the fact that the rows in the bitmap are consecutive in memory. In general, in a C-style matrix width-wide the address of the element (x, y) is beginning + (y * width) + x. Then, he loops over a square scale x scale wide in the target image. for(sy = 0; sy < scale; sy++ ) for(sx = 0; sx < scale; sx++ ) { //im unsure what this is for PIXEL* n = *new + (scale * row) * *newcols + (scale * col) + (sy * *newcols) + sx; The n pointer points to the target pixel in the destination image; if you match the formula above from the source image and rearrange a bit the terms, you'll see he is accessing the new image, at position (scale * col + sx, scale * row + sy) (remember that the new image is *newcols wide). *n = *o; Here he's just copying the source pixel to the target pixel. In practice, he's "expanding" each source pixel into a scale x scale square in the target image.
How to Optimize CUDA Sieve of Eratosthenes [closed]
This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center. Closed 9 years ago. I'm new to CUDA. To get my hands dirty, I tried writing a Sieve of Eratosthenes (for finding all the primes up to some number n). There are a number of things I had to do to get it to work that it seems shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach. To take the entries marked as prime in the isPrime array, I had to do two separate kernel calls. The first counts the number of primes in each threadblock and assigns to each entry i the number of primes in that block less than i. Then I have to make a second call to add in the number of primes in all the previous blocks in order to get the final index. But it's even worse than that, because to avoid heaps of concurrent reads, I had to store the number of primes in the block in a separate array at each of THREADS_PER_BLOCK indices effectively doubling the required memory for the algorithm. It seems like there should be a way to have all the threads read the same value for each block rather than have to copy it so many times. Despite all this, there's still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the value in. Isn't there any way to deal with this? Could anyone look at my code and tell me if there's anything obvious I could do that would be simpler or more efficient? Is there anything I'm doing that's particularly inefficient (besides printing out all the primes at the end of course)? Is it necessary to call synchronize after every kernel call? Do I need to synchronize after memcpy's as well? 
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work? Thank you #include <stdio.h> #include <cuda.h> #include <assert.h> #include <math.h> #define MAX_BLOCKS 256 #define THREADS_PER_BLOCK 256 //Must be a power of 2 #define BLOCK_SPACE 2 * THREADS_PER_BLOCK __global__ void initialize(int* isPrime, int n) { int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; int step = gridDim.x * THREADS_PER_BLOCK; int i; for (i = idx; i <= 1; i += step) { isPrime[i] = 0; } for (; i < n; i += step) { isPrime[i] = 1; } } __global__ void clearMultiples(int* isPrime, int* primeList, int startInd, int endInd, int n) { int yidx = blockIdx.y * blockDim.y + threadIdx.y; int xidx = blockIdx.x * blockDim.x + threadIdx.x; int ystep = gridDim.y * blockDim.y; int xstep = gridDim.x * blockDim.x; for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) { int p = primeList[pnum]; int pstart = p * (p + xidx); int pstep = p * xstep; for (int i = pstart; i < n; i += pstep) { isPrime[i] = 0; } } } __device__ void makeCounts(int* isPrime, int* addend, int start, int stop) { __shared__ int tmpCounts[BLOCK_SPACE]; __shared__ int dumbCounts[BLOCK_SPACE]; int idx = threadIdx.x; tmpCounts[idx] = ((start + idx) < stop) ? 
isPrime[start + idx] : 0; __syncthreads(); int numEntries = THREADS_PER_BLOCK; int cstart = 0; while (numEntries > 1) { int prevStart = cstart; cstart += numEntries; numEntries /= 2; if (idx < numEntries) { int i1 = idx * 2 + prevStart; tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1]; } __syncthreads(); } if (idx == 0) { dumbCounts[cstart] = tmpCounts[cstart]; tmpCounts[cstart] = 0; } while (cstart > 0) { int prevStart = cstart; cstart -= numEntries * 2; if (idx < numEntries) { int v1 = tmpCounts[idx + prevStart]; int i1 = idx * 2 + cstart; tmpCounts[i1 + 1] = tmpCounts[i1] + v1; tmpCounts[i1] = v1; dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart]; } numEntries *= 2; __syncthreads(); } if (start + idx < stop) { isPrime[start + idx] = tmpCounts[idx]; addend[start + idx] = dumbCounts[idx]; } } __global__ void createCounts(int* isPrime, int* addend, int lb, int ub) { int step = gridDim.x * THREADS_PER_BLOCK; for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) { int start = i; int stop = min(i + step, ub); makeCounts(isPrime, addend, start, stop); } } __global__ void sumCounts(int* isPrime, int* addend, int lb, int ub, int* totalsum) { int idx = blockIdx.x; int s = 0; for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) { isPrime[i] += s; s += addend[i]; } if (idx == 0) { *totalsum = s; } } __global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub, int primeStartInd, int primeCount) { int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; int step = gridDim.x * THREADS_PER_BLOCK; for (int i = lb + idx; i < ub; i += step) { int term = isPrime[i]; int nextTerm = i + 1 == ub ? 
primeCount : isPrime[i + 1]; if (term < nextTerm) { primeList[primeStartInd + term] = i; } } } int main(void) { printf("Enter upper bound:\n"); int n; scanf("%d", &n); int *isPrime, *addend, *numPrimes, *primeList; cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int)); assert(t == cudaSuccess); t = cudaMalloc(&addend, n * sizeof(int)); assert(t == cudaSuccess); t = cudaMalloc(&numPrimes, sizeof(int)); assert(t == cudaSuccess); int primeBound = 2 * n / log(n); t = cudaMalloc(&primeList, primeBound * sizeof(int)); assert(t == cudaSuccess); int numBlocks = min(MAX_BLOCKS, (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n); t = cudaDeviceSynchronize(); assert(t == cudaSuccess); int bound = (int) ceil(sqrt(n)); int lb; int ub = 2; int primeStartInd = 0; int primeEndInd = 0; while (ub < n) { if (primeEndInd > primeStartInd) { int lowprime; t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int), cudaMemcpyDeviceToHost); assert(t == cudaSuccess); int numcols = n / lowprime; int numrows = primeEndInd - primeStartInd; int threadx = min(numcols, THREADS_PER_BLOCK); int thready = min(numrows, THREADS_PER_BLOCK / threadx); int blockx = min(numcols / threadx, MAX_BLOCKS); int blocky = min(numrows / thready, MAX_BLOCKS / blockx); dim3 gridsize(blockx, blocky); dim3 blocksize(threadx, thready); clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList, primeStartInd, primeEndInd, n); t = cudaDeviceSynchronize(); assert(t == cudaSuccess); } lb = ub; ub *= 2; if (lb >= bound) { ub = n; } numBlocks = min(MAX_BLOCKS, (ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub); t = cudaDeviceSynchronize(); assert(t == cudaSuccess); sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes); t = cudaDeviceSynchronize(); assert(t == cudaSuccess); int primeCount; t = cudaMemcpy(&primeCount, numPrimes, sizeof(int), 
cudaMemcpyDeviceToHost); assert(t == cudaSuccess); assert(primeCount > 0); primeStartInd = primeEndInd; primeEndInd += primeCount; condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb, ub, primeStartInd, primeCount); t = cudaDeviceSynchronize(); assert(t == cudaSuccess); } int finalprimes[primeEndInd]; t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int), cudaMemcpyDeviceToHost); assert(t == cudaSuccess); t = cudaFree(isPrime); assert(t == cudaSuccess); t = cudaFree(addend); assert(t == cudaSuccess); t = cudaFree(numPrimes); assert(t == cudaSuccess); t = cudaFree(primeList); assert(t == cudaSuccess); for (int i = 0; i < primeEndInd; i++) { if (i % 16 == 0) printf("\n"); else printf(" "); printf("%4d", finalprimes[i]); } printf("\n"); return 0; }
Answering some of your questions. Fix your error checking as defined in the comments. define what you mean by "concurrent reads". You're concerned about this but I'm not sure what you mean by it. Is it necessary to call synchronize after every kernel call? No, it isn't. If your code is not working correctly, synchronizing after every kernel call then doing proper error checking will tell you if any kernels are not launching correctly. Synchronization is generally not needed for relatively simple single-stream programs like this one. The cuda calls that need to synchronize like cudaMemcpy will do this automatically for you. Do I need to synchronize after memcpy's as well? No, cudaMemcpy is synchronous in nature (it will force all cuda calls in the same stream to complete before it begins, and it will not return control to the host thread until the copy is complete.) If you don't want the blocking characteristic (not returning control to the host thread until complete) then you can use the cudaMemcpyAsync version of the call. You would use streams to get around the behavior of forcing all previous cuda calls to complete. Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work? Please define what you mean by "it doesn't work". I compiled your code with THREADS_PER_BLOCK of 512 and 256, and for an upper bound of 1000 it gave the same output in each case.