Related
I need to turn the following implementation of this function into a threaded one.
Yet, I have absolutely no clue on how to do so nor how to approach the problem.
Any tips or orientation is much appreciated.
void compute_target_pixel(int x, int y) {
int i, j, sum = 0;
int delta = (KLEN - 1) / 2;
for (i = -delta; i <= delta; ++i)
for (j = -delta; j <= delta; ++j)
if (0 <= x + i && x + i < WIDTH && 0 <= y + j && y + j < HEIGHT)
sum += filter.values[(i + delta) * KLEN + (j + delta)] * pixels[(x + i) * HEIGHT + (y + j)];
if(filter.sum > 0) target[x * HEIGHT + y] = sum / filter.sum;
else target[x * HEIGHT + y] = sum;
}
I ve done a program that take an number n as input and then return a square matrix n*n with the property that all row, columns, and diagonals have the same sum.
The project works without problem, and I tried to optimize it as much as i can, from the algorithm to uses the specific data type for this(in my case unsigned short, cause i didn t need a bigger storage).
After all i tried to see the performance and i wanted to try it with a bigger number like 100,200, so on;
But when i tried to change the storage of matrix the program didn t work properly and returned a matrix with 0 and the sum was strange.
I don t understand from where is this bug.
#include <stdio.h>
#include <stdlib.h>
unsigned short a[100][100], i = 0, j = 0, n, suma[100];
void next_b(unsigned short *i, unsigned short *j); // find the properly i and j
void completare(unsigned short i, unsigned short j); // completes the matrix after i find the i and j
void tipar(); // print the matrix
int suma_linie(unsigned short x); //sum of a row
int suma_coloana(unsigned short y); //sum of a column
int suma_diagonala_principala(); //first diagonal
int suma_diagonala_secundara(); //second one
int main()
{
scanf("%hu", &n);
system("cls");
j = n / 2 - 1;
a[0][j] = 4;
a[0][j + 1] = 1;
a[1][j] = 2;
a[1][j + 1] = 3;
suma[0] = 5;
suma[1] = 5;
suma[n + j] = 6;
suma[n + j + 1] = 4;
for (int x = 2; x <= (n / 2) * (n / 2); x++)
{
next_b(&i, &j);
a[i][j] = x;
completare(i, j);
}
tipar();
//for(int x=0;x<n;x++){
//
// printf("suma de pe linia %d este: %d\n",x,suma_linie(x));
// printf("suma de pe coloana %d este: %d\n\n",x,suma_coloana(x));
//}
//printf("suma de pe daig principala este: %d\n\n",suma_diagonala_principala());
// printf("suma de pe daig secundara este: %d\n\n",suma_diagonala_secundara());
for (int x = 0; x < 2 * n + 2; x++)
{
if (x < n)
{
printf("suma de pe linia %d este %hu\n", x, suma[x]);
}
else if (x < 2 * n)
{
printf("suma de pe coloana %d este %hu\n", x % n, suma[x]);
}
else if (x == 2 * n)
{
printf("suma de pe diag principala este %hu\n", suma[x]);
}
else
{
printf("suma de pe diag secundara este %hu\n", suma[x]);
}
}
return 0;
}
void tipar()
{
for (int k = 0; k < n; k++)
{
for (int l = 0; l < n; l++)
{
if (a[k][l] < 10)
{
printf(" %d |", a[k][l]);
}
else if (a[k][l] <= 99)
{
printf(" %d |", a[k][l]);
}
else if (a[k][l] < 1000)
{
printf(" %d |", a[k][l]);
}
else if (a[k][l] < 10000)
{
printf("%d ", a[k][l]);
}
}
printf("\n");
for (int z = 0; z <= 6 * n - 1; z++)
{
printf("-");
}
printf("\n");
}
printf("\n");
}
void next_b(unsigned short *i, unsigned short *j)
{
if (*i - 2 < 0)
{
if (a[n - 2][*j + 2] == 0 && *j + 2 <= n - 2)
{
// printf("cazul 2\n");
*i = n - 2;
*j += 2;
return;
}
else if (a[*i - 2][*j] == 0)
{
// printf("cazul 7\n");
*i += 2;
return;
}
}
else
{
if (*j == n - 2)
{ //printf("cazul 3\n");
*i -= 2;
*j = 0;
return;
}
else if (a[*i - 2][*j + 2] != 0)
{
//printf("cazul 4\n");
*i += 2;
}
else if (a[*i - 2][*j + 2] == 0)
{
// printf("cazul 5\n");
*i -= 2;
*j += 2;
}
}
}
void completare(unsigned short i, unsigned short j)
{
if (i <= n / 2)
{ //////////// l
if (i == n / 2 - 1 && j == n / 2 - 1)
{
a[i][j + 1] = 4 * a[i][j];
a[i + 1][j] = 4 * a[i][j] - 2;
a[i + 1][j + 1] = 4 * a[i][j] - 1;
a[i][j] = 4 * a[i][j] - 3;
}
else
{
a[i][j] = 4 * a[i][j];
a[i][j + 1] = a[i][j] - 3;
a[i + 1][j] = a[i][j] - 2;
a[i + 1][j + 1] = a[i][j] - 1;
}
}
else if (i == n / 2 + 1)
{ ///////////// u
if (j == n / 2 - 1)
{
a[i][j] = 4 * a[i][j];
a[i][j + 1] = a[i][j] - 3;
a[i + 1][j] = a[i][j] - 2;
a[i + 1][j + 1] = a[i][j] - 1;
}
else
{
a[i][j + 1] = 4 * a[i][j];
a[i + 1][j] = 4 * a[i][j] - 2;
a[i + 1][j + 1] = 4 * a[i][j] - 1;
a[i][j] = 4 * a[i][j] - 3;
}
}
else
{ ///////x
a[i][j + 1] = 4 * a[i][j];
a[i + 1][j + 1] = 4 * a[i][j] - 2;
a[i + 1][j] = 4 * a[i][j] - 1;
a[i][j] = 4 * a[i][j] - 3;
}
suma[i] += a[i][j] + a[i][j + 1];
suma[i + 1] += a[i + 1][j] + a[i + 1][j + 1];
suma[n + j] += a[i][j] + a[i + 1][j];
suma[n + j + 1] += a[i][j + 1] + a[i + 1][j + 1];
if (i == j)
{
suma[2 * n] += a[i][j] + a[i + 1][j + 1];
}
if (i + j + 1 == n - 1)
{
suma[2 * n + 1] += a[i + 1][j] + a[i][j + 1];
}
}
int suma_linie(unsigned short x)
{
int s = 0;
for (int y = 0; y < n; y++)
{
s += a[x][y];
}
return s;
}
int suma_coloana(unsigned short y)
{
int s = 0;
for (int x = 0; x < n; x++)
{
s += a[x][y];
}
return s;
}
int suma_diagonala_principala()
{
int s = 0;
for (int x = 0; x < n; x++)
{
s += a[x][x];
}
return s;
}
int suma_diagonala_secundara()
{
int s = 0;
for (int x = 0; x < n; x++)
{
s += a[x][n - x - 1];
}
return s;
}
The code is solve with an algorithm that i found on Wikipedia-Magic Square.
In the above cod if i try to change suma size to 200 for example or any other value, the program works strange and returns stupid things.
It s valid even i set a size higher.
I used two ways to see the sum, one with a predefined functions and the other adding the matrix element to suma, the functions used int and the other method utilize unsigned short if its matters.
I checked your code and found some problems. I tried with n = 90.
for(int x=2; x<=(n/2)*(n/2); x++)
{
next_b(&i,&j);
a[i][j]=x;
completare(i,j);
}
If you check this code then you can see when the value of i, j become 0, 2 accordingly then it goes to next_b(i, j) method. And there are some lines which are trying to access negative indexes. So it throws exception.
Like -
else if(a[*i-2][*j]==0) // here
{
// printf("cazul 7\n");
*i+=2;
return;
}
.............................................
if(*j==n-2)
{
//printf("cazul 3\n");
*i-=2;
*j=0;
return;
}
else if (a[*i-2][*j+2]!=0) // here
{
//printf("cazul 4\n");
*i+=2;
}
else if(a[*i-2][*j+2]==0) // here
{
//printf("cazul 5\n");
*i-=2;
*j+=2;
}
Try to fix these negative indexing and then it should work (Assuming your approach is correct).
Same issue as post (Cuda - Multiple sums in each vector element). How do you perform 2D block striding in both x- and y-direction with varying summation limits. The 2D algorithm can be seen in the CPU and monolithic kernel. I included openmp for the CPU so as to get a more fair speedup result. If there is a way to increase the speed of the CPU function as well I would be happy to find out.
This version of the code takes a 2D array and flattens it to a 1D array. I still use the 2D thread dim3 indexing so I can index the double summations more intuitively.
(p.s. all credit to user Robert Crovella for the 1D striding code.)
The code so far is,
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <sys/time.h>
typedef double df;
#define USECPSEC 1000000ULL
#define BSX 1<<5
#define BSY 1<<5
#define N 100
#define M 100
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
if (use_sync == sync) cudaDeviceSynchronize();
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
int divUp(int a, int b) {return (a + b - 1) / b;}
float cpu_sum(int n, int m, df *a, df *b, df *c) {
df q, r;
#pragma omp parallel for collapse(2)
for (int x = 0; x < n; x++) {
for (int y = 0; y < m; y++) {
q = 0.0f;
for (int i = 0; i <= x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x - i) * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x - i) * n + y + j]
+ a[i * n + y + j] * b[(x - i) * n + j];
}
q += r;
}
for (int i = 1; i < n-x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x + i) * n + y - j]
+ a[(x + i) * n + j] * b[ i * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x + i) * n + y + j]
+ a[(x + i) * n + y + j] * b[(x + i) * n + j]
+a[(x + i) * n + j] * b[i * n + y + j]
+ a[(x + i) * n + y + j] * b[i * n + j];
}
q += r;
}
c[x * N + y] = 0.25f*q;
}
}
return 0;
}
const int P2 = 5;
const int TPB = 1<<P2;
const unsigned row_mask = ~((0xFFFFFFFFU>>P2)<<P2);
__global__ void chebyprod_imp(int n, int m, df *a, df *b, df *c){
__shared__ df sdata[TPB*TPB];
int x = blockIdx.x;
int y = blockIdx.y;
int row_width_x = (((x)>(n-x))?(x):(n-x))+1;
int row_width_y = (((y)>(m-y))?(y):(m-y))+1;
int strides_x = (row_width_x>>P2) + ((row_width_x&row_mask)?1:0);
int strides_y = (row_width_y>>P2) + ((row_width_y&row_mask)?1:0);
int i = threadIdx.x;
df tmp_a;
df sum = 0.0f;
for (int s=0; s < strides_x; s++) { // block-stride x loop
int j = threadIdx.y;
for (int u=0; u < strides_y; u++) { // block-stride y loop
if (i < n && j < m) {tmp_a = a[i * n + j];}
if (i <= x) {
if (j <= y) {sum += tmp_a * b[(x - i) * n + y - j];}
if ((j > 0) && (j < (m-y))) {sum += tmp_a * b[(x - i) * n + y + j]
+ a[i * n + y + j] * b[(x - i) * n + j];}
}
if ((i > 0) && (i < (n-x))) {
if (j <= y) {sum += tmp_a * b[(x + i) * n + y - j]
+ a[(x + i) * n + j] * b[ i * n + y - j];}
if ((j > 0) && (j < (m-y))) {sum += tmp_a * b[(x + i) * n + y + j]
+ a[(x + i) * n + y + j] * b[(x + i) * n + j]
+ a[(x + i) * n + j] * b[i * n + y + j]
+ a[(x + i) * n + y + j] * b[i * n + j];}
}
j += TPB;
}
i += TPB;
}
sdata[threadIdx.x * TPB + threadIdx.y] = sum;
for (int s = TPB>>1; s > 0; s>>=1) { // sweep reduction in x
for (int u = TPB>>1; u > 0; u>>=1) { // sweep reduction in x
__syncthreads();
if (threadIdx.x < s && threadIdx.y < u) {
sdata[threadIdx.x * TPB + threadIdx.y] += sdata[(threadIdx.x + s) * TPB + threadIdx.y + u];
}
}
}
if (!threadIdx.x && !threadIdx.y) c[x * n + y] = 0.25f*sdata[0];
}
__global__ void chebyprod(int n, int m, df *a, df *b, df *c){
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
df q, r;
if (x < n && y < m) {
q = 0.0f;
for (int i = 0; i <= x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x - i) * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x - i) * n + y + j]
+ a[i * n + y + j] * b[(x - i) * n + j];
}
q += r;
}
for (int i = 1; i < n-x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x + i) * n + y - j]
+ a[(x + i) * n + j] * b[ i * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x + i) * n + y + j]
+ a[(x + i) * n + y + j] * b[(x + i) * n + j]
+a[(x + i) * n + j] * b[i * n + y + j]
+ a[(x + i) * n + y + j] * b[i * n + j];
}
q += r;
}
c[x * N + y] = 0.25f*q;
}
}
int main(void){
int size = N*M*sizeof(df);
df *a, *b, *c, *cc, *ci, *d_a, *d_b, *d_c, *d_ci;
a = (df*)malloc(size);
b = (df*)malloc(size);
c = (df*)malloc(size);
cc = (df*)malloc(size);
ci = (df*)malloc(size);
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
cudaMalloc(&d_ci, size);
#pragma omp parallel for collapse (2)
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
a[i * M + j] = 0.1f;
b[i * M + j] = 0.2f;
}
}
unsigned long long dt = dtime_usec(0);
// Perform chebyprod on N elements
cpu_sum(N, M, a, b, cc);
dt = dtime_usec(dt,sync);
printf("Time taken 2D CPU: %fs\n", dt/(float)USECPSEC);
df dtc = dt/(float)USECPSEC;
std::cout << "Vector cc: [ ";
for (int k = 0; k < 10; ++k)
std::cout << cc[k] << " ";
std::cout <<"]\n";
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
dim3 dimBlock(BSX, BSY);
dim3 dimGrid(divUp(N, BSX), divUp(M, BSY));
//std::cout << "dimBlock: " << dimBlock << "\n dimGrid: " << dimGrid << "\n";
dt = dtime_usec(0);
// Perform chebyprod on N elements
chebyprod<<< dimBlock, dimGrid >>>(N, M, d_a, d_b, d_c);
dt = dtime_usec(dt,sync);
printf("Time taken 2D monolithic kernel: %fs\n", dt/(float)USECPSEC);
printf("Speedup: %fs\n", dtc/(dt/(float)USECPSEC));
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
std::cout << "Vector c: [ ";
for (int k = 0; k < 10; ++k)
std::cout << c[k] << " ";
std::cout <<"]\n";
dt = dtime_usec(0);
// Perform chebyprod on N elements
chebyprod_imp<<< dimBlock, dimGrid >>>(N, M, d_a, d_b, d_ci);
dt = dtime_usec(dt,sync);
printf("Time taken 2D stride kernel: %fs\n", dt/(float)USECPSEC);
cudaMemcpy(ci, d_ci, size, cudaMemcpyDeviceToHost);
std::cout << "Vector ci: [ ";
for (int k = 0; k < 10; ++k)
std::cout << ci[k] << " ";
std::cout <<"]\n";
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cudaFree(d_ci);
free(a);
free(b);
free(c);
free(cc);
free(ci);
}
For me, anyway, the results for the CPU code don't match between the cases where I compile with OpenMP support and without, if I omit -O3. I seem to get the correct results with OpenMP compilation if I also specify -O3. I'm not sure why that should matter for correctness, although it obviously has an impact on CPU code performance.
You seem to have gotten your grid and block sizing backwards:
chebyprod<<< dimBlock, dimGrid >>>(....
the first kernel config parameter is the grid dimension, not the block dimension. I'm not sure how this came about since you had it done correctly in your previous question.
As in the previous question, we need to pick a thread strategy and implement it correctly. You seemed to be confused about striding, so hopefully the code below will clarify things. The thread strategy I will use here is one warp per output point. A warp is a group of threads with a dimension of 32 (threads) in the x direction, and 1 in the y direction. Therefore the loop striding will be by an increment of 32 in the x direction, but only 1 in the y direction, to cover the entire space. The choice of thread strategy also affects grid sizing.
You seem to have jumbled the relationships that I think should exist for the two dimensions. The x direction, N, and n should all be connected. Likewise the y direction, M and m should all be connected (for example, M is the dimension in the y direction).
When it comes to 2D threadblocks, we want to arrange indexing for coalescing on the GPU such that the index that includes threadIdx.x is not multiplied by anything. (A simplified statement of coalescing is that we want adjacent threads in the warp to access adjacent elements in memory. Since threadIdx.x increases by 1 as we go from thread to thread in the warp, we want to use this characteristic to generate adjacent memory indexing. If we multiply threadIdx.x by anything except 1, we break the pattern.) You have this reversed - where the index including threadIdx.x is typically multiplied by the row dimension (N, or n). This really cannot be correct, and also does not make for good coalesced access. To solve this, we want to transpose our indexing and also transpose the data storage for a and b (and therefore c). In the code below, I have tranposed the indexing for the data setup for a and b, and also the relevant indexing has been transposed in the striding kernel (only). In your non-striding kernel and also your CPU version, I have not transposed the indexing, I leave that as an exercise for you, if needed. For the results, numerically, it does not matter, because your entire a matrix has the same value at every location, and a similar statement can be made about your b matrix. Numerically, then, for this example code, transposing (or not) has no bearing on the result. But it matters for performance (of the striding kernel, at least). Also note that I believe performing the indexing "transpose" on the "monolithic" kernel should also improve its performance. I don't know if it would affect the performance of the CPU version.
I've also added back in the const __restrict__ usage that I included in my previous answer. According to my testing, on "smaller" GPUs this provides noticeable performance benefit. It's not strictly necessary for correctness, however. Here's a worked example with the above changes that gives numerically matching results for all 3 test cases:
$ cat t1498.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>
#include <sys/time.h>
typedef double df;
#define USECPSEC 1000000ULL
#define BSX 1<<5
#define BSY 1<<5
#define N 100
#define M 100
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
if (use_sync == sync) cudaDeviceSynchronize();
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
int divUp(int a, int b) {return (a + b - 1) / b;}
void cpu_sum(int n, int m, df *a, df *b, df *c) {
df q, r;
#pragma omp parallel for collapse(2)
for (int x = 0; x < n; x++) {
for (int y = 0; y < m; y++) {
q = 0.0f;
for (int i = 0; i <= x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x - i) * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x - i) * n + y + j]
+ a[i * n + y + j] * b[(x - i) * n + j];
}
q += r;
}
for (int i = 1; i < n-x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x + i) * n + y - j]
+ a[(x + i) * n + j] * b[ i * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x + i) * n + y + j]
+ a[(x + i) * n + y + j] * b[(x + i) * n + j]
+a[(x + i) * n + j] * b[i * n + y + j]
+ a[(x + i) * n + y + j] * b[i * n + j];
}
q += r;
}
c[x * N + y] = 0.25f*q;
}
}
}
// choose one warp per output point
const int P2 = 5; // assumes warp size is 32
const unsigned row_mask = ~((0xFFFFFFFFU>>P2)<<P2);
__global__ void chebyprod_imp(int n, int m, const df * __restrict__ a, const df * __restrict__ b, df * __restrict__ c){
int x = blockIdx.x;
int y = threadIdx.y+blockDim.y*blockIdx.y;
int width_x = (((x)>(n-x))?(x):(n-x))+1;
int height_y = (((y)>(m-y))?(y):(m-y))+1;
int strides_x = (width_x>>P2) + ((width_x&row_mask)?1:0);
int strides_y = height_y;
int i = threadIdx.x;
df tmp_a;
df sum = 0.0f;
if ((x < n) && (y < m)){
for (int s=0; s < strides_x; s++) { // warp-stride x loop
for (int j=0; j < strides_y; j++) { // y loop
if (i < n && j < m) {tmp_a = a[j * n + i];}
if (i <= x) {
if (j <= y) {sum += tmp_a * b[(y - j) * n + x - i];}
if ((j > 0) && (j < (m-y))) {sum += tmp_a * b[(y+j) * n + x - i] + a[(y+j)* n + i] * b[j*n+(x - i)];}
}
if ((i > 0) && (i < (n-x))) {
if (j <= y) {sum += tmp_a * b[(y-j) * n + x+i] + a[j*n + (x + i)] * b[(y - j)*n + i];}
if ((j > 0) && (j < (m-y)))
{sum += tmp_a * b[(y+j) * n + x+i]
+ a[(y+j) * n + x + i] * b[j*n+(x + i)]
+ a[j*n + (x + i)] * b[(y+j)*n + i]
+ a[(y+j)*n + x + i] * b[j*n+i];}
}
}
i += 32;
}
// warp-shuffle reduction
for (int offset = warpSize>>1; offset > 0; offset >>= 1)
sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
if (!threadIdx.x) c[y*m+x] = 0.25f*sum;}
}
__global__ void chebyprod(int n, int m, df *a, df *b, df *c){
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
df q, r;
if (x < n && y < m) {
q = 0.0f;
for (int i = 0; i <= x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x - i) * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x - i) * n + y + j]
+ a[i * n + y + j] * b[(x - i) * n + j];
}
q += r;
}
for (int i = 1; i < n-x; i++) {
r = 0.0f;
for (int j = 0; j <= y; j++) {
r += a[i * n + j] * b[(x + i) * n + y - j]
+ a[(x + i) * n + j] * b[ i * n + y - j];
}
for (int j = 1; j < m - y; j++) {
r += a[i * n + j] * b[(x + i) * n + y + j]
+ a[(x + i) * n + y + j] * b[(x + i) * n + j]
+a[(x + i) * n + j] * b[i * n + y + j]
+ a[(x + i) * n + y + j] * b[i * n + j];
}
q += r;
}
c[x * N + y] = 0.25f*q;
}
}
int main(void){
int size = N*M*sizeof(df);
df *a, *b, *c, *cc, *ci, *d_a, *d_b, *d_c, *d_ci;
a = (df*)malloc(size);
b = (df*)malloc(size);
c = (df*)malloc(size);
cc = (df*)malloc(size);
ci = (df*)malloc(size);
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
cudaMalloc(&d_ci, size);
#pragma omp parallel for collapse (2)
for (int j = 0; j < M; j++) {
for (int i = 0; i < N; i++) {
a[j * N + i] = 0.1f;
b[j * N + i] = 0.2f;
}
}
unsigned long long dt = dtime_usec(0);
// Perform chebyprod on N elements
cpu_sum(N, M, a, b, cc);
dt = dtime_usec(dt,sync);
printf("Time taken 2D CPU: %fs\n", dt/(float)USECPSEC);
df dtc = dt/(float)USECPSEC;
std::cout << "Vector cc: [ ";
for (int k = 0; k < 10; ++k)
std::cout << cc[k] << " ";
std::cout <<"]\n";
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
dim3 dimBlock(BSX, BSY);
dim3 dimGrid(divUp(N, BSX), divUp(M, BSY));
//std::cout << "dimBlock: " << dimBlock << "\n dimGrid: " << dimGrid << "\n";
dt = dtime_usec(0);
// Perform chebyprod on N elements
chebyprod<<< dimGrid, dimBlock >>>(N, M, d_a, d_b, d_c);
dt = dtime_usec(dt,sync);
printf("Time taken 2D monolithic kernel: %fs\n", dt/(float)USECPSEC);
printf("Speedup: %fs\n", dtc/(dt/(float)USECPSEC));
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
std::cout << "Vector c: [ ";
for (int k = 0; k < 10; ++k)
std::cout << c[k] << " ";
std::cout <<"]\n";
dt = dtime_usec(0);
// Perform chebyprod on N elements
dim3 dimGrid2(N, (M+dimBlock.y-1)/dimBlock.y);
chebyprod_imp<<< dimGrid2, dimBlock >>>(N, M, d_a, d_b, d_ci);
dt = dtime_usec(dt,sync);
printf("Time taken 2D stride kernel: %fs\n", dt/(float)USECPSEC);
printf("Speedup: %fs\n", dtc/(dt/(float)USECPSEC));
cudaMemcpy(ci, d_ci, size, cudaMemcpyDeviceToHost);
std::cout << "Vector ci: [ ";
for (int k = 0; k < 10; ++k)
std::cout << ci[k] << " ";
std::cout <<"]\n";
df max_error = 0;
for (int k = 0; k < N*M; k++)
max_error = fmax(max_error, fabs(c[k] - ci[k]));
std::cout << "Max diff = " << max_error << std::endl;
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cudaFree(d_ci);
free(a);
free(b);
free(c);
free(cc);
free(ci);
}
$ nvcc -O3 -Xcompiler -fopenmp -arch=sm_52 -o t1498 t1498.cu
$ ./t1498
Time taken 2D CPU: 0.034830s
Vector cc: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D monolithic kernel: 0.033687s
Speedup: 1.033930s
Vector c: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D stride kernel: 0.013526s
Speedup: 2.575041s
Vector ci: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Max diff = 8.52651e-13
$
CUDA 10.1.105, Fedora 29, GTX 960
Note that when we run this same test on a Tesla V100, which can take the most advantage of the "extra" threads available in the striding kernel case, the benefit is more obvious:
$ OMP_NUM_THREADS=32 ./t1498
Time taken 2D CPU: 0.031610s
Vector cc: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D monolithic kernel: 0.018228s
Speedup: 1.734145s
Vector c: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D stride kernel: 0.000731s
Speedup: 43.242137s
Vector ci: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Max diff = 8.52651e-13
If you perform the indexing "transpose" on your monolithic kernel similar to what I have done in the striding kernel, I think you'll end up in a performance situation that is roughly similar to where you ended up in the last question. Little or no performance benefit for the striding kernel over your monolithic kernel on a "small" GPU. ~5x improvement on a "large" GPU.
I have a typical algorithm for matrix multiplication. I am trying to apply and understand loop unrolling, but I am having a problem implementing the algorithm when I am trying to unroll k times when k isn't a multiple of the matrices size. (I get very large numbers as a result instead). That means I am not getting how to handle the remaining elements after unrolling. Here is what I have:
void Mult_Matx(unsigned long* a, unsigned long* b, unsigned long*c, long n)
{
long i = 0, j = 0, k = 0;
unsigned long sum, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
for (i = 0; i < n; i++)
{
long in = i * n;
for (j = 0; j < n; j++)
{
sum = sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = 0;
for (k = 0; k < n; k += 8)
{
sum = sum + a[in + k] * b[k * n + j];
sum1 = sum1 + a[in + (k + 1)] * b[(k + 1) * n + j];
sum2 = sum2 + a[in + (k + 2)] * b[(k + 2) * n + j];
sum3 = sum3 + a[in + (k + 3)] * b[(k + 3) * n + j];
sum4 = sum4 + a[in + (k + 4)] * b[(k + 4) * n + j];
sum5 = sum5 + a[in + (k + 5)] * b[(k + 5) * n + j];
sum6 = sum6 + a[in + (k + 6)] * b[(k + 6) * n + j];
sum7 = sum7 + a[in + (k + 7)] * b[(k + 7) * n + j];
}
if (n % 8 != 0)
{
for (k = 8 * (n / 8); k < n; k++)
{
sum = sum + a[in + k] * b[k * n + j];
}
}
c[in + j] = sum + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
}
}
}
Let's say size aka n is 12. When I unroll it 4 times, this code works, meaning when it never enters the remainder loop. But I am losing track of what's going on when it does! If anyone can direct me where I am going wrong, I'd really appreciate it. I am new to this, and having a hard time figuring out.
A generic way of unrolling a loop on this shape:
for(int i=0; i<N; i++)
...
is
int i;
for(i=0; i<N-L; i+=L)
...
for(; i<N; i++)
...
or if you want to keep the index variable in the scope of the loops:
for(int i=0; i<N-L; i+=L)
...
for(int i=L*(N/L); i<N; i++)
...
Here, I'm using the fact that integer division is rounded down. L is the number of steps you do in the first loop.
Example:
const int N=22;
const int L=6;
int i;
for(i=0; i<N-L; i+=L)
{
printf("%d\n", i);
printf("%d\n", i+1);
printf("%d\n", i+2);
printf("%d\n", i+3);
printf("%d\n", i+4);
printf("%d\n", i+5);
}
for(; i<N; i++)
printf("%d\n", i);
But I recommend taking a look at Duff's device. However, I do suspect that it's not always a good thing to use. The reason is that modulo is a pretty expensive operation.
The condition if (n % 8 != 0) should not be needed. The for header should take care of that if written properly.
I'm working on a neural network, so I have very large data structures. Thus I'm using a pointer to an array in the heap. I have an assignment statement that is always assigning 0
Nothing is out of bounds, and everything is type double
The code snippet looks like:
for( j = 0 ; j < NumHidden ; j++ ) { /* compute hidden unit activations */
*(SumH + p + j) = *(WeightIH + 0) ;
for( i = 0 ; i <= NumInput ; i++ ) {
temp1 = *(Input + game + 0 + i) * *(WeightIH + i + j) ;
temp2 = *(Input + game + 1 + i) * *(WeightIH + i + j) ;
*(SumH + p + j) += temp1 - temp2 ;
}
*(Hidden + p + j) = 1.0/(1.0 + exp(-*(SumH + p + j))) ;
}
in gdb I can prove that the values are non-zero:
117 temp1 = *(Input + game + 0 + i) * *(WeightIH + i + j) ;
(gdb) p *Input
$1 = 0.75454545500000003
(gdb) p *WeightIH
$2 = 0.5
(gdb) n
118 temp2 = *(Input + game + 1 + i) * *(WeightIH + i + j) ;
(gdb) p temp1
$3 = 0
...but as you can see, temp1 is equal to zero after the assignment. What am I missing?
UPDATE
per request:
Breakpoint 1, main () at nn.c:117
117 temp1 = *(Input + game + 0 + i) * *(WeightIH + i + j) ;
(gdb) p *(WeightIH + i + j)
$1 = 0.5
(gdb) p *(Input + game + 0 + i)
$2 = 0.75454545500000003
Here is the entire code:
Some psuedocode:
read in all the inputs into a 3d array
define all structures (all are correct, none go out of bounds)
loop number of patterns
loop number of games (this is a sports model)
compute activation values
compute output values
compute error
back propagate
update weights
/*******************************************************************************
* nn.c 1.0 � JOHN BULLINARIA 2004 *
*******************************************************************************/
/* To compile use "cc nn.c -O -lm -o nn" and then run using "./nn" */
/* For explanations see: http://www.cs.bham.ac.uk/~jxb/NN/nn.html */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <fcntl.h>
#define NUMPAT 101
#define NUMIN 24
#define NUMHID 100
#define NUMOUT 55
#define rando() ((double)rand()/(RAND_MAX+1))
int main() {
int i, j, k, p, np, op, ranpat[NUMPAT], epoch, game;
int NumPattern = NUMPAT, NumInput = NUMIN, NumHidden = NUMHID, NumOutput = NUMOUT;
double temp1, temp2;
double *Input = (double *)malloc(NumOutput*2*NumInput*sizeof(double));
char line[128];
double num;
FILE *csvFile = fopen("inputs.csv", "r");
int oned_count = 0;
int twod_count = 0;
int threed_count = 0;
int count = 0;
if (csvFile){
char *token;
while (fgets(line, 1024, csvFile)){
token = strtok(&line[0], ",");
while(token){
num = atof(token);
*(Input + oned_count + twod_count + threed_count) = num;
token = strtok(NULL, ",");
threed_count++;
}
count++;
if ((count % 2) == 0){
oned_count++;
}
twod_count++;
if (twod_count == 2){
twod_count = 0;
}
}
fclose(csvFile);
}
double Target[55] = {-5 ,25, -3, 2 ,5 ,17, 10, 10 ,3 ,-8, 11 ,-2, -5, 17 ,4 ,4 ,2 ,12, 5 ,-11 ,-4, -9 ,13, -1, 5 ,7 ,5 ,4, 8 ,12 ,-13 ,-2, 3 ,34, -19 ,6, 7 ,-9 ,14, 4 ,3 ,-17 ,3, 6 ,-5, -2, -1, -7, 11, -1, 15, -7 ,7 ,19, 1};
double *SumH =(double *)malloc(NumPattern*NumHidden*sizeof(double));
double *WeightIH =(double *)malloc(NumInput*NumHidden*sizeof(double));
double *Hidden =(double *)malloc(NumPattern*NumHidden*sizeof(double));
double *SumO =(double *)malloc(NumPattern*NumOutput*sizeof(double));
double *WeightHO =(double *)malloc(NumHidden*NumOutput*sizeof(double));
double *Output =(double *)malloc(NumPattern*NumOutput*sizeof(double));
double *DeltaWeightIH =(double *)malloc(NumInput*NumHidden*sizeof(double));
double *DeltaWeightHO = (double *)malloc(NumHidden*NumOutput*sizeof(double));
double DeltaO[NumOutput];
double SumDOW[NumHidden];
double DeltaH[NumHidden];
double Error, eta = 0.10, alpha = 0.9;
//double temps[24] = {-0.901337 -0.872058 -0.765912 -0.904485 -1.01524 ,-1.00116, -1.02088, -0.849757, -0.777824, -0.967258 ,-1.02125, -0.773202, -0.622447 ,-0.576088 ,-0.76714, -0.741354 ,-0.669561, -0.606497 ,-0.670834 ,-0.85477, -0.980444, -1.00685, -0.0365572, -0.000114586};
for( j = 0 ; j < NumHidden ; j++ ) { /* initialize WeightIH and DeltaWeightIH */
for( i = 0 ; i < NumInput ; i++ ) {
*(DeltaWeightIH + i + j) = 0;
// *(WeightIH + i) = test_weights[i] ;
*(WeightIH + i + j) = .5 ;
}
}
for( k = 0 ; k < NumOutput ; k ++ ) { /* initialize WeightHO and DeltaWeightHO */
for( j = 0 ; j < NumHidden ; j++ ) {
*(DeltaWeightHO + j + k) = 0.0 ;
*(WeightHO + j + k) = 1;
}
}
for( epoch = 0 ; epoch < 500000 ; epoch++) { /* iterate weight updates */
for( p = 0 ; p < NumPattern ; p++ ) { /* randomize order of individuals */
ranpat[p] = p ;
}
for( p = 0 ; p < NumPattern ; p++) {
np = rand() % NUMPAT ;
op = ranpat[p] ;
ranpat[p] = ranpat[np] ;
ranpat[np] = op ;
}
Error = 0.0 ;
for( np = 0 ; np < NumPattern ; np++ ) { /* repeat for all the training patterns */
p = ranpat[np];
for (game = 0; game < 55; game++){
for( j = 0 ; j < NumHidden ; j++ ) { /* compute hidden unit activations */
*(SumH + p + j) = *(WeightIH + 0) ;
for( i = 0 ; i < NumInput ; i++ ) {
temp1 = *(Input + game + 0 + i) * *(WeightIH + i + j) ;
temp2 = *(Input + game + 1 + i) * *(WeightIH + i + j) ;
*(SumH + p + j) += temp1 - temp2 ;
}
*(Hidden + p + j) = 1.0/(1.0 + exp(-*(SumH + p + j))) ;
}
for( k = 0 ; k < NumOutput ; k++ ) { /* compute output unit activations and errors */
*(SumO + p + k) = *(WeightHO + 0 + k) ;
for( j = 0 ; j < NumHidden ; j++ ) {
*(SumO + p + k) += *(Hidden + p + j) * *(WeightHO + j + k) ;
}
*(Output + p + k) = 1.0/(1.0 + exp(-*(SumO + p + k))) ; /* Sigmoidal Outputs */
//*(Output + p + k) = (exp(*(Output + p + k)) - exp(-*(Output + p + k))) / (exp(*(Output + p + k)) + exp(-*(Output + p + k))); //TANH
//*(Output + p + k) = (2.0/(1.0 + exp(-*(SumO + p + k)))) - 1 ; //bipolar sigmoid
//*(Output + p + k) = .5 * (1 + 1.0/(1.0 + exp(-*(SumO + p + k)))) * (1 - 1.0/(1.0 + exp(-*(SumO + p + k)))); //derivative sigmoid
//*(Output + p + k) = .5 * (1 + ((2.0/(1.0 + exp(-*(SumO + p + k)))) - 1)) * (1 - ((2.0/(1.0 + exp(-*(SumO + p + k)))) - 1)); //derivative bioolar sigmoid
/* Output[p][k] = SumO[p][k]; L
ear Outputs */
Error += 0.50 * (Target[game] - *(Output + p + k)) * (Target[game] - *(Output + p + k)) ; /* SSE */
//Error -= ( Target[game] * log( *(Output + p + k) ) + ( 1.0 - Target[k] ) * log( 1.0 - *(Output + p + k) ) ) ;
DeltaO[k] = (Target[game] - *(Output + p + k)) * *(Output + p + k) * (1.0 - *(Output + p + k)) ; /* Sigmoidal Outputs, SSE */
//DeltaO[k] = (exp(Target[game]) - exp(-*(Output + p + k))) / (exp(Target[game]) + exp(-*(Output + p + k)))
//DeltaO[k] = Target[k] - *(Output + p+k);
//DeltaO[k] = Target[game] - *(Output +p + k);
}
for( j = 0 ; j < NumHidden ; j++ ) { /* 'back-propagate' errors to hidden layer */
SumDOW[j] = 0.0 ;
for( k = 0 ; k < NumOutput ; k++ ) {
SumDOW[j] += *(WeightHO + j + k) * DeltaO[k] ;
}
DeltaH[j] = SumDOW[j] * *(Hidden + p + j) * (1.0 - *(Hidden + p + j)) ;
}
for( j = 0 ; j < NumHidden ; j++ ) { /* update weights WeightIH */
*(DeltaWeightIH + 0 + j) = eta * DeltaH[j] + alpha * *(DeltaWeightIH + 0 + j) ;
*(WeightIH + 0 + j) += *(DeltaWeightIH + 0 + j) ;
for( i = 0 ; i < NumInput ; i++ ) {
*(DeltaWeightIH + i + j) = eta * *(Input + game + 0 + i) * DeltaH[j] + alpha * *(DeltaWeightIH + i + j);
*(WeightIH + i + j) += *(DeltaWeightIH + i + j) ;
}
}
for( k = 0 ; k < NumOutput ; k ++ ) { /* update weights WeightHO */
*(DeltaWeightHO + 0 + k) = eta * DeltaO[k] + alpha * *(DeltaWeightHO + 0 + k) ;
*(WeightHO + 0 + k) += *(DeltaWeightHO + 0 + k) ;
for( j = 0 ; j < NumHidden ; j++ ) {
*(DeltaWeightHO + j + k) = eta * *(Hidden + p + j) * DeltaO[k] + alpha * *(DeltaWeightHO + j + k) ;
*(WeightHO + j + k) += *(DeltaWeightHO + j + k) ;
}
}
}
}
//if( epoch%10 == 0 ){
fprintf(stdout, "\nEpoch %-10d : Error = %f\n", epoch, Error) ;
// fprintf(stdout, "\nEpoch %-10d : weight1 example = %f", epoch, *(WeightIH)) ;
printf("Input weights:\n");
printf("-------------------\n");
for (i = 0; i < 24; i++){
printf("%G\n", *(WeightIH + i + 100));
}
printf("Hidden weights:\n");
printf("-------------------\n");
for (i = 0; i < NumHidden; i++){
printf("%G\n", *(WeightHO + i + 55));
}
//}
if( Error < 0.0004 ) break ; /* stop learning when 'near enough' */
}
return 1 ;
}
/*******************************************************************************/
temp1 and temp2 are ints.....
As a rule of thumb, declare variables the innermost scope they are used.
This question baffles me a bit, as I cannot see what is really wrong. I have two ideas to check:
I think the line temp1 = *(Input + game + 0 + i) * *(WeightIH + i + j) ; is correct, but - just for laughs - try to add another set of parentheses:
temp1 = (*(Input + game + 0 + i)) * (*(WeightIH + i + j)) ;
it could be you get memory corruption somewhere before that line (although I don't see where). M.M is right though with all your indexing probably being wrong. Consider how a two-dimensional array is build, and which element you are trying to access - for example *(WeightIH + i + j) is not accessing 'line' i, row 'j' - you need to multiply the row index with the col length (or the other way around, however you like, just consistent): *(WeightIH + i*NumHidden + j). Consider that the row for i = 1 does not start at index 1+0, but after all NumHidden elements of j are done, so it starts at index NumHidden+0, etc. Fix this, and try if the problem goes away.