Determine tile size for matrix multiplication with AVX - c

I wrote the code below for tiled matrix multiplication with AVX.
#include <x86intrin.h>
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void fill(double *mat, int n) {
register int i;
register int j;
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
mat[i * n + j] = (double) (i * n + j);
}
}
}
void blocked_mmul_avx(double *A, double *B, double *C, int n) {
register int row, block, chunk, sub_chunk;
int block_size = 4;
for (row = 0; row < n; row++)
for (block = 0; block < n; block += block_size) {
register __m256d c0 = _mm256_setzero_pd();
for (chunk = 0; chunk < n; chunk += block_size) {
for (sub_chunk = 0; sub_chunk < block_size; sub_chunk++) {
register __m256d a = _mm256_broadcast_sd(
&A[row * n + chunk + sub_chunk]);
register __m256d b = _mm256_loadu_pd(
&B[chunk * n + sub_chunk * n + block]);
c0 += a * b;
}
}
_mm256_storeu_pd(&C[row * n + block], c0);
}
}
int main() {
int n = 4096; //n = 512, 1024, 2048, 4096
double *A;
double *B;
double *C;
A = (double*) malloc(n * n * sizeof(double));
B = (double*) malloc(n * n * sizeof(double));
C = (double*) malloc(n * n * sizeof(double));
srand(0);
fill(A, n); // it is a function to load A and B with random numbers
fill(B, n);
printf("n = %d\n\n", n);
clock_t t0 = clock();
blocked_mmul_avx(A, B, C, n);
clock_t t1 = clock();
printf("Execution Time = %0.3f s \n\n", (float) (t1 - t0) / CLOCKS_PER_SEC);
free(A);
free(B);
free(C);
return 0;
}
If I set size of the block to 4, the code works correctly. But if I increase it to 16, 1/4 of entries of output matrix is correct. I don't know why this is happening??
I checked and there are 16 YMM registers so I think block size can be increased to 16. But my experiment shows the otherwise.
Any idea why this is happening?

Related

C Programming: doing matrix multiplication of two contiguous, row-major arrays

I'm trying to write a function that does naive matrix multiplication of two contiguous, row-major arrays. But when I attempt to print each value at the end I get garbage. I'm guessing it's because I've mixed up the proper iterations and scaling needed to jump rows/columns. Does anyone have any advice?
Full code necessary is below:
#include <stdio.h>
#include <stdlib.h>
void dmatmul(double *a, double *b, double *c, int astride, int bstride, int cdim_0, int cdim_1) {
int i, j, p;
for (i = 0; i < cdim_0; i++) {
for (j = 0; j < cdim_1; j++) {
c[i * cdim_1 + j] = 0.0;
for (p = 0; p < (astride); p++) {
c[i * cdim_1 + j] += a[i * (astride) + p] * b[p * (bstride) + j];
}
}
}
}
int main(void) {
double *x, *y, *z;
int xdim_0, xdim_1, ydim_0, ydim_1, zdim_0, zdim_1, i, j;
xdim_0 = 2;
xdim_1 = 4;
ydim_0 = 4;
ydim_1 = 2;
zdim_0 = 2;
zdim_1 = 2;
x = (double *) malloc (xdim_0 * xdim_1 * sizeof(double));
y = (double *) malloc (ydim_0 * ydim_1 * sizeof(double));
z = (double *) malloc (zdim_0 * zdim_1 * sizeof(double));
for (i = 0; i < xdim_0 * xdim_1; i++) {
x[i] = i + 1;
y[i] = 2 * (i + 1);
}
dmatmul(x, y, z, xdim_1, ydim_1, zdim_0, zdim_1);
printf("\nMatrix product of X and Y dimensions: (%d, %d)\n", zdim_0, zdim_1);
printf("Matrix product of X and Y values:");
for (i = 0; i < zdim_0; i++) {
printf("\n");
for (j = 0; j < zdim_1; i++) {
printf("\t%f", z[i * zdim_1 + j]);
}
}
return 0;
}
The primary problem is a typo in the inner for loop doing the printing. You have:
for (j = 0; j < zdim_1; i++)
but you ned to increment j, not i:
for (j = 0; j < zdim_1; j++)
Here's my code, which has an independent matrix printing function appropriate for the arrays you're using:
/* SO 7516-7451 */
#include <stdio.h>
#include <stdlib.h>
static void dmatmul(double *a, double *b, double *c, int astride, int bstride, int cdim_0, int cdim_1)
{
int i, j, p;
for (i = 0; i < cdim_0; i++)
{
for (j = 0; j < cdim_1; j++)
{
c[i * cdim_1 + j] = 0.0;
for (p = 0; p < (astride); p++)
{
c[i * cdim_1 + j] += a[i * (astride) + p] * b[p * (bstride) + j];
}
}
}
}
static void mat_print(const char *tag, int rows, int cols, double *matrix)
{
printf("%s (%dx%d):\n", tag, rows, cols);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
printf("%4.0f", matrix[i * cols + j]);
putchar('\n');
}
}
int main(void)
{
int xdim_0 = 2;
int xdim_1 = 4;
int ydim_0 = 4;
int ydim_1 = 2;
int zdim_0 = 2;
int zdim_1 = 2;
double *x = (double *)malloc(xdim_0 * xdim_1 * sizeof(double));
double *y = (double *)malloc(ydim_0 * ydim_1 * sizeof(double));
double *z = (double *)malloc(zdim_0 * zdim_1 * sizeof(double));
for (int i = 0; i < xdim_0 * xdim_1; i++)
{
x[i] = i + 1;
y[i] = 2 * (i + 1);
}
mat_print("X", xdim_0, xdim_1, x);
mat_print("Y", ydim_0, ydim_1, y);
dmatmul(x, y, z, xdim_1, ydim_1, zdim_0, zdim_1);
mat_print("Z", zdim_0, zdim_1, z);
printf("\nMatrix product of X and Y dimensions: (%d, %d)\n", zdim_0, zdim_1);
printf("Matrix product of X and Y values:\n");
for (int i = 0; i < zdim_0; i++)
{
for (int j = 0; j < zdim_1; j++)
printf("\t%f", z[i * zdim_1 + j]);
printf("\n");
}
return 0;
}
I've also initialized the variables as I declared them. The code should, but does not, check that the memory was allocated.
When I ran this code without your printing, I got the correct result, so then I took a good look at that and saw the problem.
X (2x4):
1 2 3 4
5 6 7 8
Y (4x2):
2 4
6 8
10 12
14 16
Z (2x2):
100 120
228 280
Matrix product of X and Y dimensions: (2, 2)
Matrix product of X and Y values:
100.000000 120.000000
228.000000 280.000000

C calculation of the gradient of a multidimensional function. (partial derivatives)

I tried to implement an algorithm for calculating the gradient, but I can't even figure out whether it works correctly or not, because I don't know how to calculate it even with my hands.
I have attached the material on which I tried to make an implementation of the algorithm. I would be very grateful if someone could give an example of the input data and the result that should turn out, because I really want to figure it out. And also I would like to know what I have incorrectly prescribed in my program.
Thank you in advance. Because I'm really stuck on this and I can't move from a dead place.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <locale.h>
double f(double x)
{
return (x * x );
}
int main(int argc, char** argv) {
double del = 0.1;
int n;
scanf_s("%d", &n);
double* x;
x = (double*)malloc(n * sizeof(double));
double* y;
y = (double*)malloc(n * sizeof(double));
double* fx;
fx = (double*)malloc(n * sizeof(double));
double* fy;
fy = (double*)malloc(n * sizeof(double));
double* grad;
grad = (double*)malloc(n * sizeof(double));
int i = 0;
for (i = 0; i < n; i++)
{
printf("x[%d] = ", i);
scanf_s("%lf", &x[i]);
y[i] = x[i];
fx[i] = f(x[i]);
}
double a = 1 / del;
for (i = 0; i < n; i++) {
y[i] = y[i] + del;
fy[i] = f(y[i]);
grad[i] = a * (fy[i] - fx[i]);
y[i] = x[i];
}
for (i = 0; i < n; i++) {
printf("%lf ", grad[i]);
}
}
UPD:
double f(double arr[], int size)
{
int a = sizeof(arr);
for (int i = 0; i < size; i++) {
arr[i] = arr[i] * arr[i];
}
return *arr;
}
int main(int argc, char** argv) {
double del = 0.1;
int n;
scanf_s("%d", &n);
double* x;
x = (double*)malloc(n * sizeof(double));
double* y;
y = (double*)malloc(n * sizeof(double));
double* fx;
fx = (double*)malloc(n * sizeof(double));
double* fy;
fy = (double*)malloc(n * sizeof(double));
double* grad;
grad = (double*)malloc(n * sizeof(double));
int i = 0;
for (i = 0; i < n; i++)
{
printf("x[%d] = ", i);
scanf_s("%lf", &x[i]);
y[i] = x[i];
fx[i] = f(x,n);
}
double a = 1 / del;
for (i = 0; i < n; i++) {
y[i] = y[i] + del;
fy[i] = f(y,n);
grad[i] = a * (fy[i] - fx[i]);
y[i] = x[i];
}
for (i = 0; i < n; i++) {
printf("%lf ", grad[i]);
}
}
I would suggest that you do this by hand for a single function before you worry about a vector of functions.
Assume you have one independent variable x and one function y = f(x):
Pick a value for x[i]
Calculate y(x[i]) = f(x[i])
Calculate x[i+1] = x[i] + delta
Calculate y(x[i+1]) = f(x[i+1])
The forward different approximation of the derivative at x[i] is (y(x[i+1]) - y(x[i])/delta.
There are other ways to approximate the derivative, but this is a start.
Once this works for a single function it's easy to see how to extend it to a vector of functions: turn all those subscripts to vectors.
You should understand that
independent variables x are a vector
dependent functions y are a vector
derivatives of y w.r.t. x are expressed as an m x n matrix, not a
vector:
dy[i]/dx[j] for i = 1 to m, j = 1 to n
There are n independent variables x and m dependent functions y.

Matrix multiplication not working when I store the result of a sum instead of accessing that element on each sum

I am writing a blocked matrix multiplication algorithm for n x n matrices. My matrices are stored as 1D arrays. My first version of the algorithm works fine:
double * blocked_ijk_matmul(double *A, double *B, int n, int b) {
double *C = (double *) malloc(n * n * sizeof(double));
int i_block, j_block, k_block, i, j, k;
for (i_block = 0; i_block < n; i_block += b) {
for (j_block = 0; j_block < n; j_block += b) {
for (k_block = 0; k_block < n; k_block += b) {
for (i = i_block; i < fmin(i_block + b, n); ++i) {
for (j = j_block; j < fmin(j_block + b, n); ++j) {
for (k = k_block; k < fmin(k_block + b, n); ++k) {
C[(i * n) + j] += A[(i * n) + k] * B[(k * n) + j];
}
}
}
}
}
}
return C;
}
However, in this algorithm C[(i *n) * j] is computed quite a large number of times depending on the size of the matrices. If I instead try to store this sum, and then set the value of C[(i *n) * j] to the total sum value when all summations are complete, I get incorrect results:
double * blocked_ijk_matmul(double *A, double *B, int n, int b) {
double *C = (double *) malloc(n * n * sizeof(double));
int i_block, j_block, k_block, i, j, k;
for (i_block = 0; i_block < n; i_block += b) {
for (j_block = 0; j_block < n; j_block += b) {
for (k_block = 0; k_block < n; k_block += b) {
for (i = i_block; i < fmin(i_block + b, n); ++i) {
for (j = j_block; j < fmin(j_block + b, n); ++j) {
double sum = 0;
for (k = k_block; k < fmin(k_block + b, n); ++k) {
sum += A[(i * n) + k] * B[(k * n) + j];
}
C[(i * n) + j] = sum;
}
}
}
}
}
return C;
}
I cannot figure out for quite some time why this is not working. Clearly, double sum = 0; and C[(i * n) + j] = sum; need to be placed somewhere else, but I cannot figure out where.

Using Cuda for non-square matrix multiplication

I started to work with cuda the last days. Writing a program which multiplies two matrices of the size N x N was no problem. In the kernel function I used this code:
for(int i = 0; i < width; i++){
sum += a[row * width + i] * b[i * width + col];
c[row * width + col] = sum;
}
How do I have to design the kernel function to multiply a matrix of the size 1 x N with a matrix of the size N x M
I have found a solution for this problem now:
#include <stdio.h>
#include <iostream>
using namespace std;
__global__
void kernel(float *a, float *b, float *c, int N, int M) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float sum = 0;
if (tid < M) {
for (int i = 0; i < N; i++)
sum += a[i] * b[(i * M) + tid];
c[tid] = sum;
}
}
int main(void) {
float *dev_a, *dev_b, *dev_c;
int N = 16;
int M = 12;
float a[N];
float b[N][M];
float c[M];
for (int i = 0; i < N; i++) {
a[i] = 1.0;
}
for (int i = 0; i < N; i++) {
for (int e = 0; e < M; e++) {
b[i][e] = 1.0;
}
}
cudaMalloc((void**) &dev_a, sizeof(float) * N);
cudaMalloc((void**) &dev_b, sizeof(float) * N * M);
cudaMalloc((void**) &dev_c, sizeof(float) * M);
cudaMemcpy(dev_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, sizeof(float) * N * M, cudaMemcpyHostToDevice);
kernel<<<M / 256 + 1, 256>>>(dev_a, dev_b, dev_c, N, M);
cudaMemcpy(c, dev_c, sizeof(float) * M, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
for (int i = 0; i < M; i++) {
cout << c[i] << endl;
}
return 0;
}
But I have still one question. Does it make sense to split the for loop operation in the kernel in several kernels for performance reasons?

Initializing data with Openmp [shallow water algorithm]

First of all, my Englishg level sucks, so sorry if something isn't well written...
I'm learning how to parallelize C code using OpenMP, the algorith I'm trying to parallelize is the shallow water equations algorithm, and although with a simple #pragma omp parallel for in the most critical loop I've gained nearly 40% more performance I know that my implementation is very poor and I'm not milking the cores as I should. The structure of the code is simple: a 'main' that allocates memory and initializes some matrixes and arrays and calls a function called solver that does all the work, where I putted the #pragma omp parallel for.
I was thinking that I could boost the performance using a parallel section where the memory is allocated and initialized so every thread has all the data, but when i run the program I don't have any boost, and since I'm a rookie with this I don't know if my thinking was bad or the bad thing was my implementation. I'll apreciate some help or a hint that could boost the performance of the algorithm. This is my homework and I don't want someone to do it for me, just a little help that can make me go forward...
I'll paste the code for better understanding:
MAIN FUNCTION (Allocations and initializations)
int main(int argc, char **argv) {
long int i, j, m, n, M, N;
char *ptr;
long int s;
int flag, verbose;
double *Q;
double *x, *y;
double **ffx, **nFx, **ffy, **nFy;
double dx, dt, epsi, delta, dy, tend, tmp, stime;
/* Default values to use: m volumes in the x-direction and n volumes in the y-direction
M = 1000;
N = 1000;
/* create file and verbose flags */
.......
.......
/* Parse command line options */
.......
.......
epsi = 2.0;
delta = 0.5;
dx = (xend - xstart) / (double) M;
dy = (yend - ystart) / (double) N;
dt = dx / sqrt( 9.81 * 5.0);
tend = 0.1;
/* Add two ghost volumes at each side of the domain */
m = M + 2;
n = N + 2;
/* Allocate memory for the domain */
/*HERE IS WHRE I PUT THE PRAGMA FOR PARALLEL INITIALIZATION AND ALLOCATIONS*/
#pragma omp parallel
{
Q = (double *) malloc(m * n * cell_size * sizeof(double));
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* Allocate memory for fluxes */
ffx = (double **) malloc(cell_size * sizeof(double *));
ffy = (double **) malloc(cell_size * sizeof(double *));
nFx = (double **) malloc(cell_size * sizeof(double *));
nFy = (double **) malloc(cell_size * sizeof(double *));
ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
nFy[0] = (double *) malloc(cell_size * n * sizeof(double));
for (i = 0; i < cell_size; i++) {
ffx[i] = ffx[0] + i * m;
nFx[i] = nFx[0] + i * m;
ffy[i] = ffy[0] + i * n;
nFy[i] = nFy[0] + i * n;
}
for (i = 0,tmp= -dx/2 + xstart; i < m; i++, tmp += dx)
x[i] = tmp;
for (i = 0,tmp= -dy/2 + ystart; i < n; i++, tmp += dy)
y[i] = tmp;
/* Set initial Gauss hump */
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
Q(0, i, j) = 4.0;
Q(1, i, j) = 0.0;
Q(2, i, j) = 0.0;
}
}
for (i = 1; i < m-1; i++) {
for (j = 1; j < n-1; j++) {
Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2) + pow(y[j] - yend / 4.0, 2)) /
(pow(delta, 2)));
}
}
}
// Record start time
stime = gettime();
/*THIS IS THE FUNCTION WHERE THE 'WORK' IS DONE*/
solver(Q, ffx, ffy, nFx, nFy, m, n, tend, dx, dy, dt);`
}
SOLVER FUNCTION (Critical Section)
/*
This is the main solver routine.
*/
void solver(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double tend, double dx, double dy, double dt) {
double bc_mask[3] = {1.0, -1.0, -1.0};
double time;
int i, j, k, steps;
steps = ceil(tend / dt);
for (i = 0, time = 0.0; i < steps; i++, time += dt) {
/* Apply boundary condition */
#pragma omp parallel for private(j) num_threads (NTHR)
for (k = 0; k < cell_size; k++)
{
for (j = 1; j < n - 1 ; j++)
{
Q(k, 0, j) = bc_mask[k] * Q(k, 1, j);
Q(k, m-1, j) = bc_mask[k] * Q(k, m-2, j);
}
}
#pragma omp parallel for private(j) num_threads (NTHR)
for (k = 0; k < cell_size; k++)
{
for (j = 0; j < m; j++)
{
Q(k, j, 0) = bc_mask[k] * Q(k, j, 1);
Q(k, j, n-1) = bc_mask[k] * Q(k, j, n-2);
}
}
/* Update all volumes with the Lax-Friedrich's scheme */
laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);
}
}
/*
This is the Lax-Friedrich's scheme for updating volumes
*/
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double dx, double dy, double dt) {
int i, j, k;
/* Calculate and update fluxes in the x-direction */
#pragma omp parallel for private(k,j) num_threads (NTHR)
for (i = 1; i < n; i++) {
fx(Q, ffx, m, n, i);
for (k = 0; k < cell_size; k++)
for (j = 1; j < m; j++)
nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
for (k = 0; k < cell_size; k++)
for (j = 1; j < m-1; j++)
Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
}
/* Calculate and update fluxes in the y-direction */
#pragma omp parallel for private(k,j) num_threads (NTHR)
for (i = 1; i < m; i++) {
fy(Q, ffy, m, n, i);
for (k = 0; k < cell_size; k++)
for (j = 1; j < n; j++)
nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) - dy/dt * (Q(k, i, j) - Q(k, i, j -1)));
for (k = 0; k < cell_size; k++)
for (j = 1; j < n-1; j++)
Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
}
}
As I understand there is no data dependency in the loops of the solver function and it's sub-functions, and since putting a parallel region in the allocation and data initialization did nothing, I don't know how to continue.
Thanks in advance!
There are multiple problems with your code. First of all, you have data races there, since you write to shared variables, such as Q, x, and y, by all threads. Either do the allocations outside of a parallel region or perform them by a single thread only (#pragma omp master or #pragma omp single).
Then, you don't parallelize the for loops in the initialization section. In fact, all these loops are executed by all threads within whole ranges (again with data races and likely a lot of cache contention). You should add #pragma omp parallel to these loops. For nested loops, the collapse directive might be useful.
Also, be sure that there are no data races in solver() and laxf_scheme_2d() functions. Seemingly, the most time of the calculation is spend within laxf_scheme_2d(), however, this function is not at all run in parallel. Does it use OpenMP internally?
Thank you for the answers. I've seen many problems in my implementation, first of all the heaviest function where all the job is done is laxf_scheme_2d.
About the Q variable i have this #define Q(i, j, k) Q[((k) + n * ((j) + m * (i)))]
This is laxf_scheme_2d
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double dx, double dy, double dt) {
int i, j, k;
/* Calculate and update fluxes in the x-direction */
#pragma omp for
for (i = 1; i < n; i++) {
fx(Q, ffx, m, n, i);
for (j = 1; j < m; j++)
for (k = 0; k < cell_size; k++)
nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) -
dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
for (j = 1; j < m-1; j++)
for (k = 0; k < cell_size; k++)
Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
}
/* Calculate and update fluxes in the y-direction */
#pragma omp for
for (i = 1; i < m; i++) {
fy(Q, ffy, m, n, i);
for (j = 1; j < n; j++)
for (k = 0; k < cell_size; k++)
nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) -
dy/dt * (Q(k, i, j) - Q(k, i, j -1)));
for (j = 1; j < n-1; j++)
for (k = 0; k < cell_size; k++)
Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
}
}
Functions fx and fy are very simple and with no data dependencies. I can't put de #pragma omp parallel for above the first for loop because there are data races but for now I can't see how to change this code to overcome them.
long int i, j, m, n, M, N;
char *ptr;
long int s;
int flag, verbose;
double *Q;
double *x, *y;
double **ffx, **nFx, **ffy, **nFy;
double dx, dt, epsi, delta, dy, tend, tmp, stime;
M = 1000;
N = 1000;
/* Add two ghost volumes at each side of the domain */
m = M + 2;
n = N + 2;
/* Allocate memory for the domain */
Q = (double *) malloc(m * n * cell_size * sizeof(double));
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* Allocate memory for fluxes */
ffx = (double **) malloc(cell_size * sizeof(double *));
ffy = (double **) malloc(cell_size * sizeof(double *));
nFx = (double **) malloc(cell_size * sizeof(double *));
nFy = (double **) malloc(cell_size * sizeof(double *));
ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
nFy[0] = (double *) malloc(cell_size * n * sizeof(double));

Resources