With optimization disabled, matrix multiplication is slower with AVX. Why? [duplicate]

This question already has answers here:
Demonstrator code failing to show 4 times faster SIMD speed with optimization disabled
(3 answers)
SSE intrinsics without compiler optimization
(2 answers)
Closed last month.
First I wrote this code:
void dgemm3(double* A, double* B, double* C, int n){
    register int i, j, k, n4 = n * 4;
    register double cij0, cij1, cij2, cij3;
    register double *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3;
    for (i = 0; i < n; ++i){
        for (j = 0; j < n; ++j){
            a0 = &A[i*n];
            a1 = a0 + 1;
            a2 = a1 + 1;
            a3 = a2 + 1;
            b0 = &B[j];
            b1 = b0 + n;
            b2 = b1 + n;
            b3 = b2 + n;
            cij0 = cij1 = cij2 = cij3 = 0;
            for (k = 0; k < n; k += 4, a0 += 4, a1 += 4, a2 += 4, a3 += 4,
                                b0 += n4, b1 += n4, b2 += n4, b3 += n4){
                cij0 += *a0 * *b0;
                cij1 += *a1 * *b1;
                cij2 += *a2 * *b2;
                cij3 += *a3 * *b3;
            }
            *C++ = cij0 + cij1 + cij2 + cij3;
        }
    }
}
and then I wrote this version using AVX intrinsics:
void dgemm_avx(double* A, double* B, double* C, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j += 4) {
            __m256d c0 = _mm256_setzero_pd();
            for (int k = 0; k < n; k++) {
                __m256d m1 = _mm256_broadcast_sd(A + i*n + k);
                __m256d m2 = _mm256_loadu_pd(B + k*n + j);
                __m256d m3 = _mm256_mul_pd(m1, m2);
                c0 = _mm256_add_pd(c0, m3);
            }
            _mm256_storeu_pd(C + i*n + j, c0);
        }
    }
}
I expected the second one to be faster, but it is not. However, if I compile both with the -O1 flag, the second one is faster.
My question: shouldn't the second one be faster even without optimization?

Related

Vectorising a nested loop with AVX2

I am trying to vectorise the inner loop of the following nested loop. Firstly, is this good practice, or should one avoid attempting to vectorise nested loops?
The following works; it already has some basic loop unrolling.
int sparsemv(struct mesh *A, const double * const x, double * const y) {
    const int nrow = (const int) A->local_nrow;
    int j = 0;
    double sum = 0.0;
    #pragma omp parallel for private(j, sum)
    for (int i = 0; i < nrow; i++) {
        sum = 0.0;
        const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
        const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
        const int cur_nnz = (const int) A->nnz_in_row[i];
        int unroll = (cur_nnz/4)*4;
        for (j = 0; j < unroll; j += 4) {
            sum += cur_vals[j] * x[cur_inds[j]];
            sum += cur_vals[j+1] * x[cur_inds[j+1]];
            sum += cur_vals[j+2] * x[cur_inds[j+2]];
            sum += cur_vals[j+3] * x[cur_inds[j+3]];
        }
        for (; j < cur_nnz; j++) {
            sum += cur_vals[j] * x[cur_inds[j]];
        }
        y[i] = sum;
    }
    return 0;
}
However, when I try to vectorise using 256-bit vector registers in AVX2, I get either incorrect answers or segfaults. x and y are aligned but A is not, so for the moment all loads and stores use unaligned operations, since that is the only way I avoid segfaults:
int sparsemv(struct mesh *A, const double * const x, double * const y) {
    const int nrow = (const int) A->local_nrow;
    int j = 0;
    double sum = 0.0;
    #pragma omp parallel for private(j, sum)
    for (int i = 0; i < nrow; i++) {
        sum = 0.0;
        const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
        const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
        const int cur_nnz = (const int) A->nnz_in_row[i];
        int unroll = (cur_nnz/4)*4;
        __m256d sumVec = _mm256_set1_pd(sum);
        for (j = 0; j < unroll; j += 4) {
            __m256d cur_valsVec = _mm256_loadu_pd(cur_vals + j);
            __m256d xVec = _mm256_loadu_pd(x + cur_inds[j]);
            sumVec = _mm256_add_pd(sumVec, _mm256_mul_pd(cur_valsVec, xVec));
        }
        _mm256_storeu_pd(y + i, sumVec); // Is this storing in y + i + 1, 2 and 3 as well?
        for (; j < cur_nnz; j++) {
            sum += cur_vals[j] * x[cur_inds[j]];
        }
        y[i] += sum;
    }
    return 0;
}
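For what it's worth, a minimal sketch of how that inner loop could be repaired, assuming AVX2 is available: the four x values must be gathered through cur_inds (a contiguous _mm256_loadu_pd of x is wrong unless the indices happen to be consecutive), and the vector accumulator must be reduced to a single scalar instead of being stored across y[i..i+3]. The helper name row_dot_avx2 is hypothetical:

#include <immintrin.h>

/* Hypothetical helper: dot product of one sparse row with x (AVX2 sketch). */
static double row_dot_avx2(const double *cur_vals, const int *cur_inds,
                           const double *x, int cur_nnz)
{
    __m256d acc = _mm256_setzero_pd();
    int j = 0;
    for (; j + 4 <= cur_nnz; j += 4) {
        __m256d v = _mm256_loadu_pd(cur_vals + j);
        /* gather x[cur_inds[j..j+3]] instead of loading x contiguously */
        __m128i idx = _mm_loadu_si128((const __m128i *)(cur_inds + j));
        __m256d xv = _mm256_i32gather_pd(x, idx, 8); /* scale 8 = sizeof(double) */
        acc = _mm256_add_pd(acc, _mm256_mul_pd(v, xv));
    }
    /* horizontal sum of the four lanes -> one scalar for y[i] */
    __m128d lo = _mm256_castpd256_pd128(acc);
    __m128d hi = _mm256_extractf128_pd(acc, 1);
    lo = _mm_add_pd(lo, hi);
    double sum = _mm_cvtsd_f64(_mm_add_sd(lo, _mm_unpackhi_pd(lo, lo)));
    for (; j < cur_nnz; j++)          /* scalar cleanup of the tail */
        sum += cur_vals[j] * x[cur_inds[j]];
    return sum;
}

In the loop above, y[i] = row_dot_avx2(cur_vals, cur_inds, x, cur_nnz); would then replace both the vector loop and the store that was clobbering y[i+1..i+3].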

Passing 2D arrays (matrix) to functions in Arduino for Neural Networks

I know there are other posts about this, but I feel they don't help me. I want to write functions to multiply and add matrices, and one to print them. I need this to work with neural networks.
/*
Function for matrix multiplication:
All the matrices need to be declared beforehand.
r1 = rows of the first matrix
c1 = columns of the first matrix
r2 = rows of the second matrix
c2 = columns of the second matrix
first = A matrix
second = B matrix
mult = C matrix, or A * B
*/
void dot(int r1, int c1, int r2, int c2, float first[][c1], float second[][c2], float mult[][c2])
{
    for (int i = 0; i < r1; ++i)
    {
        for (int j = 0; j < c2; ++j)
        {
            for (int k = 0; k < c1; ++k)
            {
                mult[i][j] += first[i][k] * second[k][j];
            }
        }
    }
}
It should work with declared matrices, and it works in C, but I need it in Arduino.
What can I do?
This is for sum:
/*
Function to add matrices:
All the matrices need to be declared and equal in rows and columns.
r = rows of the matrices
c = columns of the matrices
first = A matrix
second = B matrix
sum = C matrix, or A + B
*/
void sum(int r, int c, float first[][c], float second[][c], float sum[][c])
{
    for (int i = 0; i < r; ++i)
    {
        for (int j = 0; j < c; ++j)
        {
            sum[i][j] = first[i][j] + second[i][j];
        }
    }
}
This is for printing, but it isn't important:
/*
Function to print a matrix:
r = rows of the matrix
c = columns of the matrix
matrix = the matrix to print
*/
void printMatrix(int r, int c, float matrix[r][c])
{
    for (int i = 0; i < r; i++)
    {
        Serial.print("[ ");
        for (int j = 0; j < c; j++)
        {
            Serial.print(' ');
            Serial.print(matrix[i][j]);
            Serial.print(' ');
        }
        Serial.print(']');
        Serial.println("\n");
    }
}
I really need help
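A note on why this fails on Arduino, plus a sketch of a workaround: Arduino sketches are compiled as C++, and C++ has no variable-length array parameters, so a signature like float first[][c1] with a runtime c1 (valid C99) will not compile there. One common workaround is to pass flat pointers and compute the row offsets by hand. The version below is a sketch of that idea (r2 is dropped because it must equal c1, and each output element is initialised rather than assumed zero); it is not tested on hardware:

// Flat-pointer matrix multiply: valid C and C++, so it compiles under Arduino.
void dot(int r1, int c1, int c2, const float *first, const float *second, float *mult)
{
    for (int i = 0; i < r1; ++i)
    {
        for (int j = 0; j < c2; ++j)
        {
            float acc = 0.0f;
            for (int k = 0; k < c1; ++k)
                acc += first[i * c1 + k] * second[k * c2 + j];  // A[i][k] * B[k][j]
            mult[i * c2 + j] = acc;  // element (i, j) of the r1 x c2 result
        }
    }
}

For a 2x3 times 3x4 product this would be called as dot(2, 3, 4, &A[0][0], &B[0][0], &C[0][0]); the same flattening trick applies to the sum and print functions.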

How does a pointer to a 2D array work underneath the hood?

I can't understand why this piece of code, which is supposed to perform matrix multiplication, goes wrong.
Input: 2x2 matrices with elements 1,2,3,4 in both matrices
Expected output: 7 10 15 22
Output given by this code: 15 22 12 16
int a[10][10], b[10][10], c[10][10], i, j, k, r1, c1, r2, c2;
int (*pa)[10][10] = &a, (*pb)[10][10] = &b, (*pc)[10][10] = &c;
for (i = 0; i < r1; i++) {
    for (j = 0; j < c2; j++) {
        *pc[i][j] = 0;
        for (k = 0; k < c1; k++) {
            *pc[i][j] += *pa[i][k] * *pb[k][j];
        }
    }
}
I tried debugging with print statements. When given 2x2 matrices which have 1,2,3,4 as their elements, these are the errors produced:
at 00 of a is 3
at 00 of b is 1
Elements you're multiplying: 3 1
But expected output is this:
at 00 of a is 1
at 00 of b is 1
(Same seems to happen for rest of the elements)
The rest of the code, which isn't pasted here, is bug-free; I checked it thoroughly using print statements.
Refer to these answers first:
Pointer address in a C multidimensional array
Create a pointer to two-dimensional array
Together they answer your question. The core problem is operator precedence: [] binds more tightly than unary *, so *pc[i][j] parses as *(pc[i][j]) rather than (*pc)[i][j], which indexes past the arrays you meant to use. Here is a working version of your code (simple version):
#include <stdio.h>
int main(void)
{
    int a[2][2] = {{1,2},{3,4}};
    int b[2][2] = {{1,2},{3,4}};
    int c[2][2], i, j, k, r1 = 2, c1 = 2, r2 = 2, c2 = 2;
    int (*pa)[2] = a, (*pb)[2] = b, (*pc)[2] = c;
    for (i = 0; i < r1; i++)
    {
        for (j = 0; j < c2; j++)
        {
            pc[i][j] = 0;
            for (k = 0; k < c1; k++)
            {
                pc[i][j] += pa[i][k] * pb[k][j];
            }
        }
    }
    for (i = 0; i < 2; i++)
    {
        printf("\n");
        for (j = 0; j < 2; j++)
        {
            printf("%d\t", c[i][j]);
        }
    }
    return 0;
}
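If you want to keep the original int (*)[10][10] pointer type instead of decaying to int (*)[10], the fix is just to parenthesize the dereference. A minimal self-contained sketch:

#include <stdio.h>

int main(void)
{
    int a[2][2] = {{1, 2}, {3, 4}};
    int b[2][2] = {{1, 2}, {3, 4}};
    int c[2][2];
    int (*pa)[2][2] = &a, (*pb)[2][2] = &b, (*pc)[2][2] = &c;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            (*pc)[i][j] = 0;          /* dereference first, then index */
            for (int k = 0; k < 2; k++)
                (*pc)[i][j] += (*pa)[i][k] * (*pb)[k][j];
        }
    }
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
            printf("%d\t", c[i][j]);  /* prints 7 10 / 15 22 */
        printf("\n");
    }
    return 0;
}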

Speed up matrix-matrix multiplication using SSE vector instructions

I am having some trouble vectorizing some C code using SSE vector instructions. The code I have to vectorize is
#define N 1000
void matrix_mul(int mat1[N][N], int mat2[N][N], int result[N][N])
{
    int i, j, k;
    for (i = 0; i < N; ++i)
    {
        for (j = 0; j < N; ++j)
        {
            for (k = 0; k < N; ++k)
            {
                result[i][k] += mat1[i][j] * mat2[j][k];
            }
        }
    }
}
Here is what I got so far:
void matrix_mul_sse(int mat1[N][N], int mat2[N][N], int result[N][N])
{
    int i, j, k; int* l;
    __m128i v1, v2, v3;
    v3 = _mm_setzero_si128();
    for (i = 0; i < N; ++i)
    {
        for (j = 0; j < N; j += 4)
        {
            for (k = 0; k < N; k += 4)
            {
                v1 = _mm_set1_epi32(mat1[i][j]);
                v2 = _mm_loadu_si128((__m128i*)&mat2[j][k]);
                v3 = _mm_add_epi32(v3, _mm_mul_epi32(v1, v2));
                _mm_storeu_si128((__m128i*)&result[i][k], v3);
                v3 = _mm_setzero_si128();
            }
        }
    }
}
After execution I got the wrong result. I know that the reason is the load from memory into v2. I loop through mat1 in row-major order, so I need to load mat2[0][0], mat2[1][0], mat2[2][0], mat2[3][0], ..., but what is actually loaded is mat2[0][0], mat2[0][1], mat2[0][2], mat2[0][3], ..., because mat2 is stored in memory in row-major order. I tried to fix this problem but without any improvement.
Can anyone help me, please?
Below is a fixed version of your implementation:
void matrix_mul_sse(int mat1[N][N], int mat2[N][N], int result[N][N])
{
    int i, j, k;
    __m128i v1, v2, v3, v4;
    for (i = 0; i < N; ++i)
    {
        for (j = 0; j < N; ++j) // 'j' must be incremented by 1
        {
            // read mat1 here because it does not use the 'k' index
            v1 = _mm_set1_epi32(mat1[i][j]);
            for (k = 0; k < N; k += 4)
            {
                v2 = _mm_loadu_si128((const __m128i*)&mat2[j][k]);
                // read what's in the result array first, as we will need to add to it
                v3 = _mm_loadu_si128((const __m128i*)&result[i][k]);
                // use _mm_mullo_epi32 here instead of _mm_mul_epi32 and add the previous result
                v4 = _mm_add_epi32(v3, _mm_mullo_epi32(v1, v2));
                // store the result
                _mm_storeu_si128((__m128i*)&result[i][k], v4);
            }
        }
    }
}
In short, _mm_mullo_epi32 (requires SSE4.1) produces 4 x int32 results, as opposed to _mm_mul_epi32, which produces 2 x int64 results. If you cannot use SSE4.1, then have a look at the answer here for an alternative SSE2 solution.
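For reference, the usual SSE2 emulation of a lane-wise 32-bit multiply looks roughly like this (a sketch of the technique behind that linked answer; the low 32 bits of a product are the same for signed and unsigned operands, so _mm_mul_epu32 works here):

#include <emmintrin.h> /* SSE2 */

/* Sketch: SSE2 fallback for _mm_mullo_epi32. */
static inline __m128i mullo_epi32_sse2(__m128i a, __m128i b)
{
    __m128i even = _mm_mul_epu32(a, b);                  /* 64-bit products of lanes 0 and 2 */
    __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),
                                 _mm_srli_si128(b, 4));  /* 64-bit products of lanes 1 and 3 */
    /* keep the low 32 bits of each product and re-interleave them */
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                              _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
}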
Full description by Intel Intrinsic Guide:
_mm_mullo_epi32: Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store
the low 32 bits of the intermediate integers in dst.
_mm_mul_epi32: Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the
signed 64-bit results in dst.
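A tiny demo of the difference in lane behavior (assuming an SSE4.1-capable compiler and CPU):

#include <stdio.h>
#include <smmintrin.h> /* SSE4.1 */

int main(void)
{
    __m128i a = _mm_setr_epi32(1, 2, 3, 4);
    __m128i b = _mm_setr_epi32(5, 6, 7, 8);
    int lo[4];
    long long wide[2];
    _mm_storeu_si128((__m128i *)lo, _mm_mullo_epi32(a, b));
    _mm_storeu_si128((__m128i *)wide, _mm_mul_epi32(a, b));
    printf("mullo: %d %d %d %d\n", lo[0], lo[1], lo[2], lo[3]); /* 5 12 21 32 */
    printf("mul:   %lld %lld\n", wide[0], wide[1]);             /* 5 21: lanes 0 and 2 only */
    return 0;
}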
I kinda changed around your code to make the addressing explicit [ it helps in this case ].
#define N 100
This is a stub for the vector-unit multiply-and-accumulate operation; you should be able to replace NV with whatever width your vector unit has, and put the relevant opcodes in here.
#define NV 8
int Vmacc(int *A, int *B) {
    int i = 0;
    int x = 0;
    for (i = 0; i < NV; i++) {
        x += *A++ * *B++;
    }
    return x;
}
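To illustrate what could go into that stub, here is a sketch of an SSE4.1 body for Vmacc, assuming NV == 8 and unaligned inputs; the scalar remainder loop in mm2 below still handles any tail:

#include <smmintrin.h> /* SSE4.1 for _mm_mullo_epi32 */

int Vmacc(int *A, int *B) {
    /* two 4-lane multiply-accumulates covering the 8 elements */
    __m128i a0 = _mm_loadu_si128((const __m128i *)A);
    __m128i a1 = _mm_loadu_si128((const __m128i *)(A + 4));
    __m128i b0 = _mm_loadu_si128((const __m128i *)B);
    __m128i b1 = _mm_loadu_si128((const __m128i *)(B + 4));
    __m128i s = _mm_add_epi32(_mm_mullo_epi32(a0, b0),
                              _mm_mullo_epi32(a1, b1));
    /* horizontal sum of the four 32-bit lanes */
    s = _mm_add_epi32(s, _mm_srli_si128(s, 8));
    s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
    return _mm_cvtsi128_si32(s);
}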
This multiply has two notable variations from the norm:
1. It caches each column vector into a contiguous array.
2. It pushes slices of the multiply-accumulate into a vector-like function.
Even without using the vector unit, this takes half the time of the naive version just because of better cache/prefetch utilization.
void mm2(int *A, int *B, int n, int *C) {
    int c, r;
    int cache[N];
    for (c = 0; c < n; c++) {
        /* cache column c contiguously: */
        for (r = 0; r < n; r++) {
            cache[r] = B[c + r*n];
        }
        for (r = 0; r < n; r++) {
            int k = 0;
            int x = 0;
            int *Av = A + r*n;
            for (k = 0; k + NV - 1 < n; k += NV) {
                x += Vmacc(Av + k, cache + k);
            }
            while (k < n) {
                x += Av[k] * cache[k];
                k++;
            }
            C[r*n + c] = x;
        }
    }
}

parallelizing matrix multiplication through threading and SIMD

I am trying to speed up matrix multiplication on a multicore architecture. To this end, I use threads and SIMD at the same time. But my results are not good. I test speedup over sequential matrix multiplication:
void sequentialMatMul(void* params)
{
    cout << "SequentialMatMul started.";
    int i, j, k;
    for (i = 0; i < N; i++)
    {
        for (k = 0; k < N; k++)
        {
            for (j = 0; j < N; j++)
            {
                X[i][j] += A[i][k] * B[k][j];
            }
        }
    }
    cout << "\nSequentialMatMul finished.";
}
I tried to add threading and SIMD to matrix multiplication as follows:
void threadedSIMDMatMul(void* params)
{
    bounds *args = (bounds*)params;
    int lowerBound = args->lowerBound;
    int upperBound = args->upperBound;
    int idx = args->idx;
    int i, j, k;
    for (i = lowerBound; i < upperBound; i++)
    {
        for (k = 0; k < N; k++)
        {
            for (j = 0; j < N; j += 4)
            {
                mmx1 = _mm_loadu_ps(&X[i][j]);
                mmx2 = _mm_load_ps1(&A[i][k]);
                mmx3 = _mm_loadu_ps(&B[k][j]);
                mmx4 = _mm_mul_ps(mmx2, mmx3);
                mmx0 = _mm_add_ps(mmx1, mmx4);
                _mm_storeu_ps(&X[i][j], mmx0);
            }
        }
    }
    _endthread();
}
And the following section is used for calculating the lower and upper bounds of each thread:
bounds arg[CORES];
for (int part = 0; part < CORES; part++)
{
    arg[part].idx = part;
    arg[part].lowerBound = (N / CORES)*part;
    arg[part].upperBound = (N / CORES)*(part + 1);
}
And finally the threaded SIMD version is called like this:
HANDLE handle[CORES];
for (int part = 0; part < CORES; part++)
{
    handle[part] = (HANDLE)_beginthread(threadedSIMDMatMul, 0, (void*)&arg[part]);
}
for (int part = 0; part < CORES; part++)
{
    WaitForSingleObject(handle[part], INFINITE);
}
The result is as follows:
Test 1:
// arrays are defined as follows
float A[N][N];
float B[N][N];
float X[N][N];
N=2048
Core=1//just one thread
Sequential time: 11129ms
Threaded SIMD matmul time: 14650ms
Speed up=0.75x
Test 2:
// arrays are defined as follows
float **A = (float**)_aligned_malloc(N * sizeof(float), 16);
float **B = (float**)_aligned_malloc(N * sizeof(float), 16);
float **X = (float**)_aligned_malloc(N * sizeof(float), 16);
for (int k = 0; k < N; k++)
{
    A[k] = (float*)malloc(cols * sizeof(float));
    B[k] = (float*)malloc(cols * sizeof(float));
    X[k] = (float*)malloc(cols * sizeof(float));
}
N=2048
Core=1//just one thread
Sequential time: 15907ms
Threaded SIMD matmul time: 18578ms
Speed up=0.85x
Test 3:
// arrays are defined as follows
float A[N][N];
float B[N][N];
float X[N][N];
N=2048
Core=2
Sequential time: 10855ms
Threaded SIMD matmul time: 27967ms
Speed up=0.38x
Test 4:
// arrays are defined as follows
float **A = (float**)_aligned_malloc(N * sizeof(float), 16);
float **B = (float**)_aligned_malloc(N * sizeof(float), 16);
float **X = (float**)_aligned_malloc(N * sizeof(float), 16);
for (int k = 0; k < N; k++)
{
    A[k] = (float*)malloc(cols * sizeof(float));
    B[k] = (float*)malloc(cols * sizeof(float));
    X[k] = (float*)malloc(cols * sizeof(float));
}
N=2048
Core=2
Sequential time: 16579ms
Threaded SIMD matmul time: 30160ms
Speed up=0.51x
My question: why don't I get a speedup?
Here are the times I get building on your algorithm on my four core i7 IVB processor.
sequential: 3.42 s
4 threads: 0.97 s
4 threads + SSE: 0.86 s
Here are the times on a 2 core P9600 @2.53 GHz, which is similar to the OP's E2200 @2.2 GHz:
sequential: 6.52 s
2 threads: 3.66 s
2 threads + SSE: 3.75 s
I used OpenMP because it makes this easy. Each thread in OpenMP effectively runs over
lowerBound = N*part/CORES;
upperBound = N*(part + 1)/CORES;
(Note that this is slightly different from your definition. Because your definition divides by CORES first, it can give the wrong result for some values of N: with N = 2047 and CORES = 2, for example, (N/CORES)*CORES = 2046, so the last row is never processed.)
As to the SIMD version: it's not much faster, probably because it is memory-bandwidth bound, and because GCC already vectorizes the scalar loop anyway.
The optimal solution is much more complicated. You need to use loop tiling and reorder the elements within tiles to get the best performance; I don't have time to do that today.
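For a rough idea of the shape of loop tiling, here is a sketch (the tile size of 64 is an assumption to be tuned, and it does none of the in-tile reordering mentioned above):

#define TILE 64
void gemm_tiled(float * restrict a, float * restrict b, float * restrict c, int n) {
    /* iterate over TILE x TILE blocks so each block of b and c stays cache-resident */
    for (int ii = 0; ii < n; ii += TILE)
        for (int kk = 0; kk < n; kk += TILE)
            for (int jj = 0; jj < n; jj += TILE)
                for (int i = ii; i < ii + TILE && i < n; i++)
                    for (int k = kk; k < kk + TILE && k < n; k++) {
                        float aik = a[i*n+k];
                        for (int j = jj; j < jj + TILE && j < n; j++)
                            c[i*n+j] += aik * b[k*n+j];
                    }
}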
Here is the code I used:
//c99 -O3 -fopenmp -Wall foo.c
#include <stdio.h>
#include <string.h>
#include <x86intrin.h>
#include <omp.h>

void gemm(float * restrict a, float * restrict b, float * restrict c, int n) {
    for(int i=0; i<n; i++) {
        for(int k=0; k<n; k++) {
            for(int j=0; j<n; j++) {
                c[i*n+j] += a[i*n+k]*b[k*n+j];
            }
        }
    }
}

void gemm_tlp(float * restrict a, float * restrict b, float * restrict c, int n) {
    #pragma omp parallel for
    for(int i=0; i<n; i++) {
        for(int k=0; k<n; k++) {
            for(int j=0; j<n; j++) {
                c[i*n+j] += a[i*n+k]*b[k*n+j];
            }
        }
    }
}

void gemm_tlp_simd(float * restrict a, float * restrict b, float * restrict c, int n) {
    #pragma omp parallel for
    for(int i=0; i<n; i++) {
        for(int k=0; k<n; k++) {
            __m128 a4 = _mm_set1_ps(a[i*n+k]);
            for(int j=0; j<n; j+=4) {
                __m128 c4 = _mm_load_ps(&c[i*n+j]);
                __m128 b4 = _mm_load_ps(&b[k*n+j]);
                c4 = _mm_add_ps(_mm_mul_ps(a4,b4),c4);
                _mm_store_ps(&c[i*n+j], c4);
            }
        }
    }
}

int main(void) {
    int n = 2048;
    float *a = _mm_malloc(n*n * sizeof *a, 64);
    float *b = _mm_malloc(n*n * sizeof *b, 64);
    float *c1 = _mm_malloc(n*n * sizeof *c1, 64);
    float *c2 = _mm_malloc(n*n * sizeof *c2, 64);
    float *c3 = _mm_malloc(n*n * sizeof *c3, 64);
    for(int i=0; i<n*n; i++) a[i] = 1.0*i;
    for(int i=0; i<n*n; i++) b[i] = 1.0*i;
    memset(c1, 0, n*n * sizeof *c1);
    memset(c2, 0, n*n * sizeof *c2);
    memset(c3, 0, n*n * sizeof *c3);
    double dtime;

    dtime = -omp_get_wtime();
    gemm(a,b,c1,n);
    dtime += omp_get_wtime();
    printf("time %f\n", dtime);

    dtime = -omp_get_wtime();
    gemm_tlp(a,b,c2,n);
    dtime += omp_get_wtime();
    printf("time %f\n", dtime);

    dtime = -omp_get_wtime();
    gemm_tlp_simd(a,b,c3,n);
    dtime += omp_get_wtime();
    printf("time %f\n", dtime);

    printf("error %d\n", memcmp(c1,c2, n*n*sizeof *c1));
    printf("error %d\n", memcmp(c1,c3, n*n*sizeof *c1));
}
It looks to me like the threads are sharing the __m128 mmx* variables; you probably defined them global/static. You must be getting wrong results in your X array too. Define the __m128 mmx* variables inside the threadedSIMDMatMul function scope and it will run much faster.
void threadedSIMDMatMul(void* params)
{
    __m128 mmx0, mmx1, mmx2, mmx3, mmx4;
    // rest of the code here
}
