How can I optimize my AVX implementation of dot product? (C)

I've tried to implement the dot product of these two arrays using AVX, following https://stackoverflow.com/a/10459028. But my code is very slow.
A and xb are arrays of doubles, n is an even number. Can you help me?
const int mask = 0x31;
int sum = 0;   // note: accumulating double products into an int truncates them
for (int i = 0; i < n; i++)
{
    int ind = i;
    if (i + 8 > n) // padding
    {
        sum += A[ind] * xb[i].x;
        i++;
        ind = n * j + i;   // j comes from surrounding code not shown here
        sum += A[ind] * xb[i].x;
        continue;
    }
    __declspec(align(32)) double ar[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x };
    __m256d x = _mm256_loadu_pd(&A[ind]);
    __m256d y = _mm256_load_pd(ar);
    i += 4; ind = n * j + i;
    __declspec(align(32)) double arr[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x };
    __m256d z = _mm256_loadu_pd(&A[ind]);
    __m256d w = _mm256_load_pd(arr);
    __m256d xy = _mm256_mul_pd(x, y);
    __m256d zw = _mm256_mul_pd(z, w);
    __m256d temp = _mm256_hadd_pd(xy, zw);
    __m128d hi128 = _mm256_extractf128_pd(temp, 1);
    __m128d low128 = _mm256_extractf128_pd(temp, 0);
    //__m128d dotproduct = _mm_add_pd((__m128d)temp, hi128);
    __m128d dotproduct = _mm_add_pd(low128, hi128);
    sum += dotproduct.m128d_f64[0] + dotproduct.m128d_f64[1];
    i += 3;
}

There are two big inefficiencies in your loop that are immediately apparent:
(1) these two chunks of scalar code:
__declspec(align(32)) double ar[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x };
...
__m256d y = _mm256_load_pd(ar);
and
__declspec(align(32)) double arr[4] = { xb[i].x, xb[i + 1].x, xb[i + 2].x, xb[i + 3].x };
...
__m256d w = _mm256_load_pd(arr);
should be implemented using SIMD loads and shuffles (or at the very least use _mm256_set_pd and give the compiler a chance to do a half-reasonable job of generating code for a gathered load).
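For illustration, a minimal sketch of the _mm256_set_pd route (the element type of xb is not shown in the question, so struct vec2 below is a hypothetical stand-in):

#include <immintrin.h>

struct vec2 { double x, y; };  /* hypothetical layout of xb's element type */

/* Gather four consecutive .x fields into one vector instead of staging
   them through an aligned scalar array. Note that _mm256_set_pd takes
   its arguments from the highest lane down to the lowest. */
static inline __m256d load4_xb(const struct vec2 *xb, int i)
{
    return _mm256_set_pd(xb[i + 3].x, xb[i + 2].x, xb[i + 1].x, xb[i].x);
}

If the elements really are {x, y} pairs, this is an every-other-double gather, which could also be done with two loads and a shuffle; either way it removes the store/reload through the temporary array.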
(2) the horizontal summation at the end of the loop:
for (int i = 0; i < n; i++)
{
    ...
    __m256d xy = _mm256_mul_pd(x, y);
    __m256d zw = _mm256_mul_pd(z, w);
    __m256d temp = _mm256_hadd_pd(xy, zw);
    __m128d hi128 = _mm256_extractf128_pd(temp, 1);
    __m128d low128 = _mm256_extractf128_pd(temp, 0);
    //__m128d dotproduct = _mm_add_pd((__m128d)temp, hi128);
    __m128d dotproduct = _mm_add_pd(low128, hi128);
    sum += dotproduct.m128d_f64[0] + dotproduct.m128d_f64[1];
    i += 3;
}
should be moved out of the loop:
__m256d xy = _mm256_setzero_pd();
__m256d zw = _mm256_setzero_pd();
...
for (int i = 0; i < n; i++)
{
    ...
    xy = _mm256_add_pd(xy, _mm256_mul_pd(x, y));
    zw = _mm256_add_pd(zw, _mm256_mul_pd(z, w));
    i += 3;
}
__m256d temp = _mm256_hadd_pd(xy, zw);
__m128d hi128 = _mm256_extractf128_pd(temp, 1);
__m128d low128 = _mm256_extractf128_pd(temp, 0);
//__m128d dotproduct = _mm_add_pd((__m128d)temp, hi128);
__m128d dotproduct = _mm_add_pd(low128, hi128);
sum += dotproduct.m128d_f64[0] + dotproduct.m128d_f64[1];
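Putting both fixes together, here is a minimal sketch of the whole loop (assuming the hypothetical struct vec2 layout from above, n a multiple of 8 for brevity, A indexed directly by i rather than by the question's n * j + i row offset, and a double accumulator, since the question's int sum truncates every partial sum):

#include <immintrin.h>

struct vec2 { double x, y; };  /* hypothetical element type of xb, as above */

double dot(const double *A, const struct vec2 *xb, int n)
{
    __m256d acc0 = _mm256_setzero_pd();  /* two accumulators hide add latency */
    __m256d acc1 = _mm256_setzero_pd();
    for (int i = 0; i < n; i += 8) {
        __m256d x = _mm256_loadu_pd(&A[i]);
        __m256d y = _mm256_set_pd(xb[i + 3].x, xb[i + 2].x, xb[i + 1].x, xb[i].x);
        __m256d z = _mm256_loadu_pd(&A[i + 4]);
        __m256d w = _mm256_set_pd(xb[i + 7].x, xb[i + 6].x, xb[i + 5].x, xb[i + 4].x);
        acc0 = _mm256_add_pd(acc0, _mm256_mul_pd(x, y));  /* or _mm256_fmadd_pd with FMA */
        acc1 = _mm256_add_pd(acc1, _mm256_mul_pd(z, w));
    }
    /* horizontal sum, done exactly once, after the loop */
    __m256d t  = _mm256_add_pd(acc0, acc1);
    __m128d lo = _mm256_castpd256_pd128(t);
    __m128d hi = _mm256_extractf128_pd(t, 1);
    __m128d s  = _mm_add_pd(lo, hi);
    s = _mm_add_sd(s, _mm_unpackhi_pd(s, s));
    return _mm_cvtsd_f64(s);
}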

Related

Vectorising a nested loop with AVX2

I am trying to vectorise the inner loop of the following nested loop. Firstly, is this good practice, or should one avoid attempting to vectorise nested loops?
The following works; it already has some basic loop unrolling.
int sparsemv(struct mesh *A, const double * const x, double * const y) {
    const int nrow = (const int) A->local_nrow;
    int j = 0;
    double sum = 0.0;
    #pragma omp parallel for private(j, sum)
    for (int i = 0; i < nrow; i++) {
        sum = 0.0;
        const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
        const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
        const int cur_nnz = (const int) A->nnz_in_row[i];
        int unroll = (cur_nnz/4)*4;
        for (j = 0; j < unroll; j += 4) {
            sum += cur_vals[j] * x[cur_inds[j]];
            sum += cur_vals[j+1] * x[cur_inds[j+1]];
            sum += cur_vals[j+2] * x[cur_inds[j+2]];
            sum += cur_vals[j+3] * x[cur_inds[j+3]];
        }
        for (; j < cur_nnz; j++) {
            sum += cur_vals[j] * x[cur_inds[j]];
        }
        y[i] = sum;
    }
    return 0;
}
However, when I try to vectorise using 256-bit vector registers in AVX2, I get either incorrect answers or seg faults. x and y are aligned but A is not; for the moment, all loading and storing is done using unaligned operations, since that is the only way I don't get seg faults:
int sparsemv(struct mesh *A, const double * const x, double * const y) {
    const int nrow = (const int) A->local_nrow;
    int j = 0;
    double sum = 0.0;
    #pragma omp parallel for private(j, sum)
    for (int i = 0; i < nrow; i++) {
        sum = 0.0;
        const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
        const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
        const int cur_nnz = (const int) A->nnz_in_row[i];
        int unroll = (cur_nnz/4)*4;
        __m256d sumVec = _mm256_set1_pd(sum);
        for (j = 0; j < unroll; j += 4) {
            __m256d cur_valsVec = _mm256_loadu_pd(cur_vals + j);
            __m256d xVec = _mm256_loadu_pd(x + cur_inds[j]);
            sumVec = _mm256_add_pd(sumVec, _mm256_mul_pd(cur_valsVec, xVec));
        }
        _mm256_storeu_pd(y + i, sumVec); // Is this storing in y + i + 1, 2 and 3 as well?
        for (; j < cur_nnz; j++) {
            sum += cur_vals[j] * x[cur_inds[j]];
        }
        y[i] += sum;
    }
    return 0;
}
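Two things stand out in the AVX2 version, independent of alignment. First, _mm256_loadu_pd(x + cur_inds[j]) loads four contiguous doubles starting at x[cur_inds[j]]; it does not gather x[cur_inds[j]] through x[cur_inds[j+3]], which is what the scalar code computes (and an index near the end of x then reads out of bounds, which would explain the seg faults). Second, _mm256_storeu_pd(y + i, sumVec) does write y[i] through y[i+3], clobbering neighbouring rows' results (and racing with other threads), and sumVec is never reduced to a scalar. A sketch of the row-loop body with both points addressed, using _mm256_set_pd for the gather (on AVX2, _mm256_i32gather_pd is an alternative):

__m256d sumVec = _mm256_setzero_pd();
for (j = 0; j < unroll; j += 4) {
    __m256d cur_valsVec = _mm256_loadu_pd(cur_vals + j);
    /* gather the four indexed x values explicitly */
    __m256d xVec = _mm256_set_pd(x[cur_inds[j+3]], x[cur_inds[j+2]],
                                 x[cur_inds[j+1]], x[cur_inds[j]]);
    sumVec = _mm256_add_pd(sumVec, _mm256_mul_pd(cur_valsVec, xVec));
}
/* horizontally reduce sumVec into the scalar sum */
__m128d lo = _mm256_castpd256_pd128(sumVec);
__m128d hi = _mm256_extractf128_pd(sumVec, 1);
__m128d s  = _mm_add_pd(lo, hi);
sum = _mm_cvtsd_f64(_mm_add_sd(s, _mm_unpackhi_pd(s, s)));
for (; j < cur_nnz; j++) {
    sum += cur_vals[j] * x[cur_inds[j]];
}
y[i] = sum;  /* a single scalar store per row */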

Pseudoinverse code results in C inaccurate compared to MATLAB results

I am trying to figure out why my pseudoinverse C code results differ from MATLAB results.
This is the code for pseudo-inverse: http://www.mymathlib.com/c_source/matrices/linearsystems/singular_value_decomposition.c
#include <string.h>              // required for memcpy()
#include <float.h>               // required for DBL_EPSILON
#include <math.h>                // required for fabs(), sqrt()
#define MAX_ITERATION_COUNT 30   // Maximum number of iterations

// Internally Defined Routines
static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
    int ncols, double* U, double* V, double* diagonal, double* superdiagonal );
static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
    double* U, double* V, double* diagonal, double* superdiagonal );
static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
    double* singular_value, double* U, double* V);
////////////////////////////////////////////////////////////////////////////////
// int Singular_Value_Decomposition(double* A, int nrows, int ncols, //
// double* U, double* singular_values, double* V, double* dummy_array) //
// //
// Description: //
// This routine decomposes an m x n matrix A, with m >= n, into a product //
// of the three matrices U, D, and V', i.e. A = UDV', where U is an m x n //
// matrix whose columns are orthogonal, D is a n x n diagonal matrix, and //
// V is an n x n orthogonal matrix. V' denotes the transpose of V. If //
// m < n, then the procedure may be used for the matrix A'. The singular //
// values of A are the diagonal elements of the diagonal matrix D and //
// correspond to the positive square roots of the eigenvalues of the //
// matrix A'A. //
//
int Singular_Value_Decomposition(double* A, int nrows, int ncols, double* U,
                                 double* singular_values, double* V, double* dummy_array)
{
    Householders_Reduction_to_Bidiagonal_Form( A, nrows, ncols, U, V,
                                               singular_values, dummy_array);
    if (Givens_Reduction_to_Diagonal_Form( nrows, ncols, U, V,
                                           singular_values, dummy_array ) < 0) return -1;
    Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, U, V);
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
// static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,//
// int ncols, double* U, double* V, double* diagonal, double* superdiagonal )//
// //
// Description: //
// This routine decomposes an m x n matrix A, with m >= n, into a product //
// of the three matrices U, B, and V', i.e. A = UBV', where U is an m x n //
// matrix whose columns are orthogonal, B is a n x n bidiagonal matrix, //
// and V is an n x n orthogonal matrix. V' denotes the transpose of V. //
// If m < n, then the procedure may be used for the matrix A'. //
// //
////////////////////////////////////////////////////////////////////////////////
// //
static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
    int ncols, double* U, double* V, double* diagonal, double* superdiagonal )
{
    int i, j, k, ip1;
    double s, s2, si, scale;
    double dum;
    double *pu, *pui, *pv, *pvi;
    double half_norm_squared;

    // Copy A to U
    memcpy(U, A, sizeof(double) * nrows * ncols);

    diagonal[0] = 0.0;
    s = 0.0;
    scale = 0.0;
    for ( i = 0, pui = U, ip1 = 1; i < ncols; pui += ncols, i++, ip1++ ) {
        superdiagonal[i] = scale * s;
        //
        // Perform Householder transform on columns.
        //
        // Calculate the normed squared of the i-th column vector starting at
        // row i.
        //
        for (j = i, pu = pui, scale = 0.0; j < nrows; j++, pu += ncols)
            scale += fabs( *(pu + i) );
        if (scale > 0.0) {
            for (j = i, pu = pui, s2 = 0.0; j < nrows; j++, pu += ncols) {
                *(pu + i) /= scale;
                s2 += *(pu + i) * *(pu + i);
            }
            //
            // Chose sign of s which maximizes the norm
            //
            s = ( *(pui + i) < 0.0 ) ? sqrt(s2) : -sqrt(s2);
            //
            // Calculate -2/u'u
            //
            half_norm_squared = *(pui + i) * s - s2;
            //
            // Transform remaining columns by the Householder transform.
            //
            *(pui + i) -= s;
            for (j = ip1; j < ncols; j++) {
                for (k = i, si = 0.0, pu = pui; k < nrows; k++, pu += ncols)
                    si += *(pu + i) * *(pu + j);
                si /= half_norm_squared;
                for (k = i, pu = pui; k < nrows; k++, pu += ncols) {
                    *(pu + j) += si * *(pu + i);
                }
            }
        }
        for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) *= scale;
        diagonal[i] = s * scale;
        //
        // Perform Householder transform on rows.
        //
        // Calculate the normed squared of the i-th row vector starting at
        // column i.
        //
        s = 0.0;
        scale = 0.0;
        if (i >= nrows || i == (ncols - 1) ) continue;
        for (j = ip1; j < ncols; j++) scale += fabs ( *(pui + j) );
        if ( scale > 0.0 ) {
            for (j = ip1, s2 = 0.0; j < ncols; j++) {
                *(pui + j) /= scale;
                s2 += *(pui + j) * *(pui + j);
            }
            s = ( *(pui + ip1) < 0.0 ) ? sqrt(s2) : -sqrt(s2);
            //
            // Calculate -2/u'u
            //
            half_norm_squared = *(pui + ip1) * s - s2;
            //
            // Transform the rows by the Householder transform.
            //
            *(pui + ip1) -= s;
            for (k = ip1; k < ncols; k++)
                superdiagonal[k] = *(pui + k) / half_norm_squared;
            if ( i < (nrows - 1) ) {
                for (j = ip1, pu = pui + ncols; j < nrows; j++, pu += ncols) {
                    for (k = ip1, si = 0.0; k < ncols; k++)
                        si += *(pui + k) * *(pu + k);
                    for (k = ip1; k < ncols; k++) {
                        *(pu + k) += si * superdiagonal[k];
                    }
                }
            }
            for (k = ip1; k < ncols; k++) *(pui + k) *= scale;
        }
    }
    // Update V
    pui = U + ncols * (ncols - 2);
    pvi = V + ncols * (ncols - 1);
    *(pvi + ncols - 1) = 1.0;
    s = superdiagonal[ncols - 1];
    pvi -= ncols;
    for (i = ncols - 2, ip1 = ncols - 1; i >= 0; i--, pui -= ncols,
                                                      pvi -= ncols, ip1-- ) {
        if ( s != 0.0 ) {
            pv = pvi + ncols;
            for (j = ip1; j < ncols; j++, pv += ncols)
                *(pv + i) = ( *(pui + j) / *(pui + ip1) ) / s;
            for (j = ip1; j < ncols; j++) {
                si = 0.0;
                for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols)
                    si += *(pui + k) * *(pv + j);
                for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols)
                    *(pv + j) += si * *(pv + i);
            }
        }
        pv = pvi + ncols;
        for ( j = ip1; j < ncols; j++, pv += ncols ) {
            *(pvi + j) = 0.0;
            *(pv + i) = 0.0;
        }
        *(pvi + i) = 1.0;
        s = superdiagonal[i];
    }
    // Update U
    pui = U + ncols * (ncols - 1);
    for (i = ncols - 1, ip1 = ncols; i >= 0; ip1 = i, i--, pui -= ncols ) {
        s = diagonal[i];
        for ( j = ip1; j < ncols; j++) *(pui + j) = 0.0;
        if ( s != 0.0 ) {
            for (j = ip1; j < ncols; j++) {
                si = 0.0;
                pu = pui + ncols;
                for (k = ip1; k < nrows; k++, pu += ncols)
                    si += *(pu + i) * *(pu + j);
                si = (si / *(pui + i) ) / s;
                for (k = i, pu = pui; k < nrows; k++, pu += ncols)
                    *(pu + j) += si * *(pu + i);
            }
            for (j = i, pu = pui; j < nrows; j++, pu += ncols) {
                *(pu + i) /= s;
            }
        }
        else
            for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) = 0.0;
        *(pui + i) += 1.0;
    }
}
////////////////////////////////////////////////////////////////////////////////
// static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols, //
// double* U, double* V, double* diagonal, double* superdiagonal ) //
// //
// Description: //
// This routine decomposes a bidiagonal matrix given by the arrays //
// diagonal and superdiagonal into a product of three matrices U1, D and //
// V1', the matrix U1 premultiplies U and is returned in U, the matrix //
// V1 premultiplies V and is returned in V. The matrix D is a diagonal //
// matrix and replaces the array diagonal. //
// //
// The method used to annihilate the offdiagonal elements is a variant //
// of the QR transformation. The method consists of applying Givens //
// rotations to the right and the left of the current matrix until //
// the new off-diagonal elements are chased out of the matrix. //
// //
// The process is an iterative process which due to roundoff errors may //
// not converge within a predefined number of iterations. (This should //
// be unusual.) //
// //
// Arguments: //
// int nrows //
// The number of rows of the matrix U. //
// int ncols //
// The number of columns of the matrix U. //
// double* U //
// On input, a pointer to a matrix already initialized to a matrix //
// with mutually orthogonal columns. On output, the matrix with //
// mutually orthogonal columns. //
// double* V //
// On input, a pointer to a square matrix with the same number of rows //
// and columns as the columns of the matrix U, i.e. V[ncols][ncols]. //
// The matrix V is assumed to be initialized to an orthogonal matrix. //
// On output, V is an orthogonal matrix. //
// double* diagonal //
// On input, a pointer to an array of dimension ncols which initially //
// contains the diagonal of the bidiagonal matrix. On output, the //
// it contains the diagonal of the diagonal matrix. //
// double* superdiagonal //
// On input, a pointer to an array of dimension ncols which initially //
// the first component is zero and the successive components form the //
// superdiagonal of the bidiagonal matrix. //
// //
// Return Values: //
// 0 Success //
// -1 Failure - The procedure failed to terminate within //
// MAX_ITERATION_COUNT iterations. //
// //
// Example: //
// #define M //
// #define N //
// double U[M][N]; //
// double V[N][N]; //
// double diagonal[N]; //
// double superdiagonal[N]; //
// int err; //
// //
// (your code to initialize the matrices U, V, diagonal, and ) //
// ( superdiagonal. - Note this routine is not accessible from outside) //
// ( i.e. it is declared static.) //
// //
// err = Givens_Reduction_to_Diagonal_Form( M,N,(double*)U,(double*)V, //
// diagonal, superdiagonal ); //
// if ( err < 0 ) printf("Failed to converge\n"); //
// else { ... } //
// ... //
////////////////////////////////////////////////////////////////////////////////
// //
static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
    double* U, double* V, double* diagonal, double* superdiagonal )
{
    double epsilon;
    double c, s;
    double f, g, h;
    double x, y, z;
    double *pu, *pv;
    int i, j, k, m;
    int rotation_test;
    int iteration_count;

    for (i = 0, x = 0.0; i < ncols; i++) {
        y = fabs(diagonal[i]) + fabs(superdiagonal[i]);
        if ( x < y ) x = y;
    }
    epsilon = x * DBL_EPSILON;
    for (k = ncols - 1; k >= 0; k--) {
        iteration_count = 0;
        while (1) {
            rotation_test = 1;
            for (m = k; m >= 0; m--) {
                if (fabs(superdiagonal[m]) <= epsilon) { rotation_test = 0; break; }
                if (fabs(diagonal[m-1]) <= epsilon) break;
            }
            if (rotation_test) {
                c = 0.0;
                s = 1.0;
                for (i = m; i <= k; i++) {
                    f = s * superdiagonal[i];
                    superdiagonal[i] *= c;
                    if (fabs(f) <= epsilon) break;
                    g = diagonal[i];
                    h = sqrt(f*f + g*g);
                    diagonal[i] = h;
                    c = g / h;
                    s = -f / h;
                    for (j = 0, pu = U; j < nrows; j++, pu += ncols) {
                        y = *(pu + m - 1);
                        z = *(pu + i);
                        *(pu + m - 1) = y * c + z * s;
                        *(pu + i) = -y * s + z * c;
                    }
                }
            }
            z = diagonal[k];
            if (m == k) {
                if ( z < 0.0 ) {
                    diagonal[k] = -z;
                    for ( j = 0, pv = V; j < ncols; j++, pv += ncols)
                        *(pv + k) = - *(pv + k);
                }
                break;
            }
            else {
                if ( iteration_count >= MAX_ITERATION_COUNT ) return -1;
                iteration_count++;
                x = diagonal[m];
                y = diagonal[k-1];
                g = superdiagonal[k-1];
                h = superdiagonal[k];
                f = ( (y - z) * ( y + z ) + (g - h) * (g + h) )/(2.0 * h * y);
                g = sqrt( f * f + 1.0 );
                if ( f < 0.0 ) g = -g;
                f = ( (x - z) * (x + z) + h * (y / (f + g) - h) ) / x;
                // Next QR Transformation
                c = 1.0;
                s = 1.0;
                for (i = m + 1; i <= k; i++) {
                    g = superdiagonal[i];
                    y = diagonal[i];
                    h = s * g;
                    g *= c;
                    z = sqrt( f * f + h * h );
                    superdiagonal[i-1] = z;
                    c = f / z;
                    s = h / z;
                    f = x * c + g * s;
                    g = -x * s + g * c;
                    h = y * s;
                    y *= c;
                    for (j = 0, pv = V; j < ncols; j++, pv += ncols) {
                        x = *(pv + i - 1);
                        z = *(pv + i);
                        *(pv + i - 1) = x * c + z * s;
                        *(pv + i) = -x * s + z * c;
                    }
                    z = sqrt( f * f + h * h );
                    diagonal[i - 1] = z;
                    if (z != 0.0) {
                        c = f / z;
                        s = h / z;
                    }
                    f = c * g + s * y;
                    x = -s * g + c * y;
                    for (j = 0, pu = U; j < nrows; j++, pu += ncols) {
                        y = *(pu + i - 1);
                        z = *(pu + i);
                        *(pu + i - 1) = c * y + s * z;
                        *(pu + i) = -s * y + c * z;
                    }
                }
                superdiagonal[m] = 0.0;
                superdiagonal[k] = f;
                diagonal[k] = x;
            }
        }
    }
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
// static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols, //
// double* singular_values, double* U, double* V) //
// //
// Description: //
// This routine sorts the singular values from largest to smallest //
// singular value and interchanges the columns of U and the columns of V //
// whenever a swap is made. I.e. if the i-th singular value is swapped //
// with the j-th singular value, then the i-th and j-th columns of U are //
// interchanged and the i-th and j-th columns of V are interchanged. //
// //
// Arguments: //
// int nrows //
// The number of rows of the matrix U. //
// int ncols //
// The number of columns of the matrix U. //
// double* singular_values //
// On input, a pointer to the array of singular values. On output, the//
// sorted array of singular values. //
// double* U //
// On input, a pointer to a matrix already initialized to a matrix //
// with mutually orthogonal columns. On output, the matrix with //
// mutually orthogonal possibly permuted columns. //
// double* V //
// On input, a pointer to a square matrix with the same number of rows //
// and columns as the columns of the matrix U, i.e. V[ncols][ncols]. //
// The matrix V is assumed to be initialized to an orthogonal matrix. //
// On output, V is an orthogonal matrix with possibly permuted columns.//
// //
// Return Values: //
// The function is of type void. //
// //
// Example: //
// #define M //
// #define N //
// double U[M][N]; //
// double V[N][N]; //
// double diagonal[N]; //
// //
// (your code to initialize the matrices U, V, and diagonal. ) //
// ( - Note this routine is not accessible from outside) //
// ( i.e. it is declared static.) //
// //
// Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, //
// (double*) U, (double*) V); //
// ... //
////////////////////////////////////////////////////////////////////////////////
// //
static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
    double* singular_values, double* U, double* V)
{
    int i, j, max_index;
    double temp;
    double *p1, *p2;

    for (i = 0; i < ncols - 1; i++) {
        max_index = i;
        for (j = i + 1; j < ncols; j++)
            if (singular_values[j] > singular_values[max_index] )
                max_index = j;
        if (max_index == i) continue;
        temp = singular_values[i];
        singular_values[i] = singular_values[max_index];
        singular_values[max_index] = temp;
        p1 = U + max_index;
        p2 = U + i;
        for (j = 0; j < nrows; j++, p1 += ncols, p2 += ncols) {
            temp = *p1;
            *p1 = *p2;
            *p2 = temp;
        }
        p1 = V + max_index;
        p2 = V + i;
        for (j = 0; j < ncols; j++, p1 += ncols, p2 += ncols) {
            temp = *p1;
            *p1 = *p2;
            *p2 = temp;
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
// void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,//
// double tolerance, int nrows, int ncols, double *Astar) //
// //
// Description: //
// This routine calculates the pseudo-inverse of the matrix A = UDV'. //
// where U, D, V constitute the singular value decomposition of A. //
// Let Astar be the pseudo-inverse then Astar = V(1/D)U', where 1/D is //
// the pseudo-inverse of D, i.e. if D[i] > 0 then (1/D)[i] = 1/D[i] and //
// if D[i] = 0, then (1/D)[i] = 0. Because the singular values are //
// subject to round-off error. A tolerance is given so that if //
// D[i] < tolerance, D[i] is treated as if it were 0. //
// The default tolerance is D[0] * DBL_EPSILON * ncols, assuming that the //
// diagonal matrix of singular values is sorted from largest to smallest, //
// if the user specified tolerance is less than the default tolerance, //
// then the default tolerance is used. //
// //
// Arguments: //
// double* U //
// A matrix with mutually orthonormal columns. //
// double* D //
// A diagonal matrix with decreasing non-negative diagonal elements. //
// i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i. //
// double* V //
// An orthogonal matrix. //
// double tolerance //
// An lower bound for non-zero singular values (provided tolerance > //
// ncols * DBL_EPSILON * D[0]). //
// int nrows //
// The number of rows of the matrix U and B. //
// int ncols //
// The number of columns of the matrix U. Also the number of rows and //
// columns of the matrices D and V. //
// double* Astar //
// On input, a pointer to the first element of an ncols x nrows matrix.//
// On output, the pseudo-inverse of UDV'. //
// //
// Return Values: //
// The function is of type void. //
// //
// Example: //
// #define M //
// #define N //
// double U[M][N]; //
// double V[N][N]; //
// double D[N]; //
// double Astar[N][M]; //
// double tolerance; //
// //
// (your code to initialize the matrices U,D,V) //
// //
// Singular_Value_Decomposition_Inverse((double*) U, D, (double*) V, //
// tolerance, M, N, (double*) Astar); //
// //
// printf(" The pseudo-inverse of A = UDV' is \n"); //
// ... //
////////////////////////////////////////////////////////////////////////////////
// //
void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,
    double tolerance, int nrows, int ncols, double *Astar)
{
    int i, j, k;
    double *pu, *pv, *pa;
    double dum;

    dum = DBL_EPSILON * D[0] * (double) ncols;
    if (tolerance < dum) tolerance = dum;
    for ( i = 0, pv = V, pa = Astar; i < ncols; i++, pv += ncols)
        for ( j = 0, pu = U; j < nrows; j++, pa++)
            for (k = 0, *pa = 0.0; k < ncols; k++, pu++)
                if (D[k] > tolerance) *pa += *(pv + k) * *pu / D[k];
}
I have set my tolerance to 1e-16.
Input Matrix:
MatA[4][4] = {
    { 1e-15,  2e-15,  3e-15,  4e-15},
    { 5e-15, 10e-15,  7e-15,  8e-15},
    { 9e-15, 18e-15, 11e-15, 12e-15},
    {13e-15, 26e-15, 15e-15, 16e-15}
};
C code results:
-7.3177e+13 -3.6957e+13 -7.3773e+11 3.5482e+13
-1.4635e+14 -7.3915e+13 -1.4755e+12 7.0964e+13
1.0264e+14 5.7015e+13 1.1387e+13 -3.4240e+13
1.9055e+14 1.0400e+14 1.7450e+13 -6.9101e+13
Matlab results:
1.0e+14 *
-0.7348 -0.3712 -0.0076 0.3561
-1.4697 -0.7424 -0.0152 0.7121
1.0227 0.5682 0.1136 -0.3409
1.9015 1.0379 0.1742 -0.6894
I am not sure where I am losing accuracy. The only places I can see accuracy coming into play are DBL_EPSILON and the tolerance. I have also put the value for DBL_EPSILON as 4.94065645841247E-32. Not sure how I would get closer values to the MATLAB output.
It is likely your choice of tolerance. MATLAB sets the tolerance by default to max(size(A)) * eps(norm(A)) (according to the docs). For your matrix, this is 2.5244e-29.
If I compute A*pinv(A)*A - A I see
1.0e-28 *
-0.0039 -0.0079 0 0
-0.0237 -0.0473 -0.0158 0
-0.0473 -0.0947 -0.0316 0
-0.0789 -0.1578 -0.0316 -0.0316
If I compute the same but using your result instead of pinv(A), I see
1.0e-16 *
-0.0430 -0.0860 0.0582 0.1088
-0.1356 -0.2712 0.1862 0.3472
-0.2282 -0.4565 0.3143 0.5855
-0.3209 -0.6417 0.4423 0.8239
Looking at these magnitudes, this really points to the two choices for tolerance.
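If you want the C code to reproduce MATLAB's default cutoff rather than the hard-coded 1e-16, a minimal sketch (assuming the singular values in D are sorted so that D[0], the largest, stands in for norm(A), and using the approximation eps(x) ≈ DBL_EPSILON * x) would be:

#include <float.h>

/* MATLAB's default tolerance is max(size(A)) * eps(norm(A)); the 2-norm
   of A is its largest singular value, i.e. D[0] after sorting. */
int maxdim = (nrows > ncols) ? nrows : ncols;
double tolerance = (double) maxdim * DBL_EPSILON * D[0];

For the matrix above this lands near the 2.5244e-29 quoted earlier, rather than 1e-16, so the smallest singular values get zeroed out the same way MATLAB zeroes them.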

Implementation of OpenMP

void collision_f()
{
    int x;
    long double feq[Q], feq_R[Q], feq_B[Q], feq_force[Q];
    long double mR[Q], mB[Q], meq_R[Q], meq_B[Q];
    long double col_R[Q], col_B[Q];
    forces_f();
    #pragma omp parallel
    {
        #pragma omp for
        for (x = 0; x < NX; x++)
        {
            for (int y = 0; y < NY; y++)
            {
                if ( (bnode[x][y] == 0) || (bnode[x][y] == 1) || (bnode[x][y] == 2) )
                {
                    long double uxeq = ux_f[x][y] + Force_x_f[x][y]/rho_f[x][y];
                    long double uyeq = uy_f[x][y] + Force_y_f[x][y]/rho_f[x][y];
                    for (int i = 0; i < Q; i++)
                    {
                        feq[i] = feq_R[i] = feq_B[i] = feq_force[i] = 0.0;
                        // equilibrium distribution
                        long double udotc_f = ux_f[x][y]*cx[i] + uy_f[x][y]*cy[i];
                        long double u2_f = pow(ux_f[x][y], 2) + pow(uy_f[x][y], 2);
                        feq[i] = wt[i]*rho_f[x][y]*(1.0 + 3.0*udotc_f + 4.5*pow(udotc_f, 2) - 1.5*u2_f);
                        feq_R[i] = wt[i]*rho_R_f[x][y]*(1.0 + 3.0*udotc_f + 4.5*pow(udotc_f, 2) - 1.5*u2_f);
                        feq_B[i] = wt[i]*rho_B_f[x][y]*(1.0 + 3.0*udotc_f + 4.5*pow(udotc_f, 2) - 1.5*u2_f);
                        long double udotc_force = uxeq*cx[i] + uyeq*cy[i];
                        long double u2_force = pow(uxeq, 2) + pow(uyeq, 2);
                        feq_force[i] = wt[i]*rho_f[x][y]*(1.0 + 3.0*udotc_force + 4.5*pow(udotc_force, 2) - 1.5*u2_force);
                        //printf("%d\t%d\t%d\t%Lf\t%Lf\t%Lf\t%Lf\n", x, y, i, feq[i], feq_R[i], feq_B[i], feq_force[i]);
                    }
                    // Calculating moments and meq
                    for (int i = 0; i < Q; i++)
                    {
                        meq_R[i] = meq_B[i] = mR[i] = mB[i] = 0.0;
                        for (int j = 0; j < Q; j++)
                        {
                            mR[i] += M[i][j]*r1[x][y][j];
                            meq_R[i] += M[i][j]*feq_R[j];
                            mB[i] += M[i][j]*b1[x][y][j];
                            meq_B[i] += M[i][j]*feq_B[j];
                            //printf("%d,%d\t%d,%d\t%Lf\t%Lf\t%Lf\t%Lf\n", x, y, i, j, mR[i], meq_R[i], mB[i], meq_B[i]);
                        }
                        //printf("%d\t%d\t%d\t%Lf\t%Lf\t%Lf\t%Lf\n", x, y, i, mR[i], meq_R[i], mB[i], meq_B[i]);
                    }
                    // Collision equation
                    for (int i = 0; i < Q; i++)
                    {
                        col_R[i] = col_B[i] = 0.0;
                        for (int j = 0; j < Q; j++)
                        {
                            col_R[i] += stmiv_f[x][y][i][j]*(mR[j] - meq_R[j]);
                            col_B[i] += stmiv_f[x][y][i][j]*(mB[j] - meq_B[j]);
                        }
                        long double force = feq_force[i] - feq[i];
                        r2[x][y][i] = r1[x][y][i] - col_R[i];
                        b2[x][y][i] = b1[x][y][i] - col_B[i];
                        f1[x][y][i] = r2[x][y][i] + b2[x][y][i] + force;
                        // Recoloring using d'Ortona's segregation method
                        r2[x][y][i] = (rho_R_f[x][y]/rho_f[x][y]) * (f1[x][y][i] + BETA_LKR*wt[i]*(rho_f[x][y] - rho_R_f[x][y])*(n_x[x][y]*cx[i] + n_y[x][y]*cy[i]));
                        b2[x][y][i] = f1[x][y][i] - r2[x][y][i];
                    }
                    if (rho_R_f[x][y] <= EVAP_LIM*rho_r_f)
                    {
                        for (int i = 0; i < Q; i++)
                        {
                            b2[x][y][i] = f1[x][y][i];
                            r2[x][y][i] = 0.0;
                        }
                    }
                    if (rho_B_f[x][y] <= EVAP_LIM*rho_r_f)
                    {
                        for (int i = 0; i < Q; i++)
                        {
                            r2[x][y][i] = f1[x][y][i];
                            b2[x][y][i] = 0.0;
                        }
                    }
                }
            }
        }
    }
    return;
}
I am trying to implement OpenMP in this function. But I am getting nan for feq_R[Q] and -nan for meq_R[Q] and meq_B[Q], only for a few combinations of x, y. Also, on each run the x, y values where I get the -nan kind of solution are different. I have also checked that rho_f[x][y] is not zero for those x, y. I have also tried omp for reduction for meq_R[Q] and meq_B[Q], only to be unsuccessful. FYI, the serial code runs without any problem. Any help is greatly appreciated.
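One thing that stands out: feq, feq_R, feq_B, feq_force, mR, mB, meq_R, meq_B, col_R and col_B are declared outside the parallel region, so all threads share the same scratch arrays; that data race could well produce exactly this kind of intermittent nan in different cells on every run. A minimal sketch of the fix is to move the declarations inside the parallel region so each thread gets private copies (listing the arrays in a private clause would be equivalent):

void collision_f()
{
    forces_f();
    #pragma omp parallel
    {
        /* per-thread scratch space: declared inside the parallel region,
           so each thread works on its own copies instead of racing */
        long double feq[Q], feq_R[Q], feq_B[Q], feq_force[Q];
        long double mR[Q], mB[Q], meq_R[Q], meq_B[Q];
        long double col_R[Q], col_B[Q];
        #pragma omp for
        for (int x = 0; x < NX; x++)
        {
            /* ... loop body exactly as above ... */
        }
    }
}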

Cuda - 2D Multiple double sums in each Matrix element

Same issue as this post (Cuda - Multiple sums in each vector element): how do you perform 2D block striding in both the x- and y-directions with varying summation limits? The 2D algorithm can be seen in the CPU code and the monolithic kernel. I included OpenMP for the CPU so as to get a fairer speedup result. If there is a way to increase the speed of the CPU function as well, I would be happy to find out.
This version of the code takes a 2D array and flattens it to a 1D array. I still use the 2D thread dim3 indexing so I can index the double summations more intuitively.
(p.s. all credit to user Robert Crovella for the 1D striding code.)
The code so far is,
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <sys/time.h>

typedef double df;
#define USECPSEC 1000000ULL
#define BSX 1<<5
#define BSY 1<<5
#define N 100
#define M 100

const bool sync = true;
const bool nosync = false;

unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
    if (use_sync == sync) cudaDeviceSynchronize();
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

int divUp(int a, int b) {return (a + b - 1) / b;}

float cpu_sum(int n, int m, df *a, df *b, df *c) {
    df q, r;
    #pragma omp parallel for collapse(2)
    for (int x = 0; x < n; x++) {
        for (int y = 0; y < m; y++) {
            q = 0.0f;
            for (int i = 0; i <= x; i++) {
                r = 0.0f;
                for (int j = 0; j <= y; j++) {
                    r += a[i * n + j] * b[(x - i) * n + y - j];
                }
                for (int j = 1; j < m - y; j++) {
                    r += a[i * n + j] * b[(x - i) * n + y + j]
                       + a[i * n + y + j] * b[(x - i) * n + j];
                }
                q += r;
            }
            for (int i = 1; i < n-x; i++) {
                r = 0.0f;
                for (int j = 0; j <= y; j++) {
                    r += a[i * n + j] * b[(x + i) * n + y - j]
                       + a[(x + i) * n + j] * b[ i * n + y - j];
                }
                for (int j = 1; j < m - y; j++) {
                    r += a[i * n + j] * b[(x + i) * n + y + j]
                       + a[(x + i) * n + y + j] * b[(x + i) * n + j]
                       + a[(x + i) * n + j] * b[i * n + y + j]
                       + a[(x + i) * n + y + j] * b[i * n + j];
                }
                q += r;
            }
            c[x * N + y] = 0.25f*q;
        }
    }
    return 0;
}

const int P2 = 5;
const int TPB = 1<<P2;
const unsigned row_mask = ~((0xFFFFFFFFU>>P2)<<P2);
__global__ void chebyprod_imp(int n, int m, df *a, df *b, df *c){
    __shared__ df sdata[TPB*TPB];
    int x = blockIdx.x;
    int y = blockIdx.y;
    int row_width_x = (((x)>(n-x))?(x):(n-x))+1;
    int row_width_y = (((y)>(m-y))?(y):(m-y))+1;
    int strides_x = (row_width_x>>P2) + ((row_width_x&row_mask)?1:0);
    int strides_y = (row_width_y>>P2) + ((row_width_y&row_mask)?1:0);
    int i = threadIdx.x;
    df tmp_a;
    df sum = 0.0f;
    for (int s=0; s < strides_x; s++) { // block-stride x loop
        int j = threadIdx.y;
        for (int u=0; u < strides_y; u++) { // block-stride y loop
            if (i < n && j < m) {tmp_a = a[i * n + j];}
            if (i <= x) {
                if (j <= y) {sum += tmp_a * b[(x - i) * n + y - j];}
                if ((j > 0) && (j < (m-y))) {sum += tmp_a * b[(x - i) * n + y + j]
                                                  + a[i * n + y + j] * b[(x - i) * n + j];}
            }
            if ((i > 0) && (i < (n-x))) {
                if (j <= y) {sum += tmp_a * b[(x + i) * n + y - j]
                                  + a[(x + i) * n + j] * b[ i * n + y - j];}
                if ((j > 0) && (j < (m-y))) {sum += tmp_a * b[(x + i) * n + y + j]
                                                  + a[(x + i) * n + y + j] * b[(x + i) * n + j]
                                                  + a[(x + i) * n + j] * b[i * n + y + j]
                                                  + a[(x + i) * n + y + j] * b[i * n + j];}
            }
            j += TPB;
        }
        i += TPB;
    }
    sdata[threadIdx.x * TPB + threadIdx.y] = sum;
    for (int s = TPB>>1; s > 0; s>>=1) {     // sweep reduction in x
        for (int u = TPB>>1; u > 0; u>>=1) { // sweep reduction in y
            __syncthreads();
            if (threadIdx.x < s && threadIdx.y < u) {
                sdata[threadIdx.x * TPB + threadIdx.y] += sdata[(threadIdx.x + s) * TPB + threadIdx.y + u];
            }
        }
    }
    if (!threadIdx.x && !threadIdx.y) c[x * n + y] = 0.25f*sdata[0];
}
__global__ void chebyprod(int n, int m, df *a, df *b, df *c){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    df q, r;
    if (x < n && y < m) {
        q = 0.0f;
        for (int i = 0; i <= x; i++) {
            r = 0.0f;
            for (int j = 0; j <= y; j++) {
                r += a[i * n + j] * b[(x - i) * n + y - j];
            }
            for (int j = 1; j < m - y; j++) {
                r += a[i * n + j] * b[(x - i) * n + y + j]
                   + a[i * n + y + j] * b[(x - i) * n + j];
            }
            q += r;
        }
        for (int i = 1; i < n-x; i++) {
            r = 0.0f;
            for (int j = 0; j <= y; j++) {
                r += a[i * n + j] * b[(x + i) * n + y - j]
                   + a[(x + i) * n + j] * b[ i * n + y - j];
            }
            for (int j = 1; j < m - y; j++) {
                r += a[i * n + j] * b[(x + i) * n + y + j]
                   + a[(x + i) * n + y + j] * b[(x + i) * n + j]
                   + a[(x + i) * n + j] * b[i * n + y + j]
                   + a[(x + i) * n + y + j] * b[i * n + j];
            }
            q += r;
        }
        c[x * N + y] = 0.25f*q;
    }
}
int main(void){
    int size = N*M*sizeof(df);
    df *a, *b, *c, *cc, *ci, *d_a, *d_b, *d_c, *d_ci;
    a = (df*)malloc(size);
    b = (df*)malloc(size);
    c = (df*)malloc(size);
    cc = (df*)malloc(size);
    ci = (df*)malloc(size);
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    cudaMalloc(&d_ci, size);
    #pragma omp parallel for collapse (2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            a[i * M + j] = 0.1f;
            b[i * M + j] = 0.2f;
        }
    }
    unsigned long long dt = dtime_usec(0);
    // Perform chebyprod on N elements
    cpu_sum(N, M, a, b, cc);
    dt = dtime_usec(dt,sync);
    printf("Time taken 2D CPU: %fs\n", dt/(float)USECPSEC);
    df dtc = dt/(float)USECPSEC;
    std::cout << "Vector cc: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << cc[k] << " ";
    std::cout <<"]\n";
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    dim3 dimBlock(BSX, BSY);
    dim3 dimGrid(divUp(N, BSX), divUp(M, BSY));
    //std::cout << "dimBlock: " << dimBlock << "\n dimGrid: " << dimGrid << "\n";
    dt = dtime_usec(0);
    // Perform chebyprod on N elements
    chebyprod<<< dimBlock, dimGrid >>>(N, M, d_a, d_b, d_c);
    dt = dtime_usec(dt,sync);
    printf("Time taken 2D monolithic kernel: %fs\n", dt/(float)USECPSEC);
    printf("Speedup: %fs\n", dtc/(dt/(float)USECPSEC));
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << c[k] << " ";
    std::cout <<"]\n";
    dt = dtime_usec(0);
    // Perform chebyprod on N elements
    chebyprod_imp<<< dimBlock, dimGrid >>>(N, M, d_a, d_b, d_ci);
    dt = dtime_usec(dt,sync);
    printf("Time taken 2D stride kernel: %fs\n", dt/(float)USECPSEC);
    cudaMemcpy(ci, d_ci, size, cudaMemcpyDeviceToHost);
    std::cout << "Vector ci: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << ci[k] << " ";
    std::cout <<"]\n";
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_ci);
    free(a);
    free(b);
    free(c);
    free(cc);
    free(ci);
}
For me, anyway, the results for the CPU code don't match between the cases where I compile with OpenMP support and without, if I omit -O3. I seem to get the correct results with OpenMP compilation if I also specify -O3. I'm not sure why that should matter for correctness, although it obviously has an impact on CPU code performance.
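(A plausible explanation, though not verified here: q and r in cpu_sum are declared outside the collapsed loops, so OpenMP treats them as shared and the threads race on them; with -O3 the compiler may happen to keep them in registers. Declaring them inside the loop body would make the parallel version well-defined regardless of optimization level:)

#pragma omp parallel for collapse(2)
for (int x = 0; x < n; x++) {
    for (int y = 0; y < m; y++) {
        df q = 0.0f;  /* private to each iteration, hence to each thread */
        df r;
        /* ... the four inner summation loops, unchanged ... */
        c[x * N + y] = 0.25f * q;
    }
}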
You seem to have gotten your grid and block sizing backwards:
chebyprod<<< dimBlock, dimGrid >>>(....
the first kernel config parameter is the grid dimension, not the block dimension. I'm not sure how this came about since you had it done correctly in your previous question.
As in the previous question, we need to pick a thread strategy and implement it correctly. You seemed to be confused about striding, so hopefully the code below will clarify things. The thread strategy I will use here is one warp per output point. A warp is a group of threads with a dimension of 32 (threads) in the x direction, and 1 in the y direction. Therefore the loop striding will be by an increment of 32 in the x direction, but only 1 in the y direction, to cover the entire space. The choice of thread strategy also affects grid sizing.
You seem to have jumbled the relationships that I think should exist for the two dimensions. The x direction, N, and n should all be connected. Likewise the y direction, M and m should all be connected (for example, M is the dimension in the y direction).
When it comes to 2D threadblocks, we want to arrange indexing for coalescing on the GPU such that the index that includes threadIdx.x is not multiplied by anything. (A simplified statement of coalescing is that we want adjacent threads in the warp to access adjacent elements in memory. Since threadIdx.x increases by 1 as we go from thread to thread in the warp, we want to use this characteristic to generate adjacent memory indexing. If we multiply threadIdx.x by anything except 1, we break the pattern.) You have this reversed - where the index including threadIdx.x is typically multiplied by the row dimension (N, or n). This really cannot be correct, and also does not make for good coalesced access. To solve this, we want to transpose our indexing and also transpose the data storage for a and b (and therefore c). In the code below, I have transposed the indexing for the data setup for a and b, and also the relevant indexing has been transposed in the striding kernel (only). In your non-striding kernel and also your CPU version, I have not transposed the indexing; I leave that as an exercise for you, if needed. For the results, numerically, it does not matter, because your entire a matrix has the same value at every location, and a similar statement can be made about your b matrix. Numerically, then, for this example code, transposing (or not) has no bearing on the result. But it matters for performance (of the striding kernel, at least). Also note that I believe performing the indexing "transpose" on the "monolithic" kernel should also improve its performance. I don't know if it would affect the performance of the CPU version.
I've also added back in the const __restrict__ usage that I included in my previous answer. According to my testing, on "smaller" GPUs this provides noticeable performance benefit. It's not strictly necessary for correctness, however. Here's a worked example with the above changes that gives numerically matching results for all 3 test cases:
$ cat t1498.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>
#include <sys/time.h>

typedef double df;
#define USECPSEC 1000000ULL
#define BSX 1<<5
#define BSY 1<<5
#define N 100
#define M 100

const bool sync = true;
const bool nosync = false;

unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
    if (use_sync == sync) cudaDeviceSynchronize();
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

int divUp(int a, int b) {return (a + b - 1) / b;}

void cpu_sum(int n, int m, df *a, df *b, df *c) {
    df q, r;
    #pragma omp parallel for collapse(2)
    for (int x = 0; x < n; x++) {
        for (int y = 0; y < m; y++) {
            q = 0.0f;
            for (int i = 0; i <= x; i++) {
                r = 0.0f;
                for (int j = 0; j <= y; j++) {
                    r += a[i * n + j] * b[(x - i) * n + y - j];
                }
                for (int j = 1; j < m - y; j++) {
                    r += a[i * n + j] * b[(x - i) * n + y + j]
                       + a[i * n + y + j] * b[(x - i) * n + j];
                }
                q += r;
            }
            for (int i = 1; i < n-x; i++) {
                r = 0.0f;
                for (int j = 0; j <= y; j++) {
                    r += a[i * n + j] * b[(x + i) * n + y - j]
                       + a[(x + i) * n + j] * b[ i * n + y - j];
                }
                for (int j = 1; j < m - y; j++) {
                    r += a[i * n + j] * b[(x + i) * n + y + j]
                       + a[(x + i) * n + y + j] * b[(x + i) * n + j]
                       + a[(x + i) * n + j] * b[i * n + y + j]
                       + a[(x + i) * n + y + j] * b[i * n + j];
                }
                q += r;
            }
            c[x * N + y] = 0.25f*q;
        }
    }
}

// choose one warp per output point
const int P2 = 5;  // assumes warp size is 32
const unsigned row_mask = ~((0xFFFFFFFFU>>P2)<<P2);

__global__ void chebyprod_imp(int n, int m, const df * __restrict__ a, const df * __restrict__ b, df * __restrict__ c){
    int x = blockIdx.x;
    int y = threadIdx.y+blockDim.y*blockIdx.y;
    int width_x = (((x)>(n-x))?(x):(n-x))+1;
    int height_y = (((y)>(m-y))?(y):(m-y))+1;
    int strides_x = (width_x>>P2) + ((width_x&row_mask)?1:0);
    int strides_y = height_y;
    int i = threadIdx.x;
    df tmp_a;
    df sum = 0.0f;
    if ((x < n) && (y < m)){
        for (int s=0; s < strides_x; s++) { // warp-stride x loop
            for (int j=0; j < strides_y; j++) { // y loop
                if (i < n && j < m) {tmp_a = a[j * n + i];}
                if (i <= x) {
                    if (j <= y) {sum += tmp_a * b[(y - j) * n + x - i];}
                    if ((j > 0) && (j < (m-y))) {sum += tmp_a * b[(y+j) * n + x - i] + a[(y+j)* n + i] * b[j*n+(x - i)];}
                }
                if ((i > 0) && (i < (n-x))) {
                    if (j <= y) {sum += tmp_a * b[(y-j) * n + x+i] + a[j*n + (x + i)] * b[(y - j)*n + i];}
                    if ((j > 0) && (j < (m-y)))
                        {sum += tmp_a * b[(y+j) * n + x+i]
                              + a[(y+j) * n + x + i] * b[j*n+(x + i)]
                              + a[j*n + (x + i)] * b[(y+j)*n + i]
                              + a[(y+j)*n + x + i] * b[j*n+i];}
                }
            }
            i += 32;
        }
        // warp-shuffle reduction
        for (int offset = warpSize>>1; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
        if (!threadIdx.x) c[y*m+x] = 0.25f*sum;
    }
}

__global__ void chebyprod(int n, int m, df *a, df *b, df *c){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    df q, r;
    if (x < n && y < m) {
        q = 0.0f;
        for (int i = 0; i <= x; i++) {
            r = 0.0f;
            for (int j = 0; j <= y; j++) {
                r += a[i * n + j] * b[(x - i) * n + y - j];
            }
            for (int j = 1; j < m - y; j++) {
                r += a[i * n + j] * b[(x - i) * n + y + j]
                   + a[i * n + y + j] * b[(x - i) * n + j];
            }
            q += r;
        }
        for (int i = 1; i < n-x; i++) {
            r = 0.0f;
            for (int j = 0; j <= y; j++) {
                r += a[i * n + j] * b[(x + i) * n + y - j]
                   + a[(x + i) * n + j] * b[ i * n + y - j];
            }
            for (int j = 1; j < m - y; j++) {
                r += a[i * n + j] * b[(x + i) * n + y + j]
                   + a[(x + i) * n + y + j] * b[(x + i) * n + j]
                   + a[(x + i) * n + j] * b[i * n + y + j]
                   + a[(x + i) * n + y + j] * b[i * n + j];
            }
            q += r;
        }
        c[x * N + y] = 0.25f*q;
    }
}

int main(void){
    int size = N*M*sizeof(df);
    df *a, *b, *c, *cc, *ci, *d_a, *d_b, *d_c, *d_ci;
    a = (df*)malloc(size);
    b = (df*)malloc(size);
    c = (df*)malloc(size);
    cc = (df*)malloc(size);
    ci = (df*)malloc(size);
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    cudaMalloc(&d_ci, size);
    #pragma omp parallel for collapse (2)
    for (int j = 0; j < M; j++) {
        for (int i = 0; i < N; i++) {
            a[j * N + i] = 0.1f;
            b[j * N + i] = 0.2f;
        }
    }
    unsigned long long dt = dtime_usec(0);
    // Perform chebyprod on N elements
    cpu_sum(N, M, a, b, cc);
    dt = dtime_usec(dt,sync);
    printf("Time taken 2D CPU: %fs\n", dt/(float)USECPSEC);
    df dtc = dt/(float)USECPSEC;
    std::cout << "Vector cc: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << cc[k] << " ";
    std::cout <<"]\n";
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    dim3 dimBlock(BSX, BSY);
    dim3 dimGrid(divUp(N, BSX), divUp(M, BSY));
    //std::cout << "dimBlock: " << dimBlock << "\n dimGrid: " << dimGrid << "\n";
    dt = dtime_usec(0);
    // Perform chebyprod on N elements
    chebyprod<<< dimGrid, dimBlock >>>(N, M, d_a, d_b, d_c);
    dt = dtime_usec(dt,sync);
    printf("Time taken 2D monolithic kernel: %fs\n", dt/(float)USECPSEC);
    printf("Speedup: %fs\n", dtc/(dt/(float)USECPSEC));
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << c[k] << " ";
    std::cout <<"]\n";
    dt = dtime_usec(0);
    // Perform chebyprod on N elements
    dim3 dimGrid2(N, (M+dimBlock.y-1)/dimBlock.y);
    chebyprod_imp<<< dimGrid2, dimBlock >>>(N, M, d_a, d_b, d_ci);
    dt = dtime_usec(dt,sync);
    printf("Time taken 2D stride kernel: %fs\n", dt/(float)USECPSEC);
    printf("Speedup: %fs\n", dtc/(dt/(float)USECPSEC));
    cudaMemcpy(ci, d_ci, size, cudaMemcpyDeviceToHost);
    std::cout << "Vector ci: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << ci[k] << " ";
    std::cout <<"]\n";
    df max_error = 0;
    for (int k = 0; k < N*M; k++)
        max_error = fmax(max_error, fabs(c[k] - ci[k]));
    std::cout << "Max diff = " << max_error << std::endl;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_ci);
    free(a);
    free(b);
    free(c);
    free(cc);
    free(ci);
}
$ nvcc -O3 -Xcompiler -fopenmp -arch=sm_52 -o t1498 t1498.cu
$ ./t1498
Time taken 2D CPU: 0.034830s
Vector cc: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D monolithic kernel: 0.033687s
Speedup: 1.033930s
Vector c: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D stride kernel: 0.013526s
Speedup: 2.575041s
Vector ci: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Max diff = 8.52651e-13
$
CUDA 10.1.105, Fedora 29, GTX 960
Note that when we run this same test on a Tesla V100, which can take the most advantage of the "extra" threads available in the striding kernel case, the benefit is more obvious:
$ OMP_NUM_THREADS=32 ./t1498
Time taken 2D CPU: 0.031610s
Vector cc: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D monolithic kernel: 0.018228s
Speedup: 1.734145s
Vector c: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Time taken 2D stride kernel: 0.000731s
Speedup: 43.242137s
Vector ci: [ 198.005 197.01 196.015 195.02 194.025 193.03 192.035 191.04 190.045 189.05 ]
Max diff = 8.52651e-13
If you perform the indexing "transpose" on your monolithic kernel similar to what I have done in the striding kernel, I think you'll end up in a performance situation that is roughly similar to where you ended up in the last question. Little or no performance benefit for the striding kernel over your monolithic kernel on a "small" GPU. ~5x improvement on a "large" GPU.

Newton method implementation for finding initial values, with Dormand Prince to solve differential equations in C

The following code works like a charm to solve a system of differential equations (the fcn function in the code) with correct initial values. However, the point of the task is to replace the initial values y_1(0) and y_2(0) with some random values, and implement an iterative method to find the correct initial values that solve the equation. I already know how to check whether the values are correct, since by definition the output of ddopri5 should give y_2(1) and y_3(1) as 0. How do I implement Newton-Raphson for this problem?
#include <stdio.h>
#include <math.h>
#include <stdbool.h>

double ddopri5(void fcn(double, double *, double *), double *y);
void fcn(double t, double *y, double *f);

double alpha;
double eps;

int main(void){
    double y[4];
    //eps = 1.e-9;
    printf("Enter alpha:\n");
    scanf("%lg", &alpha);
    printf("Enter epsilon:\n");
    scanf("%lg", &eps);
    y[0] = 1.0;             // x1(0)
    y[1] = -1.22565282791;  // x2(0)
    y[2] = -0.274772807644; // p1(0)
    y[3] = 0.0;             // p2(0)
    ddopri5(fcn, y);
}

void fcn(double t, double *y, double *f){
    /* double h = 0.25; */
    f[0] = y[1];
    f[1] = y[3] - sqrt(2)*y[0]*exp(-alpha*t);
    f[2] = sqrt(2)*y[3]*exp(-alpha*t) + y[0];
    f[3] = -y[2];
}

double ddopri5(void fcn(double, double *, double *), double *y){
    double t, h, a, b, tw, chi;
    double w[4], k1[4], k2[4], k3[4], k4[4], k5[4], k6[4], k7[4], err[4], dy[4];
    int i;
    double errabs;
    int iteration;

    iteration = 0;
    //eps = 1.e-9;
    h = 0.1;
    a = 0.0;
    b = 1; //3.1415926535;
    t = a;
    while (t < b - eps){
        printf("%lg\n", eps);
        fcn(t, y, k1);
        tw = t + (1.0/5.0)*h;
        for (i = 0; i < 4; i++){
            /*printf("k1[%i] = %.15lf \n", i, k1[i]);*/
            w[i] = y[i] + h*(1.0/5.0)*k1[i];
        }
        fcn(tw, w, k2);
        tw = t + (3.0/10.0)*h;
        for (i = 0; i < 4; i++){
            /*printf("k2[%i] = %.15lf \n", i, k2[i]);*/
            w[i] = y[i] + h*((3.0/40.0)*k1[i] + (9.0/40.0)*k2[i]);
        }
        fcn(tw, w, k3);
        tw = t + (4.0/5.0)*h;
        for (i = 0; i < 4; i++){
            /*printf("k3[%i] = %.15lf \n", i, k3[i]);*/
            w[i] = y[i] + h*((44.0/45.0)*k1[i] - (56.0/15.0)*k2[i] + (32.0/9.0)*k3[i]);
        }
        fcn(tw, w, k4);
        tw = t + (8.0/9.0)*h;
        for (i = 0; i < 4; i++){
            /*printf("k4[%i] = %.15lf \n", i, k4[i]);*/
            w[i] = y[i] + h*((19372.0/6561.0)*k1[i] - (25360.0/2187.0)*k2[i] + (64448.0/6561.0)*k3[i] - (212.0/729.0)*k4[i]);
        }
        fcn(tw, w, k5);
        tw = t + h;
        for (i = 0; i < 4; i++){
            /*printf("k5[%i] = %.15lf \n", i, k5[i]);*/
            w[i] = y[i] + h*((9017.0/3168.0)*k1[i] - (355.0/33.0)*k2[i] + (46732.0/5247.0)*k3[i] + (49.0/176.0)*k4[i] - (5103.0/18656.0)*k5[i]);
        }
        fcn(tw, w, k6);
        tw = t + h;
        for (i = 0; i < 4; i++){
            /*printf("k6[%i] = %.15lf \n", i, k6[i]);*/
            w[i] = y[i] + h*((35.0/384.0)*k1[i] + (500.0/1113.0)*k3[i] + (125.0/192.0)*k4[i] - (2187.0/6784.0)*k5[i] + (11.0/84.0)*k6[i]);
        }
        fcn(tw, w, k7);
        errabs = 0;
        for (i = 0; i < 4; i++){
            /*printf("k7[%i] = %.15lf \n", i, k7[i]);*/
            dy[i] = h*((35.0/384.0)*k1[i] + (500.0/1113.0)*k3[i] + (125.0/192.0)*k4[i] - (2187.0/6784.0)*k5[i] + (11.0/84.0)*k6[i]);
            err[i] = h*((71.0/57600.0)*k1[i] - (71.0/16695.0)*k3[i] + (71.0/1920.0)*k4[i] - (17253.0/339200.0)*k5[i] + (22.0/525.0)*k6[i] - (1.0/40.0)*k7[i]);
            /*printf("err[%i] = %.15lf \n", i, err[i]);*/
            errabs += err[i]*err[i];
        }
        errabs = sqrt(errabs);
        printf("errabs = %.15lf\n", errabs);
        if (errabs < eps){
            t += h;
            printf(" FROM IF \t t = %.25lf, \n h = %.25lf, \n errabs = %.25lf, \n iteration = %i . \n", t, h, errabs, iteration);
            for (i = 0; i < 4; i++){
                y[i] += dy[i];
            }
        }
        /* Automatic step-size selection */
        chi = errabs/eps;
        chi = pow(chi, (1.0/6.0));
        if (chi > 10) chi = 10;
        if (chi < 0.1) chi = 0.1;
        h *= 0.95/chi;
        if (t + h > b) h = b - t;
        iteration++;
        printf("t = %.25lf \t h = %.25lf\n", t, h);
        /*if(iteration > 5) break;*/
        printf("end \n");
        for (i = 0; i < 4; i++){
            printf("y[%i] = %.15lf \n", i, y[i]);
        }
        if (iteration > 30000) break;
    }
    return 0;
}
Try this:
Y0 = initial_guess
while (true) {
    F = ddopri(Y0)
    Error = F - F_correct
    if (Error small enough)
        break
    J = jacobian(ddopri, Y0)   // this is the matrix dF/dY0
    Y0 = Y0 - J^(-1) * Error   // here you have to solve a linear system
}
The Jacobian can be obtained using finite differences, i.e. bump the elements of Y0 up and down one at a time, compute F, and take finite differences.
To be clear, element (i,j) of matrix J is dF_i/dY0_j
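As a concrete illustration for this particular problem, here is a minimal sketch, with the assumptions spelled out: the two unknowns are y[1](0) and y[2](0), the residuals are the components of y at t = 1 that should vanish (per the question's description), and shoot() is a hypothetical wrapper that calls the question's ddopri5 on a fresh copy of the state, since ddopri5 modifies y in place:

#include <math.h>

/* Hypothetical wrapper: integrate from the two unknown initial values
   to t = 1 and return the two residuals that should be zero. */
static void shoot(double u0, double u1, double *res)
{
    double y[4] = { 1.0, u0, u1, 0.0 };  /* y[1](0), y[2](0) are the unknowns */
    ddopri5(fcn, y);                     /* integrates y in place to t = 1 */
    res[0] = y[1];
    res[1] = y[2];
}

/* Newton iteration on the two initial values, with a finite-difference
   Jacobian and a direct 2x2 solve. A sketch, not tuned for robustness. */
static void newton_shoot(double *u0, double *u1)
{
    const double d = 1e-7;  /* finite-difference bump size */
    for (int it = 0; it < 50; it++) {
        double r[2], rp[2], J[2][2];
        shoot(*u0, *u1, r);
        if (fabs(r[0]) + fabs(r[1]) < 1e-10) break;   /* converged */
        shoot(*u0 + d, *u1, rp);                      /* bump first unknown */
        J[0][0] = (rp[0] - r[0]) / d;  J[1][0] = (rp[1] - r[1]) / d;
        shoot(*u0, *u1 + d, rp);                      /* bump second unknown */
        J[0][1] = (rp[0] - r[0]) / d;  J[1][1] = (rp[1] - r[1]) / d;
        double det = J[0][0] * J[1][1] - J[0][1] * J[1][0];
        /* Y0 = Y0 - J^(-1) * Error, solved directly for the 2x2 case */
        *u0 -= ( J[1][1] * r[0] - J[0][1] * r[1]) / det;
        *u1 -= (-J[1][0] * r[0] + J[0][0] * r[1]) / det;
    }
}

Whether the residuals are y[1](1), y[2](1) or y[2](1), y[3](1) depends on the indexing convention intended in the question; adjust res[] accordingly.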
