I have Mac OS X Yosemite 10.10.1 (14B25).
I have some problems with compiling the code. Here it is:
#include <stdio.h>
#include <mpi.h>
#define n 3
#define repeats 1
double abs(double item)
{
return (item > 0) ? item : -item;
}
int swap_raws (double **a, int p, int q)
{
if (p >= 0 && p < n && q >= 0 && q < n)
{
if (p == q)
return 0;
for (int i = 0; i < n; i++)
{
double temp = a[p][i];
a[p][i] = a[q][i];
a[q][i] = temp;
}
return 0;
}
else
return -1;
}
double f_column (int rank, int size, double *least)
{
double t1, t2, tbeg, tend, each_least = 1, least0;
int map[n];
double **a = malloc (sizeof (*a) * n);
int i, j, k;
for (i = 0; i < n; i++)
a[i] = malloc (sizeof (*a[i]) * n);
if (rank == 0)
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
a[i][j] = 1.0 / (i + j + 1);
MPI_Bcast (a, n * n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
for (i = 0; i < n; i++)
map[i] = i % size;
MPI_Barrier (MPI_COMM_WORLD);
t1 = MPI_Wtime ();
for (k = 0; k < n - 1; k++)
{
double max = abs (a[k][k]);
int column = k;
for (j = k + 1; j < n; j++)
{
double absv = abs (a[k][j]);
if (absv > max)
{
max = absv;
column = j;
}
}
if (map[k] == rank && column != k && swap_raws (a, k, column))
{
printf("ERROR SWAPPING %d and %d columns\n", k, column);
return -1;
}
MPI_Bcast (&a[k], n, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
MPI_Bcast (&a[column], n, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
if (map[k] == rank)
for (i = k + 1; i < n; i++)
a[k][i] /= a[k][k];
MPI_Barrier (MPI_COMM_WORLD);
MPI_Bcast (&a[k][k+1], n - k - 1, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
for (i = k + 1; i < n; i++)
if (map[i] == rank)
for (j = k + 1; j < n; j++)
a[j][i] -= a[j][k] * a[i][j];
}
t2 = MPI_Wtime ();
for (i = 0; i < n; i++)
if (map[i] == rank)
for (j = 0; j < n; j++)
{
double absv = abs (a[i][j]);
if (each_least > absv)
each_least = absv;
//printf ("a[%d][%d] = %lg\n", j, i, a[i][j]);
}
MPI_Reduce (&each_least, &least0, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
MPI_Reduce (&t1, &tbeg, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
MPI_Reduce (&t2, &tend, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
for (i = 0; i < n; i++)
free (a[i]);
free (a);
if (rank == 0)
{
*least = least0;
return (tend - tbeg);
}
}
int main (int argc, char *argv[])
{
int rank, size;
double min, max, aver, least;
if (n == 0)
return 0;
MPI_Init (&argc, &argv);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
MPI_Comm_size (MPI_COMM_WORLD, &size);
// It works!
//double try = f_column_non_parallel (rank, size, &least);
double try = f_column (rank, size, &least);
aver = max = min = try;
for (int i = 1; i < repeats; i++)
{
//double try = f_column_non_parallel (rank, size, &least);
double try = f_column (rank, size, &least);
if (try < min)
min = try;
else if (try > max)
max = try;
aver += try;
}
aver /= repeats;
MPI_Finalize ();
if (rank == 0)
printf("N: %d\nMIN: %f\nMAX: %f\nAVER: %f\nLEAST: %lg\n", size, min, max, aver, least);
return 0;
}
I have the Gilbert matrix. a(i)(j) = 1 / (i + j + 1) for i,j from 0 to n
This code should find LU decomposition using MPI in order to do it in the parallel way.
The first one process initialises the array and then broadcasts it to other processes.
Then I find the maximum in the raw and swap that columns. Then I would like to broadcast that data to every process, i.e. using MPI_Barrier (MPI_COMM_WORLD); but it says:
So, I don't know what's happened and how I can fix that problem. The same variant of the program runs without using processes and non-parallel version but doesn't work here.
If you find the solution, the example should work like that (I was calculating it by myself, you can check it too, but I can admit it's true). The matrix (here j and i vertically and horizontally respectively, it works in not such a convenient way for people but you should take it):
1 1/2 1/3 1 1/2 1/3 1 1/2 1/3 |1 1/2 1/3 |
1/2 1/3 1/4 -> 1/2 1/12 1/12 -> 1/2 1/12 1 -> |1/2 1/12 1/12 | <- answer
1/3 1/4 1/5 1/3 1/12 4/45 1/3 1/12 1/180 |1/3 1 1/180|
The source matrix so:
|1 0 0| |1 1/2 1/3 | |1 1/2 1/3|
A = |1/2 1 0| * |0 1/12 1/12 | = |1/2 1/3 1/4|
|1/3 1 1| |0 0 1/180| |1/3 1/4 1/5|
Can you help me to find out made mistake? Thank you in advance :)
Your program has a bug in the following part of the code:
double **a = malloc (sizeof (*a) * n);
[...snip...]
MPI_Bcast (a, n * n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
You are allocating 'n' pointers in "a", not an 'n * n' array. So when you do an 'n * n' size MPI_Bcast of "a", you are asking MPI to transfer from garbage memory locations that is not allocated. This is causing MPI to segfault.
You can change "a" to simply "double *" instead of "double **" and allocate 'n * n' doubles in there to fix this issue.
What grieves me the most is that f_column() is supposed to return a double, but the return value is undefined when rank != 0.
This comment caught my attention:
// It works!
//double try = f_column_non_parallel (rank, size, &least);
double try = f_column (rank, size, &least);
It suggests that the previous version of f_column() was working, and that you ran into troubles when attempting to parallelize it (I'm guessing that's what you're doing now).
How this could lead to a segfault is not immediately apparent to me though. I'd expect a floating point exception.
A couple of other points:
I'm not too comfortable with your memory allocation code (I'd probably use calloc() instead of malloc(), and sizeof() on explicit data types, etc...); it just freaks me out to see things like a[i] = malloc(sizeof (*a[i]) * n);, but it's just a matter of style, really.
You appear to have proper bound checking (indices over a are always positive and < n).
Oh, and you're redefining abs(), which is probably not a good idea.
Try to compile your code in debug mode, and run it with gdb; also run it through valgrind if you can, MacOS X should be supported by now.
You should probably take a closer look at your compiler warnings ;-)
Related
I want to implement the following equation in C:
C[l,q,m] = A[m,q,k] * B[k,l]
where the repeated index k is being summed over.
I implemented this in three ways:
Naive implementation with loops
Using the BLAS routine DGEMV (matrix-vector multiplication)
Using the BLAS routine DGEMM (matrix-matrix multiplication)
This is the minimal not-working code:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <cblas.h>
void main()
{
const size_t n = 3;
const size_t n2 = n*n;
const size_t n3 = n*n*n;
/* Fill rank 3 tensor with random numbers */
double a[n3];
for (size_t i = 0; i < n3; i++) {
a[i] = (double) rand() / RAND_MAX;
}
/* Fill matrix with random numbers */
double b[n2];
for (size_t i = 0; i < n2; i++) {
b[i] = (double) rand() / RAND_MAX;
}
/* All loops */
double c_exact[n3];
memset(c_exact, 0, n3 * sizeof(double));
for (size_t l = 0; l < n; l++) {
for (size_t q = 0; q < n; q++) {
for (size_t m = 0; m < n; m++) {
for (size_t k = 0; k < n; k++) {
c_exact[l*n2+q*n+m] += a[m*n2+q*n+k] * b[k*n+l];
}
}
}
}
/* Matrix-vector */
double c_mv[n3];
memset(c_mv, 0, n3 * sizeof(double));
for (size_t m = 0; m < n; m++) {
for (size_t l = 0; l < n; l++) {
cblas_dgemv(
CblasRowMajor, CblasNoTrans, n, n, 1.0, &a[m*n2],
n, &b[l], n, 0.0, &c_mv[l*n2+m], n);
}
}
/* Matrix-matrix */
double c_mm[n3];
memset(c_mm, 0, n3 * sizeof(double));
for (size_t m = 0; m < n; m++) {
cblas_dgemm(
CblasRowMajor, CblasTrans, CblasTrans, n, n, n, 1.0, b, n,
&a[m*n2], n, 0.0, &c_mm[m], n2);
}
/* Compute difference */
double diff_mv = 0.0;
double diff_mm = 0.0;
for (size_t idx = 0; idx < n3; idx++) {
diff_mv += c_mv[idx] - c_exact[idx];
diff_mm += c_mm[idx] - c_exact[idx];
}
printf("Difference matrix-vector: %e\n", diff_mv);
printf("Difference matrix-matrix: %e\n", diff_mm);
}
And this the output:
Difference matrix-vector: 0.000000e+00
Difference matrix-matrix: -1.188678e+01
i.e. the DGEMV implementation is correct, the DGEMM not - I really don't understand this. I switched around the multiplication (matrix-matrix multiplication is non commutative) and transposed both to get the right order C[l,q,m] instead of C[q,l,m], but I also tried it without switching/transposing and it does not work.
Can anyone please help?
Thanks.
edit: I thought about it a bit and feel like I'm trying to do something that DGEMM doe not support? Namely I try to insert a submatrix into C[:,:,m], which means that both the leading and trailing index are not contiguous in memory. DGEMM allows me to set the parameter LDC, which in this case needs to be n^2, but it does not know that also the second index is non-contiguous with a stride of n (and there is no parameter to tell it?). So why does DGEMM not support a second parameter for the stride of the trailing dimension?
I'm trying to make a simple console application in C which will calculate the determinant of a Matrix using the Gauss elimination. after a lot of tests I found out that my program is not working because of the core dumped error.After 2 days of editing and undoing, i could not find the problem.
Any help is more than welcomed.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int recherche_pivot(int k, int n, float *A)
{
int i, j;
if (A[((k - 1) * n + k) - 1] != 0)
{
return k;
}
else
{ //parcours du reste de la colonne
for (i = k + 1; i <= n; i++)
{
if (A[((k - 1) * n + i) - 1] != 0)
{
return i;
}
}
return -1;
}
}
void fois(int n, float p, int i, float * A, float *b, float * x)
{
int a;
for (a = 1; a <= n; a++)
{
x[a - 1] = A[((i - 1) * n + a) - 1] * p;
}
x[n] = b[i - 1] * p;
}
void afficher_system(int n, float * X, float *b)
{
int i, j;
for (i = 1; i <= n; i++)
{
for (j = 1; j <= n; j++)
printf("%f ", X[((i - 1) * n + j) - 1]);
printf(" | %f", b[i - 1]);
printf("nn");
}
printf("nnnn");
}
void saisirmatrice(int n, float *A)
{
int i, j;
for (i = 1; i <= n; i++)
for (j = 1; j <= n; j++)
scanf("%f", &A[((i - 1) * n + j) - 1]);
}
void affichermatrice(int n, float *A)
{
int i, j;
for (i = 1; i <= n; i++)
for (j = 1; j <= n; j++)
printf("A[%d][%d] = %fn", i, j, A[((i - 1) * n + j) - 1]);
}
void elemination(int n, int k, float *b, float *A)
{
int i, l, j;
float * L, piv;
L = (float *) malloc((n) * sizeof(float));
for (i = k + 1; i <= n; i++)
{
piv = -1 * (A[((i - 1) * n + k) - 1] / A[((k - 1) * n + k) - 1]);
fois(n, piv, k, A, b, L);
//afficher_vecteur(n,L);
for (j = 1; j <= n; j++)
{
A[((i - 1) * n + j) - 1] = A[((i - 1) * n + j) - 1] + L[j - 1];
}
b[i - 1] = b[i - 1] + L[n];
afficher_system(n, A, b);
}
}
void permutter(int n, float * A, int i, int j, float * b)
{
int a;
float t[n + 1];
for (a = 1; a <= n; a++)
{
t[a - 1] = A[((i - 1) * n + a) - 1];
A[((i - 1) * n + a) - 1] = A[((j - 1) * n + a) - 1];
A[((j - 1) * n + a) - 1] = t[a - 1];
}
t[n] = b[i - 1];
b[i - 1] = b[j - 1];
b[j - 1] = t[n];
}
void main()
{
float * A, det, *L, *R, *b, s;
int i, j, i0, n, k, stop = 0;
printf("Veuillez donner la taille de la matrice");
scanf("%d", &n);
A = (float *) malloc(sizeof(float) * (n * n));
L = (float*) malloc(n * sizeof(float));
R = (float*) malloc(n * sizeof(float));
b = (float*) malloc(n * sizeof(float));
printf("Veuillez remplir la matrice");
saisirmatrice(n, A);
det = 1;
stop = 0;
k = 1;
do
{
do
{
i0 = recherche_pivot(k, n, A);
if (i0 == k)
{
//Elémination
elemination(n, k, b, A);
k++;
}
else if (i0 == -1)
{
stop = 1;
}
else
{ //cas ou ligne pivot=i0 != k
//permutation
det = -det;
permutter(n, A, k, i0, b);
//elemination
elemination(n, k, b, A);
//afficher_matrice(n,A);
k++;
}
} while ((k <= n) && (stop == 0));
} while (stop == 1 || k == n);
for (i = 1; i < n; i++)
{
det = det * A[((i - 1) * n + i) - 1];
}
printf("Le determinant est :%f", det);
free(A);
free(L);
free(R);
free(b);
}
There are many problems in the above code. Since arrays are zero-indexed in C, you should count the rows and columns of your matrices starting from zero, instead of counting from 1 and then attempting to convert when array-indexing. There is no need to cast the result of malloc(), and it is better to use an identifier rather than an explicit type as the argument for the sizeof operator:
A = malloc(sizeof(*A) * n * n));
You allocate space for L and R in main(), and then never use these pointers until the end of the program when they are freed. Then you allocate for L within the elemination() function; but you never free this memory, so you have a memory leak. You also allocate space for b in main(), but you don't store any values in b before passing it to the elemination() function. This is bound to cause problems.
There is no need for dynamic allocation here in the first place; I suggest using a variable length array to store the elements of the matrix. These have been available since C99, and will allow you to avoid all of the allocation issues.
There is a problem in the recherche_pivot() function, where you compare:
if(A[((k - 1) * n + i) - 1] != 0) {}
This is a problem because the array element is a floating point value which is the result of arithmetic operations; this value should not be directly compared with 0. I suggest selecting an appropriate DELTA value to represent a zero range, and instead comparing:
#define DELTA 0.000001
...
if (fabs(A[((k - 1) * n + i) - 1]) < DELTA) {}
In the permutter() function you use an array, float t[n];, to hold temporary values. But an array is unnecessary here since you don't need to save these temporary values after the swap; instead just use float t;. Further, when you interchange the values in b[], you use t[n] to store the temporary value, but this is out of bounds.
The elemination() function should probably iterate over all of the rows (excepting the kth row), rather that starting from the kth row, or it should start at the k+1th row. As it is, the kth row is used to eliminate itself. Finally, the actual algorithm that you use to perform the Gaussian elimination in main() is broken. Among other things, the call permutter(n, A, k, i0, b); swaps the kth row with the i0th row, but i0 is the pivot column of the kth row. This makes no sense.
It actually looks like you want to do more than just calculate determinants with this code, since you have b, which is the constant vector of a linear system. This is not needed for the task alluded to in the title of your question. Also, it appears that your code gives a result of 1 for any 1X1 determinant. This is incorrect; it should be the value of the single number in this case.
The Gaussian elimination method for calculating the determinant requires that you keep track of how many row-interchanges are performed, and that you keep a running product of any factors by which individual rows are multiplied. Adding a multiple of one row to another row to replace that row does not change the value of the determinant, and this is the operation used in the reduce() function below. The final result is the product of the diagonal entries in the reduced matrix, multiplied by -1 once for every row-interchange operation, divided by the product of all of the factors used to scale individual rows. In this case, there are no such factors, so the result is simply the product of the diagonal elements of the reduced matrix, with the sign correction. This is the method used by the code posted in the original question.
There were so many issues here that I just wrote a fresh program that implements this algorithm. I think that it is close, at least in spirit, to what you were trying to accomplish. I did add some input validation for the size of the matrix, checking to be sure that the user inputs a positive number, and prompting for re-entry if the input is bad. The input loop that fills the matrix would benefit from similar input validation. Also note that the input size is stored in a signed int, to allow checks for negative input, and a successful input is cast and stored in a variable of type size_t, which is an unsigned integer type guaranteed to hold any array index. This is the correct type to use when indexing arrays, and you will note that size_t is used throughout the program.
#include <stdio.h>
#include <math.h>
#include <stdbool.h>
#define DELTA 0.000001
void show_matrix(size_t mx_sz, double mx[mx_sz][mx_sz]);
void interchange(size_t r1, size_t r2, size_t mx_sz, double mx[mx_sz][mx_sz]);
void reduce(double factor, size_t r1, size_t r2,
size_t mx_sz, double mx[mx_sz][mx_sz]);
size_t get_pivot(size_t row, size_t mx_sz, double mx[mx_sz][mx_sz]);
double find_det(size_t mx_sz, double mx[mx_sz][mx_sz]);
int main(void)
{
size_t n;
int read_val, c;
printf("Enter size of matrix: ");
while (scanf("%d", &read_val) != 1 || read_val < 1) {
while ((c = getchar()) != '\n' && c != EOF) {
continue; // discard extra characters
}
printf("Enter size of matrix: ");
}
n = (size_t) read_val;
double matrix[n][n];
printf("Enter matrix elements:\n");
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < n; j++) {
scanf("%lf", &matrix[i][j]);
}
}
printf("You entered:\n");
show_matrix(n, matrix);
putchar('\n');
double result = find_det(n, matrix);
show_matrix(n, matrix);
putchar('\n');
printf("Determinant: %f\n", result);
return 0;
}
void show_matrix(size_t n, double mx[n][n])
{
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < n; j++) {
printf("%7.2f", mx[i][j]);
}
putchar('\n');
}
}
/* interchange rows r1 and r2 */
void interchange(size_t r1, size_t r2, size_t mx_sz, double mx[mx_sz][mx_sz])
{
double temp;
for (size_t j = 0; j < mx_sz; j++) {
temp = mx[r1][j];
mx[r1][j] = mx[r2][j];
mx[r2][j] = temp;
}
}
/* add factor * row r1 to row r2 to replace row r2 */
void reduce(double factor, size_t r1, size_t r2,
size_t mx_sz, double mx[mx_sz][mx_sz])
{
for (size_t j = 0; j < mx_sz; j++) {
mx[r2][j] += (factor * mx[r1][j]);
}
}
/* returns pivot column, or mx_sz if there is no pivot */
size_t get_pivot(size_t row, size_t mx_sz, double mx[mx_sz][mx_sz])
{
size_t j = 0;
while (j < mx_sz && fabs(mx[row][j]) < DELTA) {
++j;
}
return j;
}
double find_det(size_t mx_sz, double mx[mx_sz][mx_sz])
{
size_t pivot1, pivot2;
size_t row;
double factor;
bool finished = false;
double result = 1.0;
while (!finished) {
finished = true;
row = 1;
while (row < mx_sz) {
// determinant is zero if there is a zero row
if ((pivot1 = get_pivot(row - 1, mx_sz, mx)) == mx_sz ||
(pivot2 = get_pivot(row, mx_sz, mx)) == mx_sz) {
return 0.0;
}
if (pivot1 == pivot2) {
factor = -mx[row][pivot1] / mx[row - 1][pivot1];
reduce(factor, row - 1, row, mx_sz, mx);
finished = false;
} else if (pivot2 < pivot1) {
interchange(row - 1, row, mx_sz, mx);
result = -result;
finished = false;
}
++row;
}
}
for (size_t j = 0; j < mx_sz; j++) {
result *= mx[j][j];
}
return result;
}
Sample session:
Enter size of matrix: oops
Enter size of matrix: 0
Enter size of matrix: -1
Enter size of matrix: 3
Enter matrix elements:
0 1 3
1 2 0
0 3 4
You entered:
0.00 1.00 3.00
1.00 2.00 0.00
0.00 3.00 4.00
1.00 2.00 0.00
-0.00 -3.00 -9.00
0.00 0.00 -5.00
Determinant: 5.000000
I used an R code which implements a permutation test for the distributional comparison between two populations of functions. We have p univariate p-values.
The bottleneck is the construction of a matrix which contains all the possible CONTIGUOS p-values.
The last row of the matrix of p-values contain all the univariate p-values.
The penultimate row contains all the bivariate p-values in this order:
p_val_c(1,2), p_val_c(2,3), ..., p_val_c(p, 1)
...
The elements of the first row are coincident and the value associated is the p-value of the global test p_val_c(1,...,p)=p_val_c(2,...,p,1)=...=pval(p,1,...,p-1).
For computational reasons, I have decided to implement this component in c and use it in R with .C.
Here the code. The unique important part is the definition of the function Build_pval_asymm_matrix.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
void Build_pval_asymm_matrix(int * p, int * B, double * pval,
double * L,
double * pval_asymm_matrix);
// Function used for the sorting of vector T_temp with qsort
int cmp(const void *x, const void *y);
int main() {
int B = 1000; // number Conditional Monte Carlo (CMC) runs
int p = 100; // number univariate tests
// Generate fictitiously data univariate p-values pval and matrix L.
// The j-th column of L is the empirical survival
// function of the statistics test associated to the j-th coefficient
// of the basis expansion. The dimension of L is B * p.
// Generate pval
double pval[p];
memset(pval, 0, sizeof(pval)); // initialize all elements to 0
for (int i = 0; i < p; i++) {
pval[i] = (double)rand() / (double)RAND_MAX;
}
// Construct L
double L[B * p];
// Inizialize to 0 the elements of L
memset(L, 0, sizeof(L));
// Array used to construct the columns of L
double temp_array[B];
memset(temp_array, 0, sizeof(temp_array));
for(int i = 0; i < B; i++) {
temp_array[i] = (double) (i + 1) / (double) B;
}
for (int iter_coeff=0; iter_coeff < p; iter_coeff++) {
// Shuffle temp_array
if (B > 1) {
for (int k = 0; k < B - 1; k++)
{
int j = rand() % B;
double t = temp_array[j];
temp_array[j] = temp_array[k];
temp_array[k] = t;
}
}
for (int i=0; i<B; i++) {
L[iter_coeff + p * i] = temp_array[i];
}
}
double pval_asymm_matrix[p * p];
memset(pval_asymm_matrix, 0, sizeof(pval_asymm_matrix));
// Construct the asymmetric matrix of p-values
clock_t start, end;
double cpu_time_used;
start = clock();
Build_pval_asymm_matrix(&p, &B, pval, L, pval_asymm_matrix);
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("TOTAL CPU time used: %f\n", cpu_time_used);
return 0;
}
void Build_pval_asymm_matrix(int * p, int * B, double * pval,
double * L,
double * pval_asymm_matrix) {
int nbasis = *p, iter_CMC = *B;
// Scalar output fisher combining function applied on univariate
// p-values
double T0_temp = 0;
// Vector output fisher combining function applied on a set of
//columns of L
double T_temp[iter_CMC];
memset(T_temp, 0, sizeof(T_temp));
// Counter for elements of T_temp greater than or equal to T0_temp
int count = 0;
// Indexes for columns of L
int inf = 0, sup = 0;
// The last row of matrice_pval_asymm contains the univariate p-values
for(int i = 0; i < nbasis; i++) {
pval_asymm_matrix[i + nbasis * (nbasis - 1)] = pval[i];
}
// Construct the rows from bottom to up
for (int row = nbasis - 2; row >= 0; row--) {
for (int col = 0; col <= row; col++) {
T0_temp = 0;
memset(T_temp, 0, sizeof(T_temp));
inf = col;
sup = (nbasis - row) + col - 1;
// Combining function Fisher applied on
// p-values pval[inf:sup]
for (int k = inf; k <= sup; k++) {
T0_temp += log(pval[k]);
}
T0_temp *= -2;
// Combining function Fisher applied
// on columns inf:sup of matrix L
for (int k = 0; k < iter_CMC; k++) {
for (int l = inf; l <= sup; l++) {
T_temp[k] += log(L[l + nbasis * k]);
}
T_temp[k] *= -2;
}
// Sort the vector T_temp
qsort(T_temp, iter_CMC, sizeof(double), cmp);
// Count the number of elements of T_temp less than T0_temp
int h = 0;
while (h < iter_CMC && T_temp[h] < T0_temp) {
h++;
}
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
}
// auxiliary variable for columns of L inf:nbasis-1 and 1:sup
int aux_first = 0, aux_second = 0;
int num_col_needed = 0;
for (int col = row + 1; col < nbasis; col++) {
T0_temp = 0;
memset(T_temp, 0, sizeof(T_temp));
inf = col;
sup = ((nbasis - row) + col) % nbasis - 1;
// Useful indexes
num_col_needed = nbasis - inf + sup + 1;
int index_needed[num_col_needed];
memset(index_needed, -1, num_col_needed * sizeof(int));
aux_first = inf;
for (int i = 0; i < nbasis - inf; i++) {
index_needed[i] = aux_first;
aux_first++;
}
aux_second = 0;
for (int j = 0; j < sup + 1; j++) {
index_needed[j + nbasis - inf] = aux_second;
aux_second++;
}
// Combining function Fisher applied on p-values
// pval[inf:p-1] and pval[0:sup-1]1]
for (int k = 0; k < num_col_needed; k++) {
T0_temp += log(pval[index_needed[k]]);
}
T0_temp *= -2;
// Combining function Fisher applied on columns inf:p-1 and 0:sup-1
// of matrix L
for (int k = 0; k < iter_CMC; k++) {
for (int l = 0; l < num_col_needed; l++) {
T_temp[k] += log(L[index_needed[l] + nbasis * k]);
}
T_temp[k] *= -2;
}
// Sort the vector T_temp
qsort(T_temp, iter_CMC, sizeof(double), cmp);
// Count the number of elements of T_temp less than T0_temp
int h = 0;
while (h < iter_CMC && T_temp[h] < T0_temp) {
h++;
}
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
} // end for over col from row + 1 to nbasis - 1
} // end for over rows of asymm p-values matrix except the last row
}
int cmp(const void *x, const void *y)
{
double xx = *(double*)x, yy = *(double*)y;
if (xx < yy) return -1;
if (xx > yy) return 1;
return 0;
}
Here the times of execution in seconds measured in R:
time_original_function
user system elapsed
79.726 1.980 112.817
time_function_double_for
user system elapsed
79.013 1.666 89.411
time_c_function
user system elapsed
47.920 0.024 56.096
The first measure was obtained using an equivalent R function with duplication of the vector pval and matrix L.
What I wanted to ask is some suggestions in order to decrease the execution time with the C function for simulation purposes. The last time I used c was five years ago and consequently there is room for improvement. For instance I sort the vector T_temp with qsort in order to compute in linear time with a while the number of elements of T_temp greater than or equal to T0_temp. Maybe this task could be done in a more efficient way. Thanks in advance!!
I reduced the input size to p to 50 to avoid waiting on it (don't have such a fast machine) -- keeping p as is and reducing B to 100 has a similar effect, but profiling it showed that ~7.5 out of the ~8 seconds used to compute this was spent in the log function.
qsort doesn't even show up as a real hotspot. This test seems to headbutt the machine more in terms of micro-efficiency than anything else.
So unless your compiler has a vastly faster implementation of log than I do, my first suggestion is to find a fast log implementation if you can afford some accuracy loss (there are ones out there that can compute log over an order of magnitude faster with precision loss in the range of ~3% or so).
If you cannot have precision loss and accuracy is critical, then I'd suggest trying to memoize the values you use for log if you can and store them into a lookup table.
Update
I tried the latter approach.
// Create a memoized table of log values.
double log_cache[B * p];
for (int j=0, num=B*p; j < num; ++j)
log_cache[j] = log(L[j]);
Using malloc might be better here, as we're pushing rather large data to the stack and could risk overflows.
Then pass her into Build_pval_asymm_matrix.
Replace these:
T_temp[k] += log(L[l + nbasis * k]);
...
T_temp[k] += log(L[index_needed[l] + nbasis * k]);
With these:
T_temp[k] += log_cache[l + nbasis * k];
...
T_temp[k] += log_cache[index_needed[l] + nbasis * k];
This improved the times for me from ~8 seconds to ~5.3 seconds, but we've exchanged the computational overhead of log for memory overhead which isn't that much better (in fact, it rarely is but calling log for double-precision floats is apparently quite expensive, enough to make this exchange worthwhile). The next iteration, if you want more speed, and it is very possible, involves looking into cache efficiency.
For this kind of huge matrix stuff, focusing on memory layouts and access patterns can work wonders.
I'm trying to find the max of randomly generated numbers. Any thoughts on this...
I am using MPI_Scatter to split the randomly generated numbers into equal processes. I am using MPI_Reduce to get the MAX from each process.
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <mpi.h>
#define atmost 1000
int find(int* partial_max, int from, int to){
int i, max;
printf("%d----%d\n", from, to);
max = partial_max[from];
for (i = from + 1; i <= to; i++)
if (partial_max[i] > max)
max = partial_max[i];
return max;
}
int main(){
int i, j,n, comm_sz, biggest, b, my_rank, q,result;
//1. Declare array of size 1000
int a[atmost];
//2. generate random integer of 0 to 999
srand((unsigned)time(NULL));
n = rand() % atmost;
//n = 10;
for (i = 0; i <= n; i++){
a[i] = rand() % atmost;
printf("My Numbers: %d\n", a[i]);
//a[i] = i;
}
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
//j is the size we will split each segment into
j = (n / (comm_sz-1));
int partial_max[j];
int receive_vector[j];
//Send random numbers equally to each process
MPI_Scatter(a, j, MPI_INT, receive_vector,
j, MPI_INT, 0, MPI_COMM_WORLD);
int localmax;
localmax = -1;
for (i = 0; i <= comm_sz-1; i++)
if (receive_vector[i] > localmax)
localmax = receive_vector[i];
// Get Max from each process
//MPI_Reduce(receive_vector, partial_max, j, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&localmax, &result, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
if (my_rank == 0)
{
/*
biggest = -1;
for (i = 0; i < comm_sz - 1; i++){
if (i == comm_sz - 2)
b = find(partial_max, i * j, n - 1);
else
b = find(partial_max, i * j, (i + 1) * j - 1);
if (b > biggest)
biggest = b;
}*/
printf("-------------------\n");
printf("The biggest is: %d\n", result);
printf("The n is: %d\n", n);
}
MPI_Finalize();
return 0;
}
You have few bugs there:
You select (a different value of) n in each process. It is better to
select it within rank 0 and bcast to the rest of the processes.
When calculating j you divise by comm_sz-1 instead of comm_sz.
You assume n is divisible by comm_sz and that each process receives the exact same amount of numbers to process.
You loop with i going up to comm_sz-1 instead of going up to j
This is what I could find in a quick look..
I try to compute Fourier transform with the planer fftw_mpi_plan_dft_r2c_2d of FFTW 3.3. Unfortunately, I can not make it work. The result is correct if N0 is equal to the number of processors (nb_proc) but is wrong when N0 != nb_proc.
An example showing my problem:
#include <stdio.h>
#include <complex.h>
#include <fftw3-mpi.h>
int main(int argc, char **argv)
{
/* if N0 (=ny) is equal to nb_proc, result are OK */
/* if N0 is not equal to nb_proc => bug */
const ptrdiff_t N0 = 4, N1 = 4;
int coef_norm = N0*N1;
fftw_plan plan_forward;
double *carrayX;
fftw_complex *carrayK;
ptrdiff_t n_alloc_local, i, j;
ptrdiff_t nX0loc, iX0loc_start, nK0loc, nK1loc;
/* X and K denote physical and Fourier spaces. */
int rank, nb_proc, irank;
MPI_Init(&argc, &argv);
fftw_mpi_init();
/*DETERMINE RANK OF THIS PROCESSOR*/
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/*DETERMINE TOTAL NUMBER OF PROCESSORS*/
MPI_Comm_size(MPI_COMM_WORLD, &nb_proc);
if (rank==0) printf("program test_fftw3_2Dmpi_simple\n");
printf("I'm rank (processor number) %i of size %i\n", rank, nb_proc);
n_alloc_local = fftw_mpi_local_size_2d(N0, N1/2+1, MPI_COMM_WORLD,
&nX0loc, &iX0loc_start);
carrayX = fftw_alloc_real(2 * n_alloc_local);
carrayK = fftw_alloc_complex(n_alloc_local);
/* create plan for out-of-place r2c DFT */
plan_forward = fftw_mpi_plan_dft_r2c_2d(N0, N1,
carrayX, carrayK,
MPI_COMM_WORLD,
FFTW_MEASURE);
nK0loc = nX0loc;
nK1loc = N1/2+1;
/* initialize carrayX to a constant */
for (i = 0; i < nX0loc; ++i) for (j = 0; j < N1; ++j)
carrayX[i*N1 + j] = 1.;
/* compute forward transform and normalize */
fftw_execute(plan_forward);
for (i = 0; i < nK0loc; ++i) for (j = 0; j < nK1loc; ++j)
carrayK[i*nK1loc + j] = carrayK[i*nK1loc + j]/coef_norm;
/* print carrayK, there should be only one 1 in the first case for rank=0 */
for (irank = 0; irank<nb_proc; irank++)
{
MPI_Barrier(MPI_COMM_WORLD);
if (rank == irank)
{
for (i = 0; i < nK0loc; ++i) for (j = 0; j < nK1loc; ++j)
{
printf("rank = %i, carrayK[%ti*nK1loc + %ti] = (%6.4f, %6.4f)\n",
rank, i, j,
creal(carrayK[i*nK1loc + j]),
cimag(carrayK[i*nK1loc + j]));
}
printf("\n");
}
}
MPI_Barrier(MPI_COMM_WORLD);
fftw_destroy_plan(plan_forward);
MPI_Finalize();
}
There is something wrong in this example but I don't understand what.
For this case (N0 = 4, N1 = 4), the results are correct with
mpirun -np 4 ./test_fftw3_2Dmpi_simple
but not with
mpirun -np 2 ./test_fftw3_2Dmpi_simple
PS: same thing with the flag FFTW_MPI_TRANSPOSED_OUT.