I wrote a C code that I would like to parallelize using OpenMP (I am a beginner and I have just a few days to solve this task). Let me start from the main: first of all I initialize 6 vectors (Vx, Vy, Vz, thetap, phip, theta). Then there is a for loop that cycles over Nmax; inside this loop I allocate memory for the structure defined at the very top of the code. The structure is called coll_CPU and its size grows every cycle. Then I pick some of the values from the vectors mentioned before and place them into the structure, so at this point coll_CPU is filled with Ncoll elements. During this process I use some of the functions declared outside of main (these functions are random number generators).
Now comes the important part: in my serial code I use a for loop to pass every single element of the structure to a function called CollisioniCPU (this function just takes the inputs and multiplies them by 2). My goal is to parallelize this loop so that each of my CPUs gives its contribution to this operation and speeds up the process.
Here are the two source files:
main.c
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define pi2 6.283185307
#define pi 3.141592654
#define IMUL(a,b) __mul24(a,b)
typedef struct {
int seme;
} iniran;
typedef struct{
int jp1;
int jp2;
float kx;
float ky;
float kz;
float vAx;
float vAy;
float vAz;
float vBx;
float vBy;
float vBz;
float tetaAp;
float phiAp;
float tetaA;
float tetaBp;
float phiBp;
float tetaB;
float kAx;
float kAy;
float kAz;
float kBx;
float kBy;
float kBz;
int caso;
} stato_struct;
stato_struct *coll_CPU=0;
unsigned int timer;
#include "DSMC_kernel_float.c"
//=============================================================
float min(float *a, float*b){
if(*a<*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float max(float *a, float*b){
if(*a>*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float rf(int *idum){
static int iff=0;
static int inext, inextp, ma[55];
int mj, mk;
int i, k, ii;
float ret_val;
if (*idum<0 || iff==0) {
iff=1;
mj=161803398 - abs(*idum);
mj %= 1000000000;
ma[54]=mj;
mk=1;
for (i=1; i<=54; ++i){
ii=(i*21)%55;
ma[ii-1]=mk;
mk=mj-mk;
if (mk<0) {
mk += 1000000000;
}
mj= ma[ii-1];
}
for(k=1; k<=4; ++k) {
for(i=1; i<=55; ++i){
ma[i-1] -= ma[(i+30)%55];
if (ma[i-1]<0){
ma[i-1] += 1000000000;
}
}
}
inext=0;
inextp=31;
*idum=1;
}
++inext;
if (inext==56){
inext=1;
}
++inextp;
if (inextp==56){
inextp=1;
}
mj=ma[inext-1]-ma[inextp-1];
if (mj<0){
mj += 1000000000;
}
ma[inext-1]=mj;
ret_val=mj*1.0000000000000001e-9;
return ret_val;
}
//============================================================
int genk(float *kx, float *ky, float *kz, int *p2seme){
// float sqrtf(float), sinf(float), cosf(float);
extern float rf(int *);
static float phi;
*kx=rf(p2seme) * 2. -1.f;
*ky= sqrtf(1. - *kx * *kx);
phi=pi2*rf(p2seme);
*kz=*ky * sinf(phi);
*ky *= cosf(phi);
return 0;
}
//==============================================================
int main(void){
float msec_kernel;
int Np=10000, Nmax=512;
int id,jp,jcoll,Ncoll,jp1, jp2, ind;
int Ncoll_eff=0, Ncoll_div=0, Ncoll_dim=0; /* collision counters used after the kernel loop */
float Vx[Np],Vy[Np],Vz[Np],teta[Np],tetap[Np],phip[Np];
float kx, ky, kz, Vrx, Vry, Vrz, scalprod, fk;
float kAx, kAy, kAz, kBx, kBy, kBz;
iniran iniran1; /* instance of the seed structure defined above */
iniran1.seme=7593;
for(jp=1;jp<=Np;jp++){
if(jp<=Np/2){
Vx[jp-1]=2.5;
Vy[jp-1]=0;
Vz[jp-1]=0;
tetap[jp-1]=0;
phip[jp-1]=0;
teta[jp-1]=0;
}
}
for (Ncoll=1;Ncoll<=Nmax;Ncoll += 10){
coll_CPU=(stato_struct*) malloc(Ncoll*sizeof(stato_struct));
jcoll=0;
while (jcoll<Ncoll){
jp1=1+floorf(Np*rf(&iniran1.seme));
jp2=1+floorf(Np*rf(&iniran1.seme));
genk(&kx,&ky,&kz,&iniran1.seme);
Vrx=Vx[jp2-1]-Vx[jp1-1];
Vry=Vy[jp2-1]-Vy[jp1-1];
Vrz=Vz[jp2-1]-Vz[jp1-1];
scalprod=Vrx*kx+Vry*ky+Vrz*kz;
if (scalprod<0) {
genk(&kAx,&kAy,&kAz,&iniran1.seme);
genk(&kBx,&kBy,&kBz,&iniran1.seme);
coll_CPU[jcoll].jp1= jp1;
coll_CPU[jcoll].jp2=jp2;
coll_CPU[jcoll].kx=kx;
coll_CPU[jcoll].ky=ky;
coll_CPU[jcoll].kz=kz;
coll_CPU[jcoll].vAx=Vx[jp1-1];
coll_CPU[jcoll].vAy=Vy[jp1-1];
coll_CPU[jcoll].vAz=Vz[jp1-1];
coll_CPU[jcoll].vBx=Vx[jp2-1];
coll_CPU[jcoll].vBy=Vy[jp2-1];
coll_CPU[jcoll].vBz=Vz[jp2-1];
coll_CPU[jcoll].tetaAp=tetap[jp1-1];
coll_CPU[jcoll].phiAp=phip[jp1-1];
coll_CPU[jcoll].tetaA=teta[jp1-1];
coll_CPU[jcoll].tetaBp=tetap[jp2-1];
coll_CPU[jcoll].phiBp=phip[jp2-1];
coll_CPU[jcoll].tetaB=teta[jp2-1];
coll_CPU[jcoll].kAx=kAx;
coll_CPU[jcoll].kAy=kAy;
coll_CPU[jcoll].kAz=kAz;
coll_CPU[jcoll].kBx=kBx;
coll_CPU[jcoll].kBy=kBy;
coll_CPU[jcoll].kBz=kBz;
coll_CPU[jcoll].caso=1;
jcoll++;
}
}
clock_t t;
t = clock();
#pragma omp parallel for private(id) //HERE IS WHERE I TRIED TO DO THE PARALLELIZATION BUT WITH NO SUCCESS. WHAT DO I HAVE TO TYPE INSTEAD???
for(id=0;id<Nmax;id++){
CollisioniCPU(coll_CPU,id);
}
t = clock() - t;
msec_kernel = ((float)t*1000)/CLOCKS_PER_SEC;
printf("Tempo esecuzione kernel:%e s\n",msec_kernel*1e-03);
for (ind=0;ind<Ncoll;ind++){
if (coll_CPU[ind].caso==4)
Ncoll_eff++;
else if (coll_CPU[ind].caso==0)
Ncoll_div++;
else
Ncoll_dim++;
}
free(coll_CPU);
}
return 0;
}
DSMC_kernel_float.c
void CollisioniCPU(stato_struct *coll_CPU, int id){
float vettA[6], vettB[6];
vettA[0]=coll_CPU[id].vAx;
vettA[1]=coll_CPU[id].vAy;
vettA[2]=coll_CPU[id].vAz;
vettA[3]=coll_CPU[id].tetaAp;
vettA[4]=coll_CPU[id].phiAp;
vettA[5]=coll_CPU[id].tetaA;
vettB[0]=coll_CPU[id].vBx;
vettB[1]=coll_CPU[id].vBy;
vettB[2]=coll_CPU[id].vBz;
vettB[3]=coll_CPU[id].tetaBp;
vettB[4]=coll_CPU[id].phiBp;
vettB[5]=coll_CPU[id].tetaB;
coll_CPU[id].vAx=2*vettA[0];
coll_CPU[id].vAy=2*vettA[1];
coll_CPU[id].vAz=2*vettA[2];
coll_CPU[id].tetaAp=2*vettA[3];
coll_CPU[id].phiAp=2*vettA[4];
coll_CPU[id].tetaA=2*vettA[5];
coll_CPU[id].vBx=2*vettB[0];
coll_CPU[id].vBy=2*vettB[1];
coll_CPU[id].vBz=2*vettB[2];
coll_CPU[id].tetaBp=2*vettB[3];
coll_CPU[id].phiBp=2*vettB[4];
coll_CPU[id].tetaB=2*vettB[5];
}
In order to compile the program I type this line in the terminal: gcc -fopenmp time_analysis.c -o time_analysis -lm, followed by export OMP_NUM_THREADS=1. However, once I run the executable I get this error message:
Error in `./time_analysis': double free or corruption (!prev): 0x00000000009602c0 ***
Aborted
What does this error mean? What have I done wrong in the main function when I tried to parallelize the for loop? And most important: what should I type instead in order to make my code run in parallel? Please help me out if you can, because I seriously have no time to study OpenMP from scratch and I need to get this job done right away.
Changing the inner loop as follows should bring you one step further. Your parallel loop runs id from 0 to Nmax-1, but coll_CPU only holds Ncoll elements, so every iteration beyond Ncoll-1 writes past the end of the allocation. Those out-of-bounds writes corrupt the heap metadata, which is exactly what glibc's "double free or corruption" abort is reporting. Iterate over Ncoll instead:
#pragma omp parallel for private(id)
for(id=0;id<Ncoll;id++){
CollisioniCPU(coll_CPU,id);
}
Your OpenMP line seems okay, but I doubt that it will lead to significant improvements in runtime. You should optimize the surrounding code as well. Allocating the memory once outside of your loops would be a good start.
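For instance, here is a minimal sketch of that idea, assuming Nmax is the largest batch you will ever need, so that a single buffer of Nmax elements can serve every iteration:
/* Sketch: allocate once before the Ncoll loop, free once after it. */
coll_CPU = (stato_struct*) malloc(Nmax * sizeof(stato_struct));
if (coll_CPU == NULL) {
    printf("allocation failed\n");
    return 1;
}
for (Ncoll = 1; Ncoll <= Nmax; Ncoll += 10) {
    /* fill coll_CPU[0..Ncoll-1] and run the parallel loop as before */
}
free(coll_CPU); /* one free, matching the one malloc */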
By the way, is there any reason for this verbose coding style instead of a more compact and readable version like this one?
void CollisioniCPU(stato_struct *coll_CPU, int id) {
stato_struct *ptr = coll_CPU + id;
ptr->vAx *= 2;
ptr->vAy *= 2;
ptr->vAz *= 2;
ptr->tetaAp *= 2;
ptr->phiAp *= 2;
ptr->tetaA *= 2;
ptr->vBx *= 2;
ptr->vBy *= 2;
ptr->vBz *= 2;
ptr->tetaBp *= 2;
ptr->phiBp *= 2;
ptr->tetaB *= 2;
}
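A side note on the timing: clock() measures CPU time accumulated by all threads of the process, so once the loop actually runs in parallel it can report roughly the wall time multiplied by the number of threads. omp_get_wtime() returns wall-clock time and is the usual choice here, along these lines:
double t0 = omp_get_wtime();
#pragma omp parallel for
for (id = 0; id < Ncoll; id++) {
    CollisioniCPU(coll_CPU, id);
}
double t1 = omp_get_wtime();
printf("Kernel execution time: %e s\n", t1 - t0);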
When trying to plot the solution of the heat PDE I've run into some trouble. The solution that I've found is:
Here is the code:
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define N 10000
double f(double x);
double X[N];
double Y[N];
int main(){
int i;
double b=43351/94400;
double dx=0.0001;
X[0]=0;
Y[0]=b;
for (i=1; i<N; i++){
X[i]=X[i-1]+dx;
Y[i]=f(X[i]);
}
FILE* output;
output = fopen("dades.txt", "w");
fprintf(output, "x PosiciĆ³ Temperatura\n");
for (i = 0; i < N; i++){
fprintf(output, "%lf %lf %lf\n", i*dx, X[i], Y[i]);
}
fclose(output);
return 0;
}
double f(double x){
int n;
double b=43351/94400;
for (n=1; n<N; n+=2){
double pi=3.14159265358979323846;
double t0=0.025;
double result=b;
result+=2*(1-pow((-1),n))/(pi*n)*(1-exp(-pow(n,2)*pow(pi,2)*pow(t0,2)))/(pow(n,2)*pow(pi,2))*sin(n*pi*x);
}
return result;
}
What I'm trying to do is to declare a function that calculates the infinite sum for odd n, and then loop over it for every x between 0 and 1. The problem is that I don't know how to declare "result" so that it accumulates the sum of all the terms, because if I declare it outside the for loop it doesn't satisfy the boundary conditions.
(Note that I fixed t=0.025).
According to the equation, you can implement f as:
#define M_PI 3.14159265358979323846 /* no trailing semicolon, which would break every expression using M_PI */
double f(double x)
{
int n;
double result=43351.0/94400.0;
double t0=0.025;
for (n=1; n<N; n+=2){
result+=2*(1-pow((-1),n))/(M_PI*n)*(1-exp(-pow(n,2)*pow(M_PI,2)*pow(t0,2)))/(pow(n,2)*pow(M_PI,2))*sin(n*M_PI*x);
}
return result;
}
Since you are using double, you have to explicitly write the .0; otherwise 43351/94400 is evaluated as integer division and truncates to 0.
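A quick illustration of the difference:
double b1 = 43351 / 94400;     /* both operands are int: integer division yields 0 */
double b2 = 43351.0 / 94400.0; /* floating-point division: about 0.459 */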
The variable declarations are moved outside the loop both to clarify the code and to ensure that result gets updated across iterations instead of being re-initialized on every pass.
EDIT:
You could improve the function f to take the value of t as an input, which also aligns with the equation provided. It would then be implemented this way:
double f(double x, double t)
{
int n;
double result=43351.0/94400.0;
for (n=1; n<N; n+=2){
result+=2*(1-pow((-1),n))/(M_PI*n)*(1-exp(-pow(n,2)*pow(M_PI,2)*pow(t,2)))/(pow(n,2)*pow(M_PI,2))*sin(n*M_PI*x);
}
return result;
}
EDIT:
The implementation of the math could be further simplified:
a^2 b^2 is the same as (ab)^2.
(-1)^n with n odd is always -1.
2*(1-pow((-1),n)) can therefore be replaced by 4.
Plus, from a performance perspective, you can avoid recomputing repeated terms by storing them in a variable and reusing it as needed (for instance n^2 pi^2).
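Putting these simplifications together, f could be sketched as follows (same constants as above, with n^2 pi^2 computed once per term):
double f(double x, double t)
{
    int n;
    double result = 43351.0/94400.0;
    for (n = 1; n < N; n += 2) {
        double npi2 = n * n * M_PI * M_PI; /* n^2 * pi^2, reused twice */
        /* for odd n, 2*(1-(-1)^n) == 4 */
        result += 4.0/(M_PI*n) * (1.0 - exp(-npi2*t*t))/npi2 * sin(n*M_PI*x);
    }
    return result;
}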
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define pow(x) ((x) * (x))
#define NUM_THREADS 8
#define wmax 1000
#define Nv 2
#define N 5
int b=0;
float Points[N][Nv]={ {0,1}, {3,4}, {1,2}, {5,1} ,{8,9}};
float length[wmax+1]={0};
float EuclDist(float* Ne, float* Pe) {
int i;
float s = 0;
for (i = 0; i < Nv; i++) {
s += pow(Ne[i] - Pe[i]);
}
return s;
}
void DistanceFinder(float* a[]){
int i;
#pragma omp simd
for (i=1;i<N+1;i++){
length[b] += EuclDist(&a[i],&a[i-1]);
}
//printf(" %f\n", length[b]);
}
void NewRoute(){
//some irrelevant things
DistanceFinder(Points);
}
int main(){
omp_set_num_threads(NUM_THREADS);
do{
b+=1;
NewRoute();
} while (b<wmax);
}
Trying to parallelize this loop, I tried different things, and this one seems to be the fastest. However, is it correct to use SIMD like that? Because I'm using a previous iteration (i and i - 1). The results I see, though, are correct, weirdly enough.
Seems to be the fastest; however, is it correct to use SIMD like that?
First, there is a race condition that needs to be fixed, namely the updates of the array length[b]. Moreover, you are accessing memory outside the array a (iterating from 1 to N + 1), and you are passing &a[i] where a[i] is expected. You can fix the race condition by using the OpenMP reduction clause:
void DistanceFinder(float* a[]){
int i;
float sum = 0;
float tmp;
#pragma omp simd private(tmp) reduction(+:sum)
for (i=1;i<N;i++){
tmp = EuclDist(a[i], a[i-1]);
sum += tmp;
}
length[b] += sum;
}
Furthermore, you need to provide a version of EuclDist as follows:
#pragma omp declare simd uniform(Ne, Pe)
float EuclDist(float* Ne, float* Pe) {
int i;
float s = 0;
for (i = 0; i < Nv; i++)
s += pow(Ne[i] - Pe[i]);
return s;
}
Because I'm using a previous iteration (i and i - 1).
In your case, it is okay, since the array a is just being read.
The results I see, though, are correct, weirdly enough.
Very likely there was no vectorization taking place. Regardless, it would still be undefined behavior due to the aforementioned race condition.
You can simplify your code so that it increases the likelihood of the vectorization actually happening, for instance:
void DistanceFinder(float* a[]){
int i;
float sum = 0;
float tmp;
#pragma omp simd private(tmp) reduction(+:sum)
for (i=1;i<N;i++){
tmp = pow(a[i][0] - a[i-1][0]) + pow(a[i][1] - a[i-1][1]);
sum += tmp;
}
length[b] += sum;
}
A further change that you can make to improve the performance of your code is to allocate the matrix (that is passed as a parameter of the function DistanceFinder) in a manner such that when you iterate over its rows (i.e., a[i]) you iterate over contiguous memory addresses.
For instance, you could pass two arrays a1 and a2 to represent the first and second columns of the matrix a:
void DistanceFinder(float a1[], float a2[]){
int i;
float sum = 0;
float tmp;
#pragma omp simd private(tmp) reduction(+:sum)
for (i=1;i<N;i++){
tmp = pow(a1[i] - a1[i-1]) + pow(a2[i] - a2[i-1]);
sum += tmp;
}
length[b] += sum;
}
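For reference, a minimal sketch of how the caller could adopt this layout; the Points_x/Points_y names are hypothetical stand-ins for the two columns of Points:
/* Hypothetical column-wise storage of the same five points. */
float Points_x[N] = {0, 3, 1, 5, 8}; /* first coordinate of each point */
float Points_y[N] = {1, 4, 2, 1, 9}; /* second coordinate of each point */

void NewRoute(){
    DistanceFinder(Points_x, Points_y);
}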
I have a project split across two files, but I cannot get the main program to print out the average variable; I just get 0.0 no matter what I change. It also never prints the output of another function. Any tips?
Main File:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
float average(void);
float std_dev(float);
float output(float);
float array[10] = {4.8, 12.98, 82.1, 5.98, 19.75, 24.9, 75.7, 3.45, 10.0, 28.11};
extern float avg;
int main()
{
float s = 0.0;
printf("The average value of the array is %.2f \n", avg);
s = std_dev(avg);
printf("The standard deviation of the array is %.2f \n", s);
return 0;
}
static void output(float var)
{
printf("The value of the variable is %.2f \n", var);
}
Second File:
#include <math.h>
extern float array[];
float avg = 26.78;
static float average()
{
int n;
float sum = 0.0, mean=0.0;
for(n=0; n<10; n++)
sum = sum + array[n];
mean= sum/10;
output(mean);
return mean;
}
float std_dev()
{
int n;
float cumm_diff = 0.0;
for(n=0; n<10; n++)
cumm_diff += (avg -array[n]) * (avg -array[n]);
return sqrt(cumm_diff/10);
}
the following proposed code:
cleanly compiles
performs the desired functionality
demonstrates how to execute functions in external files
properly prototypes the functions
removes extraneous code
properly formats the code for ease of readability and understanding
eliminates the 'magic' numbers
lets the compiler determine how many items are in the array
uses float literals rather than double literals
documents why each of the system header files is included
uses meaningful parameter and variable names
derives the value of count at compile time
casts the int count parameter, when needed, to a float
and now the proposed code:
header file: main.h
#ifndef MAIN_H
#define MAIN_H
void output(float var);
extern float avg;
#endif // MAIN_H
header file: util.h
#ifndef UTIL_H
#define UTIL_H
float calc_std_dev( float *, int );
void calc_mean( float *, int );
#endif // UTIL_H
main.c file
#include <stdio.h> // printf()
#include "main.h"
#include "util.h"
float array[] =
{
4.8f, 12.98f, 82.1f, 5.98f, 19.75f,
24.9f, 75.7f, 3.45f, 10.0f, 28.11f
};
int main( void )
{
printf("The average value of the array is %.2f \n", avg);
float s = calc_std_dev( array, sizeof(array)/sizeof(float) );
printf("The standard deviation of the array is %.2f \n", s);
calc_mean( array, sizeof(array)/sizeof(float) );
return 0;
}
void output(float var)
{
printf("The mean value of the array is: %.2f \n", var);
}
util.c file
#include <math.h> //sqrtf()
#include "main.h"
#include "util.h"
float avg = 26.78f;
void calc_mean( float array[], int count )
{
int n;
float sum = 0.0f;
float mean = 0.0f;
for(n=0; n<count; n++)
sum = sum + array[n];
mean= sum/ (float)count;
output(mean);
}
float calc_std_dev( float array[], int count )
{
int n;
float cumm_diff = 0.0f;
for(n=0; n<count; n++)
cumm_diff += (avg -array[n]) * (avg -array[n]);
return sqrtf(cumm_diff / (float)count);
}
You never call your function average(), which is why the average is never computed; instead, the initialized value avg = 26.78 (not 0; you probably changed that at some point) is printed and used by std_dev().
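As a minimal sketch of that fix, assuming you also remove the static qualifier so average() has external linkage and is callable from the main file:
int main()
{
    float s = 0.0;
    avg = average(); /* actually compute the mean before using it */
    printf("The average value of the array is %.2f \n", avg);
    s = std_dev(avg);
    printf("The standard deviation of the array is %.2f \n", s);
    return 0;
}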
I want to convert an OpenMP program to CUDA C. I have tried to find my way on the web and in the SDK, but the material is beyond my level. My C program loops over n=2^30 indices and adds up the weight of each index.
1) What is the correct grid_size and block_size?
My guess is to replicate OpenMP and do:
grid_size=n/max_number_of_cuda_threads;
block_size=1;
2) How can I implement the OpenMP reduction in CUDA? I tried a cudaMemcpy followed by reducing the array in standard C, but it seems slow. I looked at the Thrust library and its reduce operator, but I don't see how to integrate it with my current code.
program.c
#include <math.h>
#include <omp.h>
float get_weigth_of_index(long index,float* data){
int i;
float v=0;
for(i=0;i<4;i++)
v+=index*data[i];
return v;
}
int main(){
long i;
float r=0;
long n=pow(2,30);
float data[4]={0,1,2,3};
#pragma omp parallel for reduction (+:r)
for(i=0;i<n;i++)
r+=get_weigth_of_index(i,data);
return 0;
}
program.cu
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <math.h>
__device__ float get_weigth_of_index(long index,float* data){
int i;
float v=0;
for(i=0;i<4;i++)
v+=index*data[i];
return v;
}
__global__ void looper(long max_number_of_cuda_threads, float* data,float* result){
long bid=blockIdx.x;
long start=bid*max_number_of_cuda_threads;
long end=start+max_number_of_cuda_threads;
long i;
float r=0;
for(i=start;i<end;i++)
r+=get_weigth_of_index(i,data);
result[bid]=r;
}
int main(){
long n=pow(2,30);
int max_number_of_cuda_threads=1024; //I'm not sure it's correct
long grid_size=n/max_number_of_cuda_threads;
long block_size=1;
float data_host[4]={0,1,2,3};
float* data_device=0;
float* result_device=0;
cudaMalloc((void**)&data_device, sizeof(int)*4);
cudaMemcpy(data_device, data_host, sizeof(int)*4, cudaMemcpyHostToDevice);
cudaMalloc((void**)&result_device, sizeof(float)*grid_size);
looper<<<grid_size,block_size>>>(max_number_of_cuda_threads,data_device,result_device);
//reduction with standard c: cudaMemcpy seems slow
float* result_host=(float*)malloc(sizeof(float)*grid_size);
cudaMemcpy(result_host, result_device, sizeof(float)*grid_size, cudaMemcpyDeviceToHost);
long i;
float v=0;
#pragma omp parallel for reduction(+:v)
for(i=0;i<grid_size;i++)
v+=result_host[i];
printf("result:%f",v);
return 0;
}
My GPU card:
Device 0: "Tesla M2050"
Number of multiprocessors: 14
Number of cores: 448
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
I think that thrust::transform_reduce can solve your problem. This code shows how you can use it:
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cmath>
#include <iostream>
struct get_weigth_of_index
{
get_weigth_of_index(float* data, size_t n)
{
cudaMalloc((void**)&_data,n * sizeof(float));
cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
_n = n;
}
float* _data;
size_t _n;
__host__ __device__
float operator()(const int& index) const
{
float v=0;
for(size_t i=0; i<_n; i++)
v += index * _data[i];
return v;
}
};
int main(void)
{
float x[4] = {1.0, 2.0, 3.0, 4.0};
size_t len = 1024; // init your value
float *index; // init and fill your array here
// transfer to device
thrust::device_vector<float> d_index(index, index + len);
get_weigth_of_index unary_op(x, 4);
thrust::plus<float> binary_op;
float init = 0;
float sum = thrust::transform_reduce(d_index.begin(), d_index.end(), unary_op, init, binary_op);
std::cout << sum << std::endl;
return 0;
}
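As a side note, since the values being reduced here are just the indices 0..len-1, you could skip the host-side index array entirely and fill the device vector in place with thrust::sequence:
#include <thrust/sequence.h>
// ...
thrust::device_vector<float> d_index(len);
thrust::sequence(d_index.begin(), d_index.end()); // fills 0, 1, 2, ...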
The formula is pretty complicated. The numerator is num and the denominator is den; since the formula has a square root over the denominator, I have put den inside sqrt(), but sqrt() only accepts doubles.
#include<stdio.h>
#include<conio.h>
#include<math.h>
#define LEN 11
// for the following set of x and y find r by the formula ..
float sum(float arr[]);
void main(void)
{ int i;
float x[]={43.22,39.87,41.85,43.23,40.06,53.29,53.29,54.14,49.12,40.71,55.15};
float y[]={102.43,100.93,97.43,97.81,98.32,98.32,100.07,97.08,91.59,94.85,94.6};
float num,den[LEN],r[LEN],xy[LEN],x2[LEN],y2[LEN];
for(i=0;i<LEN;i++)
{
x2[i]=x[i]*x[i];
y2[i]=y[i]*y[i];
xy[i]=x[i]*y[i];
}
num=sum(xy)-sum(x)*sum(y);
for(i=0;i<LEN;i++)
{
den[i]=((LEN*sum(x2)-(sum(x))*(sum(x)))*(LEN*sum(y2))-(sum(y2))*(sum(y2)));
r[i]=num /sqrt(den); /*<----------the problem is here-----> */
}
printf("%f",r);
getch();
}
float sum(float arr[])
{
int i;
float total=0;
for(i=0;i<=LEN;i++)
{
total+=arr[i];
}
return total;
}
Out of sheer boredom I have fixed your code. It is still ugly and extremely inefficient, but it compiles and should work. I'll leave it to you or someone else to make it decent.
#include <stdio.h>
#include <math.h>
#define LEN 11
// for the following set of x and y find r by the formula ..
float sum(float arr[]);
int main(void)
{ int i;
float x[]={43.22,39.87,41.85,43.23,40.06,53.29,53.29,54.14,49.12,40.71,55.15};
float y[]={102.43,100.93,97.43,97.81,98.32,98.32,100.07,97.08,91.59,94.85,94.6};
float num,den,r[LEN],xy[LEN],x2[LEN],y2[LEN];
for(i=0;i<LEN;i++)
{
x2[i]=x[i]*x[i];
y2[i]=y[i]*y[i];
xy[i]=x[i]*y[i];
}
num=LEN*sum(xy)-sum(x)*sum(y);
den = (LEN*sum(x2)) - sum(x)*sum(x);
float alpha = sum(y)/LEN - (num/den)*sum(x)/LEN;
printf("beta = %f, alpha = %f\n", num/den, alpha);
for(i=0;i<LEN;i++)
{
float term = y[i] - alpha - (num/den)*x[i];
r[i] = (term*term);
printf("%f",r[i]);
}
}
float sum(float arr[])
{
int i;
float total=0;
for(i=0;i<LEN;i++) /* < LEN, not <=, to stay inside the array */
{
total+=arr[i];
}
return total;
}
To be consistent with the rest of the code, you should presumably be writing:
r[i] = num / sqrt(den[i]);
However, the calculation is not one I recognize. The body of the second loop is going to produce the same result for each value in den and therefore also in r, which is probably not what the question asked for.
You need to index the denominator as den[i]; in your code you have just passed the base address!
r[i]=num /sqrt(den[i]);
If this is what you want to achieve, which is quite unclear.
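For what it's worth, the formula in the question looks like the Pearson correlation coefficient; under that assumption, r is a single scalar for the whole data set rather than an array, and a minimal sketch using the question's sum() helper and the xy, x2, y2 arrays would be:
/* Sketch: Pearson correlation coefficient, assuming that is the intended formula. */
float num = LEN*sum(xy) - sum(x)*sum(y);
double den = ((double)LEN*sum(x2) - (double)sum(x)*sum(x))
           * ((double)LEN*sum(y2) - (double)sum(y)*sum(y));
float r = num / sqrt(den);
printf("r = %f\n", r); /* r lies in [-1, 1] */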