C MPI - Crash for no apparent reason - Program Received signal SIGSEGV - c

I am running a program in a linux cluster using MPICH2 1.0.6 (i can't actually update it to MPICH3, so i am stuck to that version) and for no apparent reason the program doesn't execute.
I compile it using mpicc -o prog prog.c -lm and execute with mpiexec
The program is an implementation of the hierarchical agglomerative clustering algorithm using the vector space model. The data collection is an n*m array (in the program DOC*MAXWORDS) which are divided into the nodes of the cluster like PARTS=DOC/procs so every node is responsible for PARTS*MAXWORDS data.
While debugging in a serial machine using gdb and ddd i get that the program has a segmentation error in a specific line of the code, where i can't find what is wrong with it. Take a look.
// Main merge loop: repeats until `iterations` reaches DOC-k. Each pass
// (1) finds the largest max_array value locally and then across all ranks,
// (2) has the owning rank(s) publish the two vectors to merge,
// (3) stores their size-weighted average as a new vector, and
// (4) recomputes the local similarity rows and nearest neighbours.
while(iterations != DOC-k){//bigLoop
iterations++;
x=y=-1;
pos1=pos2=node1=node2=-1;
// Local scan: pos1 = index of the largest max_array entry,
// pos2 = last index whose value equalled the running maximum.
for(i=0;i<PARTS;i++){//check for the local maximum
if(max_array[i]>x){
x=max_array[i];
pos1=i;
}
else if(max_array[i]==x){
pos2=i;
} //check after the loop for ties
}
// NOTE(review): pos2 is still -1 here when no tie was seen (and pos1 is -1
// when no entry exceeds -1), so max_array[pos2] can be an out-of-bounds read.
if(max_array[pos1]!=max_array[pos2]){
for(i=0;i<PARTS;i++){
// NOTE(review): the assignment below stores the constant 1, not the loop
// index i -- looks like a typo for `pos2=i`; confirm.
if(max_array[i]>max_array[pos2] && i!=pos1)
pos2=1;
}
}
// Every rank contributes its local maximum x; n_max receives one value per rank.
if(MPI_Allgather(&x,1,MPI_DOUBLE,
n_max,1,MPI_DOUBLE,MPI_COMM_WORLD) != MPI_SUCCESS) {
printf("Allgather high valuer - error");
exit(1);
}
// Global scan: node1 = rank holding the largest value,
// node2 = last rank whose value equalled the running maximum.
// NOTE(review): node2 stays -1 when no rank ties, and is then used as an
// index into n_max in the comparison further down.
for(i=0;i<procs;i++){
if(n_max[i]>y){
y=n_max[i];
node1=i;
}
else if(n_max[i]==y){
node2=i;
}
}
// Clear both merge buffers before the owning rank(s) fill them.
for(i=0;i<MAXWORDS;i++){
merger_one[i]=merger_two[i]=0;
}
// Case A: two ranks report the same maximum -- node1 supplies merger_one
// and node2 supplies merger_two.
if(n_max[node1]==n_max[node2]){
if(id==node1){
for(i=0;i<MAXWORDS;i++){
merger_one[i]=vector[node1*PARTS+pos1][i];
// NOTE(review): if texts_vectors[] holds a count rather than the index of
// the last vector, this reads one row past this rank's data -- confirm.
last_one[i]=vector[(node1*PARTS)+texts_vectors[node1]][i];
}
// Move the bookkeeping of the last slot into the slot being vacated,
// then shrink this rank's count by one.
size_one=size_of[pos1];
nn_array[pos1]=nn_array[texts_vectors[node1]];
max_array[pos1]=max_array[texts_vectors[node1]];
size_of[pos1]=size_of[texts_vectors[node1]];
texts_vectors[node1]--;
}
if(id==node2){
for(i=0;i<MAXWORDS;i++){
merger_two[i]=vector[node2*PARTS+pos2][i];
last_two[i]=vector[(node2*PARTS)+texts_vectors[node2]][i];
}
// Swap pos1 and pos2 on this rank (j is the temporary); the code below
// and the later broadcast of pos2 use the swapped values.
j=pos2;
pos2=pos1;
pos1=j;
size_two=size_of[pos2];
nn_array[pos2]=nn_array[texts_vectors[node2]];
max_array[pos2]=max_array[texts_vectors[node2]];
size_of[pos2]=size_of[texts_vectors[node2]];
texts_vectors[node2]--;
}
}
// Case B: no tie between ranks -- both vectors to merge live on node1.
else{
node2=node1;
if(id==node1){
for(i=0;i<MAXWORDS;i++){
merger_one[i]=vector[node1*PARTS+pos1][i];
merger_two[i]=vector[node2*PARTS+pos2][i];
last_one[i]=vector[(node1*PARTS)+texts_vectors[node1]][i];/*SIGSEV ERROR*/
last_two[i]=vector[(node2*PARTS)+texts_vectors[node2]-1][i];
}
size_one=size_of[pos1];
size_two=size_of[pos2];
nn_array[pos1]=nn_array[texts_vectors[node1]];
max_array[pos1]=max_array[texts_vectors[node1]];
size_of[pos1]=size_of[texts_vectors[node1]];
nn_array[pos2]=nn_array[texts_vectors[node2]-1];
max_array[pos2]=max_array[texts_vectors[node2]-1];
size_of[pos2]=size_of[texts_vectors[node2]-1];
// Two vectors leave this rank in this case.
texts_vectors[node1]=texts_vectors[node1]-2;
}
}
// Publish positions, cluster sizes and vector contents from the owning
// rank(s) (roots node1 / node2) to every rank.
MPI_Bcast(&pos1, 1, MPI_INT,node1, MPI_COMM_WORLD);
MPI_Bcast(&pos2, 1, MPI_INT,node2, MPI_COMM_WORLD);
MPI_Bcast(&size_one, 1, MPI_INT,node1, MPI_COMM_WORLD);
MPI_Bcast(&size_two, 1, MPI_INT,node2, MPI_COMM_WORLD);
MPI_Bcast(merger_one, MAXWORDS, MPI_INT,node1, MPI_COMM_WORLD);
MPI_Bcast(merger_two, MAXWORDS, MPI_INT,node2, MPI_COMM_WORLD);
MPI_Bcast(last_one, MAXWORDS, MPI_INT,node1, MPI_COMM_WORLD);
MPI_Bcast(last_two, MAXWORDS, MPI_INT,node2, MPI_COMM_WORLD);
// NOTE(review): &texts_vectors is the address of the pointer variable, not of
// this rank's count, so the bytes of the pointer are what gets sent. The send
// and receive buffers must also not overlap in MPI_Allgather; sending a
// temporary copy of texts_vectors[id] would avoid both problems -- confirm.
MPI_Allgather(&texts_vectors,1,MPI_INT,texts_vectors,1,MPI_INT,MPI_COMM_WORLD);
// Overwrite the two merged slots with the broadcast "last" vectors.
for(i=0;i<MAXWORDS;i++){
vector[node1*PARTS+pos1][i]=last_one[i];
vector[node2*PARTS+pos2][i]=last_two[i];
}
// Choose the rank (Pmanager) that receives the merged vector.
// NOTE(review): the test compares a count (texts_vectors[i]) with Pmanager,
// but Pmanager is then overwritten with a rank index, so later comparisons
// are count-vs-rank; Pmanager also stays PARTS+1 if no count is smaller.
// Confirm the intended "least loaded rank" logic.
Pmanager=PARTS+1;
for(i=0;i<procs;i++){
if(texts_vectors[i]<Pmanager)
Pmanager=i;
}
texts_vectors[Pmanager]++;
// Merged vector = size-weighted average of the two merged vectors.
for(i=0;i<MAXWORDS;i++){
x=merger_one[i]*size_one;
y=merger_two[i]*size_two;
vector[Pmanager*PARTS+texts_vectors[Pmanager]][i]=(x+y)/(size_one + size_two);
}
// Recompute similarities between this rank's vectors (i) and all vectors (m).
// NOTE(review): the upper bound (id+1)*texts_vectors[id] does not match the
// j*PARTS+texts_vectors[j] form used for m below; presumably
// id*PARTS+texts_vectors[id] was intended -- confirm.
for(i=id*PARTS; i< (id+1)*texts_vectors[id]; i++){
for(j=0;j<procs;j++){
for(m=j*PARTS;m<j*PARTS+texts_vectors[j];m++){
// x = dot product, y and z = squared norms of vector i and vector m.
x=0;y=0;z=0;
for(l=0; l < MAXWORDS; l++){
x+=vector[i][l]*vector[m][l];
y+=vector[i][l]*vector[i][l];
z+=vector[m][l]*vector[m][l];
}
// Cosine similarity; the i==m entry is left untouched and a zero norm
// yields similarity 0.
if(i!=m){
if(y!=0 && z!=0){
sim_matrix[i-(PARTS*id)][m] = x / (sqrt(y) * sqrt(z) );
}
else{
sim_matrix[i-(PARTS*id)][m] = 0.0;
}
}
}
}
}
// For every local row, record the column with the largest similarity above 0
// (nn_array) and that similarity (max_array).
for(i=0; i<texts_vectors[id]; i++){
x=0.0;
for(j=0;j<DOC;j++){
if(sim_matrix[i][j]>x){
nn_array[i]=j;
max_array[i]=x=sim_matrix[i][j];
}
}
}
}
Prior to this there is the creation of the arrays and the data input into the vector[i][j]
I created the arrays using malloc :
/* Document vectors: DOC rows of MAXWORDS ints each. */
int **vector = malloc(DOC * sizeof vector[0]);
for (i = 0; i < DOC; i++) {
    vector[i] = malloc(MAXWORDS * sizeof vector[0][0]);
}

/* Similarity rows: PARTS rows of DOC doubles each. */
double **sim_matrix = malloc(PARTS * sizeof sim_matrix[0]);
for (i = 0; i < PARTS; i++) {
    sim_matrix[i] = malloc(DOC * sizeof sim_matrix[0][0]);
}

int *list = malloc(WHOLE * sizeof *list);

/* Per-slot bookkeeping, PARTS entries each. */
int *nn_array = malloc(PARTS * sizeof *nn_array);
double *max_array = malloc(PARTS * sizeof *max_array);
int *size_of = malloc(PARTS * sizeof *size_of);

/* Per-rank data, procs entries each. */
double *n_max = malloc(procs * sizeof *n_max);
int *texts_vectors = malloc(procs * sizeof *texts_vectors);

/* Scratch vectors used for the merge step, MAXWORDS entries each. */
int *merger_one = malloc(MAXWORDS * sizeof *merger_one);
int *merger_two = malloc(MAXWORDS * sizeof *merger_two);
int *last_one = malloc(MAXWORDS * sizeof *last_one);
int *last_two = malloc(MAXWORDS * sizeof *last_two);
The line where the problem persists: last_one[i]=vector[(node1*PARTS)+texts_vectors[node1]][i];/*SIGSEV ERROR*/ is also executed in the first part of the if-loop if(n_max[node1]==n_max[node2]){ but in that case there is no error.
The only thing that feels a little suspicious about this problem is the texts_vectors[i] array which keeps counting the number of vector[i][j] type data that are currently inside the nodes. But even that i think i got it covered.
I really hope that somebody could have a look at this cause it is really frustrating and it needs to be done.
If you have a better idea of what is going on and want to take a look at the whole code, i pasted it into a pastezone. Cheers and thanks in advance.
EDIT:
As it turns out, the values I was taking from the texts_vectors array were exceeding the bounds of the array being indexed. Since each value gives the number of stored vectors, I have to subtract 1 to get the index of the actual last position. So that was it: no more segmentation fault in the serial run under gdb and ddd. However, the program now doesn't run on more than 2 nodes. If I execute it on 4 or more nodes, it crashes.

This line has multiple errors in it:
MPI_Allgather(&texts_vectors,1,MPI_INT,texts_vectors,1,MPI_INT,MPI_COMM_WORLD);
First, you are providing a pointer to a pointer to the data as the first argument of the gather-to-all operation. Therefore the value transmitted by each rank is not the first element of texts_vectors but rather the memory address of the data (or the lower half of the address on 64-bit little-endian LP64 systems).
Second, if you fix that by removing the address-of operator & from the beginning of the first argument, you will run into another problem. The MPI standard does not allow the source and destination buffers in MPI_Allgather to overlap. Some MPI implementations do not enforce that requirement and silently do The Right Thing (TM). Some other MPI implementations will try to copy the data with memcpy and run into problems with the C library (memcpy does not allow overlapping buffers). And finally, some MPI implementations will give you a nice error message about overlapping buffers and terminate your program.
Since you are sending a single integer element, simply copy the value into a temporary variable and use its address as the first argument.

Related

Process terminated with status -1073741819 mid loop?

Beginner in C and running into a problem with a function that initializes an array. Compiled in Code:Blocks 16.01 on Windows 10. Specific code I'm having issues with is:
/*
 * Initialises the auction table and prints each value as it is written.
 * auction is declared with 2 rows of MAXAUCTIONITEMS columns.
 * NOTE(review): with a first dimension of 2 the valid row indices are 0 and 1.
 * This version writes rows 1 and 2, so row 0 is never initialised and every
 * auction[2][...] access is outside the array.
 */
void initAuction(float auction[2][MAXAUCTIONITEMS]) {
int i;
/* Fill row 1 with -1, echoing each value and its column. */
for (i = 0; i < MAXAUCTIONITEMS; i++) {
auction[1][i] = -1;
printf("\n%f\t%d\n", auction[1][i], i);
};
/* Fill "row 2" with 0 -- out of bounds, see the note above. */
for (i = 0; i < MAXAUCTIONITEMS; i++) {
auction[2][i] = 0;
printf("\n\n%f\t%d", auction[2][i], i);
}
/* Debug print of a single element, also read from the out-of-bounds row. */
printf("\n%f\n", auction[2][70]);
return;
}
I've set up print statements to see how far I'm getting before the crash and I make it to the second for loop but it crashes at i=140. If I change the constant (which is equal to 1000) then the highest I can set it to without crashing is i<84 oddly enough. What would cause the termination status -1073741819 mid loop when the first row initialized no problem but row 2 chooses to crash at around i=140.
I've tried searching on google and here and it seems the termination code isn't a very specific code since I've seen solutions from needing a return statement, trying to access something that doesn't exist, etc. Really lost.
The valid indices are auction[0][*] and auction[1][*].
You are setting elements of the array beyond its boundaries: the initial dimension of auction is 2, the only valid values for this index are 0 and 1.
You can fix and simplify the code this way:
void initAuction(float auction[2][]) {
for (int i = 0; i < MAXAUCTIONITEMS; i++) {
auction[0][i] = -1;
auction[1][i] = 0;
}
}
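Note that the first dimension is not part of the type of auction: an array parameter is adjusted to a pointer to its element type, so the 2 is ignored by the compiler, while the second dimension (MAXAUCTIONITEMS) must be given.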

C: Segmentation fault: GDB: <error reading variable>

I have a function shortestPath() that is a modified implementation of Dijkstra's algorithm for use with a board game AI I am working on for my comp2 class. I have trawled through the website and using gdb and valgrind I know exactly where the segfault happens (actually knew that a few hours ago), but can't figure out what undefined behaviour or logic error is causing the problem.
The function in which the problem occurs is called around 10x and works as expected until it segfaults with GDB:
"error reading variable: cannot access memory"
and valgrind:
"Invalid read of size 8"
Normally that would be enough, but I can't work this one out. Also, any general advice and tips are appreciated... thanks!
GDB: https://gist.github.com/mckayryan/b8d1e9cdcc58dd1627ea
Valgrind: https://gist.github.com/mckayryan/8495963f6e62a51a734f
Here is the function in which the segfault occurs:
/*
 * Processes one batch of locations for the priority queue pQ:
 * appends the buffer entries to the queue, calls updatePrev() for the new
 * entries, re-sorts the queue with qsort(), and aborts if the queue is not
 * in non-decreasing `dist` order afterwards.
 *
 * pQ / pQLen      queue storage and its current length (updated through the pointer)
 * buffer / bufferLen  locations to append
 * prev, cur       passed through to the helpers
 */
static void processBuffer (GameView currentView, Link pQ, int *pQLen,
LocationID *buffer, int bufferLen, Link prev,
LocationID cur)
{
//printLinkIndex("prev", prev, NUM_MAP_LOCATIONS);
// adds newly retrieved buffer Locations to queue adding link types
appendLocationsToQueue(currentView, pQ, pQLen, buffer, bufferLen, cur);
// calculates distance of new locations and updates prev when needed
updatePrev(currentView, pQ, pQLen, prev, cur); <--- this line here
// sort the first *pQLen queue entries using cmpDist
qsort((void *) pQ, *pQLen, sizeof(link), (compfn)cmpDist);
// qsort sanity check
int i, qsortErr = 0;
// flag any adjacent pair that is out of order by dist
for (i = 0; i < *pQLen-1; i++)
if (pQ[i].dist > pQ[i+1].dist) qsortErr = 1;
if (qsortErr) {
fprintf(stderr, "loadToPQ: qsort did not sort succesfully");
abort();
}
}
and the function whereby after it is called everything falls apart:
/*
 * Appends one queue entry per connection between `cur` and each location in
 * `buffer`. For every buffer location, connections() reports how many
 * connections exist and fills `type`; each one becomes a new entry at the
 * end of pQ and *pQLen is advanced.
 *
 * NOTE(review): nothing here bounds *pQLen against the capacity of pQ, nor
 * the number of entries connections() writes against MAX_TRANSPORT --
 * confirm both with the callee and the caller.
 */
static void appendLocationsToQueue (GameView currentView, Link pQ,
                                    int *pQLen, LocationID *buffer,
                                    int bufferLen, LocationID cur)
{
    TransportID type[MAX_TRANSPORT] = { NONE };
    int b = 0;

    while (b < bufferLen) {
        // get connection information for this buffer location
        int found = connections(currentView->gameMap, cur, buffer[b], type);
        int t;

        for (t = 0; t < found; t++) {
            const int slot = *pQLen;

            pQ[slot].loc = buffer[b];
            pQ[slot].type = type[t];
            *pQLen = slot + 1;
        }
        b++;
    }
}
So I thought that a pointer had been overridden to the wrong address, but after a lot of printing in GDB that doesn't seem to be the case. I also rotated through making reads/writes to the variables in question to see which trigger the fault and they all do after appendLocationsToQueue(), but not before (or at the end of that function for that matter).
Here is the rest of the relevant code:
shortestPath():
/*
 * Modified Dijkstra search from `from` until `to` has been visited.
 *
 * Returns a malloc'd array of NUM_MAP_LOCATIONS links (`prev`) that the
 * caller must free, or NULL if an allocation fails.
 *
 * NOTE(review): the first queue entry only has `.loc` set, and the loop
 * relies on shift() always finding an entry; confirm that an unreachable
 * `to` cannot drain the queue.
 */
Link shortestPath (GameView currentView, LocationID from, LocationID to, PlayerID player, int road, int rail, int boat)
{
    if (!RAIL_MOVE) rail = 0;

    // index of locations that have been visited
    int visited[NUM_MAP_LOCATIONS] = { 0 };

    // the previous node (and distance) for the current known shortest path
    Link prev = malloc(NUM_MAP_LOCATIONS*sizeof(link));
    if (!prev) {
        fprintf(stderr, "GameView.c: shortestPath: malloc failure (prev)");
        return NULL;   // do not fall through and dereference NULL
    }

    // initialise link data structure
    for (int i = 0; i < NUM_MAP_LOCATIONS; i++) {
        prev[i].loc = NOWHERE;
        prev[i].type = NONE;
        prev[i].dist = (i != from) ? INF : LAST;
    }

    // a priority queue that dictates the order LocationID's are checked
    Link pQ = malloc(MAX_QUEUE*sizeof(link));
    if (!pQ) {
        fprintf(stderr, "GameView.c: shortestPath: malloc failure (pQ)");
        free(prev);
        return NULL;
    }

    int pQLen = 0;
    int bufferLen;
    LocationID cur;
    // starts as NULL so it can be freed unconditionally below
    LocationID *buffer = NULL;

    // load initial location into queue
    pQ[pQLen++].loc = from;

    while (!visited[to]) {
        // remove first item from queue into cur
        shift(pQ, &pQLen, &cur);
        if (visited[cur]) continue;

        // release the previous result of connectedLocations();
        // free(NULL) is a no-op on the first pass
        free(buffer);

        // find all locations connected to cur
        buffer = connectedLocations(currentView, &bufferLen, cur,
                                    player, currentView->roundNum, road,
                                    rail, boat);

        // mark current node as visited
        visited[cur] = VISITED;

        // locations from buffer are used to update priority queue (pQ)
        // and distance information in prev
        processBuffer(currentView, pQ, &pQLen, buffer, bufferLen, prev,
                      cur);
    }

    free(buffer);
    free(pQ);
    return prev;
}
The fact that all your parameters look good before this line:
appendLocationsToQueue(currentView, pQ, pQLen, buffer, bufferLen, cur);
and become unavailable after it tells me that you've stepped on (wrote 0x7fff00000000 to) the $rbp register (all local variables and parameters are relative to $rbp when building without optimization).
You can confirm this in GDB with print $rbp before and after call to appendLocationsToQueue ($rbp is supposed to always have the same value inside a given function, but will have changed).
Assuming this is true, there are only a few ways this could happen, and the most likely way is a stack buffer overflow in appendLocationsToQueue (or something it calls).
You should be able to use Address Sanitizer (gcc -fsanitize=address ...) to find this bug fairly easily.
It's also fairly easy to find the overflow in GDB: step into appendLocationsToQueue, and do watch -l *(char**)$rbp, continue. The watchpoint should fire when your code overwrites the $rbp save location.

Function crashing in release mode but runs flawless in debugger

My program crashes in this function on the 7th line, where I call malloc(). When I run in release mode I get the "Program.exe has stopped working" message; when I run in the debugger it succeeds most of the time, but sometimes I get this message (especially on larger input):
/*
 * Multiplies two polynomials given as arrays of monoms.
 *
 * Every pair (poly1[a], poly2[b]) contributes one monom whose coefficient is
 * the product of the coefficients and whose power is the sum of the powers.
 * The result array starts with room for one monom and doubles whenever it is
 * full. It is then passed to mergeSort() and sumMonomsWithSamePower(); the
 * value returned by the latter is stored in *productSize.
 *
 * Returns the heap-allocated product array.
 */
MONOM* polynomialsProduct(MONOM* poly1, int size1, MONOM* poly2, int size2, int* productSize)
{
    int capacity = 1;   /* slots currently allocated */
    int count = 0;      /* slots currently in use */
    int a, b;
    MONOM* product = (MONOM*)malloc(capacity * sizeof(MONOM));

    monomAllocationVerification(product);

    for (a = 0; a < size1; a++)
    {
        for (b = 0; b < size2; b++)
        {
            /* grow before writing when the array is full */
            if (count == capacity)
            {
                capacity = capacity * 2;
                product = (MONOM*)realloc(product, capacity * sizeof(MONOM));
                monomAllocationVerification(product);
            }

            product[count].coefficient = poly1[a].coefficient * poly2[b].coefficient;
            product[count].power = poly1[a].power + poly2[b].power;
            count += 1;
        }
    }

    mergeSort(product, count);
    *productSize = sumMonomsWithSamePower(product, count);

    return product;
}
I understand that I'm dealing with memory errors and problems, but is there any quick way to analyze my code and look for memory errors? I look at my code a dozen of times looking for this kind of errors and found nothing. (I didn't want to post the code here since its 420 lines long).
First of all, if heap corruption is detected on the first malloc, that means it happened earlier (not in this function or on previous pass). So the problem may lie outside this code.
However, the code also looks suspicious to me.
monomAllocationVerification has no size parameter, so it should work on one monom only, yet you call it only once after realloc on pointer to first element, despite having allocated space for quite a few monoms. Please clarify your decision.
It is a bit unclear why sumMonomsWithSamePower should return a size, and thus modify an array to store a value. May be a quirk, but still suspicious.
UPDATE
The problem was in other functions; a few reallocs with wrong size.
I would check the return value of malloc() and use perror() to describe what error has occurred. Also here is the documentation for malloc() and perror().
if ((product = (MONOM*)malloc(phSize*sizeof(MONOM))) == NULL)
{
    // perror() prints this text followed by the system's description of the
    // current errno value, which may tell you why the allocation failed
    perror("ERROR: Failed to malloc ");
    // the enclosing function returns MONOM*, so failure is signalled with NULL
    return NULL;
}
Also do you know the size of MONOM? If not add the following line to your code.
// sizeof yields a size_t, which must be printed with %zu
printf("MONOM SIZE = %zu\n", sizeof(MONOM));

Thread execution issue

I'm approaching C programming with threads and I can't get this program to work properly. Basically there's a vector with k elements, n threads and each thread has to calculate the max on its k/n elements.
My code is (please note it's not the whole code):
// Struct code used later
// Work descriptor read by the thread routine through its void * argument.
struct maxStruct
{
// vector data -- presumably the slice this thread should scan; TODO confirm against generaStruct
double *vettore;
// element count ("dimensione" = size) -- presumably of that slice; TODO confirm
int dimensione;
};
// Gathering data input from user
[ . . . ]
vector = (double *) malloc (dimensione * sizeof(double));
pid_thread = (int *) malloc (numero_thread * sizeof(int));
thread = (pthread_t *) malloc (numero_thread * sizeof(pthread_t));
// Generating the vector
[ . . . ]
// Start one thread per slot; each receives the descriptor built by generaStruct().
for (i = 0; i < numero_thread; i++)
{
    e = generaStruct(i, vettore, dimensione, numero_thread);
    // pthread_create returns 0 on success and an error number on failure
    status = pthread_create(&thread[i], NULL, calcolaMassimo, (void *) e);
    if (status != 0)
    {
        // name the call that actually failed
        pthread_perror("pthread_create", status);
        exit(1);
    }
}
//Note that the function doesn't calculate the max, I've coded it in this way
//in order to see whether it was being called by each thread and apparently it is not.
/*
 * Thread start routine (diagnostic version): prints a marker and the
 * `dimensione` field of the struct maxStruct passed in `e`.
 * Returns NULL; a void * routine must return a value, which pthread_join()
 * can later collect.
 */
void *calcolaMassimo(void * e)
{
    struct maxStruct *sottoVettore = e;

    printf("Sono chiamata!!\n");
    printf("Dimensione: %d\n", sottoVettore->dimensione);
    return NULL;
}
Apparently this function is not being called by each thread and I can't figure out why. Will you please help me solve this issue?
Firstly, a minor nit pick, the idiomatic way to write (*sottoVettore).dimensione) is sottoVettore->dimensione.
The process containing all of the threads will exit when main() exits. I know you said you're joining in your actual code, so that should not be an issue, but if you're not joining in the test code then that could be an issue.
It is also possible that the issue is not that the code in each thread isn't executing, but that the statements aren't actually reaching stdout. You might want to try a fflush(stdout) at the end of calcolaMassimo and see if that changes things.

Running out of memory.. How?

I'm attempting to write a solver for a particular puzzle. It tries to find a solution by trying every possible move one at a time until it finds a solution. The first version tried to solve it depth-first by continually trying moves until it failed, then backtracking, but this turned out to be too slow. I have rewritten it to be breadth-first using a queue structure, but I'm having problems with memory management.
Here are the relevant parts:
// Entry point (partly elided in the post): calls solver() repeatedly until it
// returns non-zero or the queue is reported empty.
int main(int argc, char *argv[])
{
...
int solved = 0;
// do-while: solver() runs once before the queue is first checked for emptiness
do {
solved = solver(queue);
} while (!solved && !pblListIsEmpty(queue));
...
}
/*
 * Performs one breadth-first step: takes the next state from the queue,
 * returns 1 (after printing) if it is a solution, otherwise pushes one
 * successor state per move that passes rules() and in_dir() and returns 0.
 */
int solver(PblList *queue) {
/* NOTE(review): the result of pblListPoll() is used without a NULL check. */
state_t *state = (state_t *) pblListPoll(queue);
if (is_solution(state->pucks)) {
print_solution(state);
return 1;
}
state_t *state_cp;
puck new_location;
/* Try every puck in every direction from NORTH through WEST. */
for (int p = 0; p < puck_count; p++) {
for (dir i = NORTH; i <= WEST; i++) {
if (!rules(state->pucks, p, i)) continue;
new_location = in_dir(state->pucks, p, i);
/* x == -1 is treated as "no valid destination"; such moves are skipped */
if (new_location.x != -1) {
/* NOTE(review): neither malloc result below is checked before use; a NULL
   return leads straight to the writes and the memcpy that follow. */
state_cp = (state_t *) malloc(sizeof(state_t));
state_cp->move.from = state->pucks[p];
state_cp->move.direction = i;
/* the successor keeps a pointer to its parent, so the parent state
   itself is not freed in this function */
state_cp->prev = state;
state_cp->pucks = (puck *) malloc (puck_count * sizeof(puck));
memcpy(state_cp->pucks, state->pucks, puck_count * sizeof(puck)); /*CRASH*/
state_cp->pucks[p] = new_location;
pblListPush(queue, state_cp);
}
}
}
/* Only the parent's puck array is released here.
   NOTE(review): successors still reach this state through prev, so
   prev->pucks must not be read afterwards -- confirm print_solution(). */
free(state->pucks);
return 0;
}
When I run it I get the error:
ice(90175) malloc: *** mmap(size=2097152) failed (error code=12)
*** error: can't allocate region
*** set a breakpoint in malloc_error_break to debug
Bus error
The error happens around iteration 93,000.
From what I can tell, the error message is from malloc failing, and the bus error is from the memcpy after it.
I have a hard time believing that I'm running out of memory, since each game state is only ~400 bytes. Yet that does seem to be what's happening, seeing as the activity monitor reports that it is using 3.99GB before it crashes. I'm using http://www.mission-base.com/peter/source/ for the queue structure (it's a linked list).
Clearly I'm doing something dumb. Any suggestions?
Check the result of malloc. If it's NULL, you might want to print out the length of that queue.
Also, the code snippet you posted didn't include any frees...
You need to free() the memory you've allocated manually after you're done with it; dynamic memory doesn't just "free itself"

Resources