Resetting arrays in VS and Xcode have weird timestamp discrepancies - c

EDIT (09/24/15): Added GCC on Ubuntu timestamps
I have a 17-dimensional array (containing exactly 79,626,240 values) that I need reset (to -3) every loop.
I've written a chunk of code to test out using memset and for-loops to reset this array, and logged the average time it takes to reset the array. Comparing the same code block in Xcode and Visual Studio, some very weird results emerge...
Here are my timestamps (the code is appended to the bottom of this post):
| XCODE 7.0 | VS 14 | GCC |
---------------------------------------------------------
memset (seconds) | 0.00450 s | 0.00719 s | 0.01197 s |
for-loop (seconds) | 0.73300 s | 0.00728 s | 1.08112 s |
What is up with the time discrepancies?! Why is memset two orders of magnitude faster than using for-loops in Xcode, but in Visual Studio, they functionally take the exact same time?
The code is below. compiletable_main uses for-loops to reset the array, while compiletable_main_3 uses memset.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <memory.h>
#define num_runs 10
#define num_gens 4000
#define num_threads 3
//// BEGIN CATABLE STRUCTURE
#define box1 3
#define box2 3
#define box3 3
#define box4 3
#define box5 4
#define box6 4
#define box7 4
#define box8 4
#define memvara 2
#define memvarb 2
#define memvarc 2
#define memvard 2
#define tdirect 1
#define adirect 4
#define outputnum 15
#define fs 2
#define bs 2
/* Lookup table: a single 17-dimensional array of signed char, one entry per
 * combination of the index ranges given by the box*, memvar*, adirect,
 * tdirect, fs, bs and outputnum macros defined above. */
typedef struct calookup
{
signed char n[box1][box2][box3][box4][box5][box6][box7][box8][memvara][memvarb][memvarc][memvard][adirect][tdirect][fs][bs][outputnum];
} calookup;
//// END CATABLE STRUCTURE
int ra_pos = 0;
long int dimensions = box1*box2*box3*box4*box5*box6*box7*box8*memvara*memvarb*memvarc*memvard*adirect*tdirect*fs*bs*outputnum;
// Compiletable_main
/*
 * Reset every entry of lookup->n to the default value -3 by visiting each
 * element through its full 17-dimensional index (the "for-loop" variant of
 * the benchmark).
 *
 * lookup: table to reset; must not be NULL.
 *
 * The loops are stacked without braces because every level has exactly one
 * statement as its body; the innermost statement is the only real work.
 */
void compiletable_main(calookup *lookup) {
    int j, k, l, m, x, y, xa, xb;
    int nb, na, nx, nx1, naa, nbb, ncc, ndd, o;

    for (j = 0; j < box1; j++)
    for (k = 0; k < box2; k++)
    for (l = 0; l < box3; l++)
    for (m = 0; m < box4; m++)
    for (x = 0; x < box5; x++)
    for (y = 0; y < box6; y++)
    for (xa = 0; xa < box7; xa++)
    for (xb = 0; xb < box8; xb++)
    for (nb = 0; nb < memvara; nb++)
    for (na = 0; na < memvarb; na++)
    for (nx = 0; nx < memvarc; nx++)
    for (nx1 = 0; nx1 < memvard; nx1++)
    for (naa = 0; naa < adirect; naa++)
    for (nbb = 0; nbb < tdirect; nbb++)
    for (ncc = 0; ncc < fs; ncc++)
    for (ndd = 0; ndd < bs; ndd++)
    for (o = 0; o < outputnum; o++)
        lookup->n[j][k][l][m][x][y][xa][xb][nb][na][nx][nx1][naa][nbb][ncc][ndd][o] = -3; /* set to default value */
}
/*
 * Reset every entry of lookup->n to -3 with a single memset (the "memset"
 * variant of the benchmark).
 *
 * lookup: table to reset; must not be NULL.
 *
 * The byte count is taken from the array itself, so it cannot drift out of
 * sync with the declaration of calookup. Each element is one signed char,
 * so filling every byte with (unsigned char)-3 stores -3 in every element.
 */
void compiletable_main_3(calookup *lookup) {
    memset(lookup->n, -3, sizeof lookup->n);
}
/* Benchmark hook: resets the table pointed to by rs by forwarding it to
 * whichever reset routine is called below. */
void evaluatepopulation_tissueb(calookup *rs) {
// Swap between compiletable_main_3 and compiletable_main
// for the memset and for-loop approaches, respectively
compiletable_main_3(rs);
}
calookup ra;
/*
 * Benchmark driver: resets the global table `ra` a fixed number of times and
 * reports the total and per-pass CPU time as measured by clock().
 */
int main() {
    const int passes = 100;          /* number of resets to average over */
    clock_t t_begin, t_end;
    double total_seconds, seconds_per_pass;
    int pass;

    printf(" Begin program... \n");

    t_begin = clock();
    for (pass = 0; pass < passes; pass++) {
        evaluatepopulation_tissueb(&ra);
    }
    t_end = clock();

    /* clock() counts ticks; CLOCKS_PER_SEC converts them to seconds. */
    total_seconds = (double)(t_end - t_begin) / CLOCKS_PER_SEC;
    seconds_per_pass = total_seconds / passes;

    printf("Completed run \n");
    printf(" Total Time : %lf \n ", total_seconds);
    printf(" Avg Time/Loop : %lf \n", seconds_per_pass);
    return 0;
}

C has a long list of compilers developed by different organisations and individuals (check here).
These compilers vary a lot in the techniques they employ for optimization and in many other respects. Since Visual Studio and Xcode come from different tech giants, you should not expect them to use the same compiler.
A basic google Search Revealed that Apple supports LLVM/Clang which offers complete replacement for GCC
Microsoft uses a different one.
Thus you might notice a total time taken difference.
Try running your code on GCC or TCC ! You should again get different answers.
Result on GCC for the code you uploaded :

Related

Faster divisibility test than % operator?

I noticed a curious thing on my computer.* The handwritten divisibility test is significantly faster than the % operator. Consider the minimal example:
* AMD Ryzen Threadripper 2990WX, GCC 9.2.0
/*
 * Division-free divisibility test: returns 1 if a divides m, 0 otherwise.
 * Intended for odd a and m > 0 (see the surrounding text).
 *
 * While m exceeds a, a is added to m and all trailing zero bits of the sum
 * are shifted out; the answer is whether the value finally equals a.
 */
static int divisible_ui_p(unsigned int m, unsigned int a)
{
    if (m > a) {
        unsigned int sum = m + a;
        return divisible_ui_p(sum >> __builtin_ctz(sum), a);
    }
    return m == a;
}
The example is limited by odd a and m > 0. However, it can be easily generalized to all a and m. The code just converts the division to a series of additions.
Now consider the test program compiled with -std=c99 -march=native -O3:
for (unsigned int a = 1; a < 100000; a += 2) {
for (unsigned int m = 1; m < 100000; m += 1) {
#if 1
volatile int r = divisible_ui_p(m, a);
#else
volatile int r = (m % a == 0);
#endif
}
}
... and the results on my computer:
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 8.52user |
| builtin % operator | 17.61user |
Therefore more than 2 times faster.
The question: Can you tell me how the code behaves on your machine? Is it missed optimization opportunity in GCC? Can you do this test even faster?
UPDATE:
As requested, here is a minimal reproducible example:
#include <assert.h>
/*
 * Returns 1 if a divides m and 0 otherwise, without using division.
 * Intended for odd a and m > 0.
 *
 * Each step replaces m by (m + a) with its trailing zero bits removed,
 * until m no longer exceeds a.
 */
static int divisible_ui_p(unsigned int m, unsigned int a)
{
    if (m <= a)
        return m == a ? 1 : 0;

    m += a;
    return divisible_ui_p(m >> __builtin_ctz(m), a);
}
/* Benchmark driver: evaluates the divisibility test for every odd a and
 * every m in [1, 100000). */
int main()
{
for (unsigned int a = 1; a < 100000; a += 2) {
for (unsigned int m = 1; m < 100000; m += 1) {
/* Cross-check against %; compiled out when NDEBUG is defined. */
assert(divisible_ui_p(m, a) == (m % a == 0));
/* Switch the condition between 1 and 0 to time the handwritten test or
 * the % operator, respectively. */
#if 1
/* NOTE(review): volatile presumably keeps the compiler from discarding
 * the otherwise unused result. */
volatile int r = divisible_ui_p(m, a);
#else
volatile int r = (m % a == 0);
#endif
}
}
return 0;
}
compiled with gcc -std=c99 -march=native -O3 -DNDEBUG on AMD Ryzen Threadripper 2990WX with
gcc --version
gcc (Gentoo 9.2.0-r2 p3) 9.2.0
UPDATE2: As requested, the version that can handle any a and m (if you also want to avoid integer overflow, the test has to be implemented with integer type twice as long as the input integers):
/*
 * Division-free divisibility test for arbitrary operands.
 *
 * m: dividend.
 * a: candidate divisor.
 *
 * Returns 1 if a divides m, 0 otherwise. Zero is treated as divisible by
 * everything (including a == 0); a non-zero m is never divisible by 0.
 *
 * The reduction runs in a 64-bit accumulator: m + a can exceed the range of
 * unsigned int, and a wrapped sum would produce wrong answers (for example
 * m = UINT_MAX, a = 3).
 */
int divisible_ui_p(unsigned int m, unsigned int a)
{
    unsigned long long acc;
    unsigned long long odd;
    int alpha;

    /* __builtin_ctz(0) is undefined, so zero operands are settled up front. */
    if (a == 0) {
        return m == 0;
    }
    if (m == 0) {
        return 1;
    }

    /* Even a: m must carry at least as many factors of two as a does;
     * after that only the odd part of a matters. */
    alpha = __builtin_ctz(a);
    if (alpha > 0) {
        if (__builtin_ctz(m) < alpha) {
            return 0;
        }
        a >>= alpha;
    }

    /* Adding the odd divisor and stripping trailing zero bits preserves
     * divisibility by it and eventually brings acc down to at most odd. */
    acc = m;
    odd = a;
    while (acc > odd) {
        acc += odd;
        acc >>= __builtin_ctzll(acc);
    }

    return acc == odd;
}
What you’re doing is called strength reduction: replacing an expensive operation with a series of cheap ones.
The mod instruction on many CPUs is slow, because it historically was not tested in several common benchmarks and the designers therefore optimized other instructions instead. This algorithm will perform worse if it has to do many iterations, and % will perform better on a CPU where it needs only two clock cycles.
Finally, be aware that there are many shortcuts to take the remainder of division by specific constants. (Although compilers will generally take care of this for you.)
I will answer my question myself. It seems that I became a victim of branch prediction. The mutual size of the operands does not seem to matter, only their order.
Consider the following implementation
/*
 * Iterative form of the divisibility test (odd a, m > 0): returns 1 if a
 * divides m and 0 otherwise.
 */
int divisible_ui_p(unsigned int m, unsigned int a)
{
    /* Each round adds a, then drops the trailing zero bits of the sum. */
    for (; m > a; m >>= __builtin_ctz(m)) {
        m += a;
    }
    return m == a;
}
and the arrays
unsigned int A[100000/2];
unsigned int M[100000-1];
for (unsigned int a = 1; a < 100000; a += 2) {
A[a/2] = a;
}
for (unsigned int m = 1; m < 100000; m += 1) {
M[m-1] = m;
}
which are / are not shuffled using the shuffle function.
Without shuffling, the results are still
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 8.56user |
| builtin % operator | 17.59user |
However, once I shuffle these arrays, the results are different
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 31.34user |
| builtin % operator | 17.53user |

Printing every unlock pattern in a 2d phone keypad in C

I want to write a function that prints all possible patterns like in the examples below. In every case, we must start in the top left of a 3x3 array. It's similar to the patterns to unlock mobile phones, except the line can't go diagonally and must pass through every box.
1--->2--->3 1--->2--->3
| |
v v
8<---7 4 or 6<---5<---4
| ^ | |
v | v v
9 6<---5 7--->8--->9
I started by writing code where [0][0] was assigned 1 and the rest of the digits in the 2D array were randomised until [1][0] or [0][1] was equal to 2, and so forth. But I feel like this is making the problem even more difficult to solve.
Then tried to use recursion to call the makePattern function again and again until the array is changed; however, it changes all values in the array to 2 because of these lines of code:
int value = 2;
array[x][y] = value;
However, I don't know how to loop this value so that it increases each time the function is called again.
#include <stdio.h>
#include <stdlib.h>
#define ROW 3
#define COLUMN 3
int makePattern(int array[ROW][COLUMN], int x, int y);
/* Builds the 3x3 grid, runs makePattern from the top-left cell and prints
 * the resulting grid row by row. */
int main(void) {
/* NOTE(review): x, y and count are declared but never used below. */
int x, y;
int count = 2;
int i, j;
/* Cells are initialised with character constants, not the integers 0/1. */
int array[ROW][COLUMN] = {
{'1', '0', '0'},
{'0', '0', '0'},
{'0', '0', '0'},
};
makePattern(array, 0, 0);
for (i = 0; i < ROW; i++) {
for (j = 0; j < COLUMN; j++) {
/* %d prints the numeric value of the cell, so an untouched '0' cell is
 * printed as its character code rather than as 0. */
printf("%d", array[i][j]);
}
printf("\n");
}
return 0;
}
/* Marks cell (x, y) and recurses into each in-bounds orthogonal neighbour
 * that still holds '0'.
 * NOTE(review): declared to return int, but no return statement is present. */
int makePattern(int array[ROW][COLUMN], int x, int y) {
/* value is a fresh local set to 2 on every call, so every visited cell is
 * written as 2. */
int value = 2;
array[x][y] = value;
/* value is not used inside the loop body; it only controls the number of
 * iterations, and it is incremented both here and at the end of the body. */
for (value = 2; value < 9; value++) {
if (x + 1 < ROW && array[x+1][y] == '0') {
makePattern(array, x + 1, y);
}
if (x - 1 >= 0 && array[x - 1][y] == '0') {
makePattern(array, x - 1, y);
}
if (y + 1 < COLUMN && array[x][y + 1] == '0') {
makePattern(array, x, y + 1);
}
if (y - 1 >= 0 && array[x][y - 1] == '0') {
makePattern(array, x, y - 1);
}
value++;
}
}
You're on the right track here in that you're using a 3x3 matrix to keep track of state (visited nodes and to store the path taken), x/y coordinates to represent the current location and spawning four recurse calls to handle the possible move directions (with bounds checks).
However, I'm not sure the loop running to 9 is going to work--this will spawn 36 recursive calls per frame. This might be workable in some implementations, but I think the easiest approach is to treat each frame as exploring one possible direction given an x/y coordinate pair, then backtracking (undoing the move) after all directions have been explored recursively from that square. Whenever we hit the last step, we know we've explored all of the squares and it's time to print the current solution path.
Here's code which achieves this and basically hardcodes the dimensions. An exercise would be to generalize the code to matrices of any size and return the path to separate printing from the traversal logic. I also opted to move state out of the main function.
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
/*
 * Backtracking step of the pattern search.
 *
 * pad:  3x3 grid; 0 marks an unvisited cell, any other value is the 1-based
 *       position of that cell in the current path.
 * x, y: column and row of the cell being entered.
 * step: number of cells visited before this one (0 for the start cell).
 *
 * Marks the cell, explores every unvisited orthogonal neighbour, prints the
 * grid when this cell is the ninth one on the path, and finally unmarks the
 * cell so the caller can try other routes.
 */
static void print_unlock_patterns_r(int pad[3][3], int x, int y, int step) {
    static int const moves[4][2] = {{1, 0}, {0, 1}, {-1, 0}, {0, -1}};

    pad[y][x] = step + 1;

    for (int d = 0; d < 4; d++) {
        int nx = x + moves[d][0];
        int ny = y + moves[d][1];

        if (nx < 0 || nx >= 3 || ny < 0 || ny >= 3) {
            continue; /* off the grid */
        }
        if (pad[ny][nx] != 0) {
            continue; /* already on the current path */
        }
        print_unlock_patterns_r(pad, nx, ny, step + 1);
    }

    if (step == 8) {
        /* All nine cells are on the path: emit the grid and a blank line. */
        for (int row = 0; row < 3; row++) {
            for (int col = 0; col < 3; col++) {
                printf("%d", pad[row][col]);
            }
            puts("");
        }
        puts("");
    }

    pad[y][x] = 0; /* backtrack */
}
/* Prints every path that starts in the top-left cell of an empty 3x3 grid
 * and is accepted by print_unlock_patterns_r. */
void print_unlock_patterns() {
    int pad[3][3] = {{0}}; /* all cells start unvisited */

    print_unlock_patterns_r(pad, 0, 0, 0);
}
int main(void) {
print_unlock_patterns();
return 0;
}
Output:
123
894
765
123
874
965
123
654
789
129
438
567
145
236
987
189
276
345
187
296
345
167
258
349

How to speed up printf in C

I have a task to print all the prime numbers between 1 and 1000000 in class and the fastest 10 programs get extra marks. The main problem is the time it takes for the prime numbers to be printed to the console.
Basically using the Sieve of Eratosthenes I produce an array with only boolean values in it. The boolean value Numbers[i] is true if i+2 is a prime number.
for(i = 0; i <= n - 2; ++i)
if (Numbers[i]) // True if the number is prime
printf("%d\n", i+2);
Printf seems to be really slow as the program can generate the list of primes in about 0.035 s but then takes a further 11 seconds to print the list. Is there anyway I can speed this up, thanks.
Beneath is a slightly unoptimized implementation (although I skipped the intermediate list and print directly) of what I think you were supposed to do. Running that program on an AMD A8-6600K with a small load (mainly a Youtube music-video for some personal entertainment) results in
real 0m1.211s
user 0m0.047s
sys 0m0.122s
averaged over a couple of runs. So the problem lies in your implementation of the sieve or you are hiding some essential facts about your hardware.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>
#include <string.h>
/* I call it a general bitset. Others might call it an abomination. YMMV. */

/* Number of bits held by one uint32_t storage word. */
# define ERAT_BITS (sizeof(uint32_t)*CHAR_BIT)
/* Bytes of storage backing a bitset of n bits: whole words, at least one. */
# define MP_BITSET_BYTES(n) (((n) / ERAT_BITS + 1) * sizeof(uint32_t))
/* Mask selecting bit n inside its word. The constant is unsigned so that
 * shifting into bit 31 is well defined. */
# define ERAT_MASK(n) (UINT32_C(1) << ((n) % ERAT_BITS))
/* Word-array primitives: s points to the uint32_t storage, n is the bit
 * index. Arguments are fully parenthesized, so expressions may be passed. */
# define GET_BIT(s,n) (((s)[(n) / ERAT_BITS] & ERAT_MASK(n)) != 0)
# define SET_BIT(s,n) ((s)[(n) / ERAT_BITS] |= ERAT_MASK(n))
# define CLEAR_BIT(s,n) ((s)[(n) / ERAT_BITS] &= ~ERAT_MASK(n))
# define TOG_BIT(s,n) ((s)[(n) / ERAT_BITS] ^= ERAT_MASK(n))
/* size is the size in bits, the overall size might be bigger */
typedef struct mp_bitset_t {
    uint32_t size;
    uint32_t *content;
} mp_bitset_t;
/* Allocates storage for n bits in *bst (contents uninitialized) and records
 * n as the size; prints a message and exits on allocation failure. */
# define mp_bitset_alloc(bst, n) \
do {\
    (bst)->size = (n);\
    (bst)->content = malloc(MP_BITSET_BYTES((bst)->size));\
    if ((bst)->content == NULL) {\
        fprintf(stderr, "memory allocation for bitset failed");\
        exit(EXIT_FAILURE);\
    }\
} while (0)
# define mp_bitset_size(bst) ((bst)->size)
/* Set / clear every bit of the allocated storage. */
# define mp_bitset_setall(bst) memset((bst)->content, 0xFF,\
    MP_BITSET_BYTES((bst)->size))
# define mp_bitset_clearall(bst) memset((bst)->content, 0,\
    MP_BITSET_BYTES((bst)->size))
# define mp_bitset_clear(bst,n) CLEAR_BIT((bst)->content, n)
# define mp_bitset_set(bst,n) SET_BIT((bst)->content, n)
# define mp_bitset_get(bst,n) GET_BIT((bst)->content, n)
/* Releases the storage AND the mp_bitset_t object itself, so bst must have
 * been obtained from malloc. */
# define mp_bitset_free(bst) \
do {\
    free((bst)->content);\
    free(bst);\
} while (0)
uint32_t mp_bitset_nextset(mp_bitset_t * bst, uint32_t n);
uint32_t mp_bitset_prevset(mp_bitset_t * bst, uint32_t n);
void mp_eratosthenes(mp_bitset_t * bst);
/* It's called Hallek's method but it has many inventors.
 *
 * Integer square root: returns the largest r with r * r <= n.
 * Works one result bit at a time, from the highest candidate downwards,
 * using only shifts, comparisons and subtractions. */
static uint32_t isqrt(uint32_t n)
{
    uint32_t remainder = n;
    uint32_t result = 0;

    if (n == 0) {
        return 0;
    }

    /* bit walks down through the powers of four, starting at the largest
     * one that fits in 32 bits. */
    for (uint32_t bit = UINT32_C(1) << 30; bit != 0; bit >>= 2) {
        uint32_t trial = result | bit;

        result >>= 1;
        if (remainder >= trial) {
            remainder -= trial;
            result |= bit;
        }
    }
    return result;
}
/* Returns the index of the first set bit at or after n, or a value that is
 * not below mp_bitset_size(bst) when no such bit exists. */
uint32_t mp_bitset_nextset(mp_bitset_t *bst, uint32_t n)
{
    for (; n < mp_bitset_size(bst); n++) {
        if (mp_bitset_get(bst, n)) {
            break;
        }
    }
    return n;
}
/*
 * Sieve of Eratosthenes over the whole bitset: all bits are set first, then
 * 0, 1, the even numbers from 4 upwards and the odd multiples (starting at
 * the square) of every surviving number up to the square root of the size
 * are cleared.
 *
 * Standard method, quite antique now, but good enough for the handful
 * of primes needed here.
 */
void mp_eratosthenes(mp_bitset_t *bst)
{
    uint32_t limit, root, prime, multiple;

    mp_bitset_setall(bst);
    mp_bitset_clear(bst, 0);
    mp_bitset_clear(bst, 1);

    limit = mp_bitset_size(bst);
    root = isqrt(limit);

    /* Strike the even numbers above 2 in one pass. */
    for (multiple = 4; multiple < limit; multiple += 2) {
        mp_bitset_clear(bst, multiple);
    }

    /* Only candidates up to the square root need to strike anything. */
    for (prime = mp_bitset_nextset(bst, 1);
         prime < limit && prime <= root;
         prime = mp_bitset_nextset(bst, prime + 1)) {
        for (multiple = prime * prime; multiple < limit; multiple += prime * 2) {
            mp_bitset_clear(bst, multiple);
        }
    }
}
#define UPPER_LIMIT 1000000 /* one million */
/*
 * Prints every prime below UPPER_LIMIT, one per line, sieving and printing
 * in a single pass: each number still set when the scan reaches it is
 * printed, and its odd multiples from its square upwards are then cleared.
 */
int main(void) {
    mp_bitset_t *bst;
    uint32_t n, k;
    /* 64-bit on purpose: k * k does not fit in 32 bits once k exceeds
     * 65535, and a wrapped product below n would clear unrelated bits. */
    uint64_t j;

    bst = malloc(sizeof(mp_bitset_t));
    if (bst == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes\n", sizeof(mp_bitset_t));
        exit(EXIT_FAILURE);
    }
    mp_bitset_alloc(bst, UPPER_LIMIT);
    mp_bitset_setall(bst);
    mp_bitset_clear(bst, 0); // 0 is not prime b.d.
    mp_bitset_clear(bst, 1); // 1 is not prime b.d.
    n = mp_bitset_size(bst);

    /* Even numbers above 2 are never prime. */
    for (k = 4; k < n; k += 2) {
        mp_bitset_clear(bst, k);
    }

    k = 0;
    while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
        printf("%" PRIu32 "\n", k);
        for (j = (uint64_t)k * k; j < n; j += (uint64_t)k * 2) {
            mp_bitset_clear(bst, j);
        }
    }

    mp_bitset_free(bst);
    return EXIT_SUCCESS;
}
Compiled with
gcc-4.9 -O3 -g3 -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing -pedantic -std=c11 tests.c -o tests
(GCC is gcc-4.9.real (Ubuntu 4.9.4-2ubuntu1~14.04.1) 4.9.4)
Console output is line buffered by default, which is the reason for the increased time.
You can use the setvbuf function to allow printing to console/stdout only in chunks rather than for each iteration.
E.g.
char buffer[256];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
You can alter the size of buffer according to your needs.
The _IOFBF option selects full buffering, i.e. output will be printed once the buffer is full.
See setvbuf for more details

custom string alignment using printf in C

I'm trying to get the following output from the given array
Apples 200 Grapes 900 Bananas Out of stock
Grapefruits 2 Blueberries 100 Oranges Coming soon
Pears 10000
Here's what I came up so far (feels like I'm overdoing it), however, I'm still missing something when padding the columns. I'm open to any suggestions on how to approach this.
#include <stdio.h>
#include <string.h>
#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
char *fruits[][2] = {
{"Apples", "200"},
{"Grapes", "900"},
{"Bananas", "Out of stock"},
{"Grapefruits", "2"},
{"Blueberries", "100"},
{"Oranges", "Coming soon"},
{"Pears", "10000"},
};
/* Returns the longest strlen of fruits[i][y] over the rows i for which
 * i % j == 0 (0 if no row qualifies).
 * j: modulus used to select rows; j == 0 would divide by zero.
 * y: which string of each row to measure (0 or 1 for this table). */
int get_max (int j, int y) {
int n = ARRAY_SIZE(fruits), width = 0, i;
for (i = 0; i < n; i++) {
/* NOTE(review): the row filter depends only on the modulus j passed in,
 * not on a column position. */
if (i % j == 0 && strlen(fruits[i][y]) > width) {
width = strlen(fruits[i][y]);
}
}
return width;
}
/* Prints the fruits table three entries per output line, padding each
 * string to the width returned by get_max. */
int main(void) {
int n = ARRAY_SIZE(fruits), i, j;
for (i = 0, j = 1; i < n; i++) {
/* Start a new output line after every third entry; j counts the output
 * lines and is what gets passed to get_max as its first argument. */
if (i > 0 && i % 3 == 0) {
printf("\n"); j++;
}
/* %-*s left-justifies the string in a field whose width is the preceding
 * int argument. */
printf("%-*s ", get_max(j, 0), fruits[i][0]);
printf("%-*s ", get_max(j, 1), fruits[i][1]);
}
printf("\n");
return 0;
}
Current output:
Apples 200 Grapes 900 Bananas Out of stock
Grapefruits 2 Blueberries 100 Oranges Coming soon
Pears 10000
You are computing widths wrong. In essence, you want to be able to compute the width of a particular column. Thus, in your get_max function, you should be able to specify a column. We can then pick out the elements from the list based on whether their index mod 3 is equal to the column. This can be accomplished as such:
int get_max (int column, int y) {
...
if (i % 3 == column /* <- change */ && strlen(fruits[i][y]) > width) {
...
}
Then in your main loop, you want to choose the widths of the columns based on what column you are currently in. You can do that by taking the index mod 3:
for (i = 0, j = 1; i < n; i++) {
...
printf("%-*s ", get_max(i % 3 /* change */, 0), fruits[i][0]);
printf("%-*s ", get_max(i % 3 /* change */, 1), fruits[i][1]);
}
This should work as you expect.
I didn't try to understand your logic, but I think you can space the data using a tab ("\t"):
printf("%s \t %d","banana", 200);

Values that are stored in a C structure 'magically' change on their own

I have a structure (with nesting) of the following type:
/* One entry of a spectrum. Declared before compound because compound holds
 * a pointer to it. */
typedef struct {
    float mz_value;
    float int_value;
    int peaks; /* used below as the element count of the spectra array */
} spectrum;

typedef struct {
    float precursor_mz;
    float precursor_int;
    int scan;
    float time;
    spectrum *spectra; /* Nested struct */
    int array_length;
    int mz_length;
    int int_length;
    char *mz_binary;
    char *int_binary;
    int hits; /* used below as the number of compounds to iterate over */
} compound;
I transform this structure to allow me to use qsort, after which i store it back as my own 'type'. A few lines later in the code i wish to loop over the structure but somehow the values changed without me accessing them (in between). Code snippet below:
// The transformating & qsort chunk
for (i = 0; i < compounds->hits; i++) {
spectrum test[(compounds+i)->spectra->peaks];
for (j = 0; j < (compounds+i)->spectra->peaks; j++) {
test[j] = *((compounds+i)->spectra+j);
}
qsort(test,(compounds+i)->spectra->peaks,sizeof(spectrum),compare_mz);
for (j = 0; j < (compounds+i)->spectra->peaks; j++) {
((compounds+i)->spectra+j)->mz_value = test[j].mz_value;
((compounds+i)->spectra+j)->int_value = test[j].int_value;
if ( j < 10) {
printf("%i %i\t", i, j);
printf("%f %f\n",((compounds+i)->spectra+j)->mz_value, ((compounds+i)->spectra+j)->int_value); // Here values are still correct
}
}
}
/* Summing values that are in 'mass-tolerance' of each other */
float int_total;
float mz_int_total;
for (i = 0; i < compounds->hits; i++) {
counter = 0;
printf("---\n");
for (j = 0; j < (compounds+i)->spectra->peaks; j++) {
lower_mass = ((compounds+i)->spectra+j)->mz_value - 0.05; //args->mass_tolerance;
upper_mass = ((compounds+i)->spectra+j)->mz_value + 0.05; //args->mass_tolerance;
if (j < 10) {
printf("%i %i\t", i , j);
printf("%f %f\n",((compounds+i)->spectra+j)->mz_value, ((compounds+i)->spectra+j)->int_value); // Here values are borked
}
// Rest of the code chopped off as it should be irrelevant
This code however produces the following output:
tarskin#5-PARA-11-0120:/data/programming/C/Compound_Spectra$ ./Run -f ../PeptMoiety/32757_p_01.mzML -c 1
0 0 168.858765 32489.994141
0 1 168.960327 72930.046875
0 2 169.039993 4924.188477
0 3 169.913681 85340.171875
0 4 169.932312 2406.798096
0 5 171.000320 345949.593750
0 6 171.007950 1034718.312500
0 7 171.034088 882886.562500
0 8 171.034378 58554.589844
0 9 171.056320 871035.500000
---
0 0 168.858765 32489.994141
0 1 168.960327 72930.046875
0 2 169.039993 4924.188477
0 3 169.913681 85340.171875
0 4 0.000000 0.000000
0 5 169.932312 2406.798096
0 6 171.007950 1034718.312500
0 7 0.000000 0.000000
0 8 0.000000 0.000000
0 9 0.000000 0.000000
Does anyone have any idea what could be happening ?
-- EDIT 1 --
Alk requested the code for compare_mz, which is as follows:
/* qsort comparator ordering spectrum entries by ascending mz_value.
 * Returns -1, 0 or 1; two values that compare neither less nor greater
 * (including any comparison involving NaN) yield 0. */
int
compare_mz (const void *a, const void *b)
{
    const spectrum *lhs = a;
    const spectrum *rhs = b;

    if (lhs->mz_value < rhs->mz_value) {
        return -1;
    }
    if (lhs->mz_value > rhs->mz_value) {
        return 1;
    }
    return 0;
}
The test case that i showed was for a single compound (so i = 1).
I strongly assume the memory referenced by spectrum* spectra; /* Nested struct */ had not been allocated properly or had been (partly) released.
Try running your app using valgrind.
Also (in case you are using gcc): Do you get any compiler warnings if using gcc's -Wall and/or -pedantic options?
Try the following modification and see if the app behaves differently (the stack won't be touched between the two for (j=0;..;..) loops this way):
float int_total;
float mz_int_total;
spectrum test[(compounds+i)->spectra->peaks];
for (i = 0; i < compounds->hits; i++) {
...
/* Summing values that are in 'mass-tolerance' of each other */
for (i = 0; i < compounds->hits; i++) {
...

Resources