Below is a usual way to allocate multidimensional arrays on heap, by using pointers to pointers.
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
But I decided to make a version that allocates only one chunck of memory and does element accessing by multiplication to an index value.
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])
The second version appreas to run about 20% faster when running the test code below. What is likely to be the cause, and is this a generally applicable optimization technique? Are there some libraries that define array types of this kind for optimization purpose?
I made a huge mistake in the test code before edit. The first version ran slower because its allocation and deallocation kept place inside the for-loop while the second one did it only once before entering the loop. See the comments in the test code below. After making the two tests equal, I find that the first version can run even faster, especially after optimization. The more complex operations and various copies I put into the test code, I see the first one always run a little bit faster. It seems that the multiplication for indexing is slow in my machine? I'm not sure for the cause, though.
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
int main() {
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
for (int k = 0; k < N; k += 1) {
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
}
ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
return 0;
}
Yes, using arithmetic and a single base pointer is what the compiler does internally for non-dynamically allocated 2D (n-dimensional) arrays.
You gain the most performance because there's a single calculation and indexed lookup. With the 2D array shown, there are two pointer lookups and two index calculations per array access (one index calculation and lookup to get to the right array, and then the second to access the element in the right array). With a 3D array, there'd be three index calculations and three lookups.
You also allocate less memory, and need fewer memory allocations, but those are second order effects.
Also, as WhozCraig points out in a comment but I didn't mention, you get better locality of reference and potential for smarter prefetch with a single big chunk of memory compared with multiple smaller chunks (that add up to more memory than the single big chunk).
I tested this file (sim2d.c) compiled with GCC 4.9.1 on Mac OS X 10.10.2 Yosemite.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static void *MjMalloc(size_t nbytes)
{
void *rv = malloc(nbytes);
if (rv == 0)
{
fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
exit(1);
}
return rv;
}
/* Mechanism 1 */
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])
/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
#define N_CYCLES 1000
static void one_test_cycle(void)
{
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At2(aai2, i, j) = i;
}
while ((j -= 1) >= 0) {
sum += aai2At2(aai2, i, j) - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}
static void print_now(const char *tag)
{
time_t now = time(0);
struct tm *lt = localtime(&now);
char buffer[32];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
printf("%s: %s\n", tag, buffer);
}
int main(void)
{
print_now("Started");
for (int i = 0; i < 3; i++)
one_test_cycle();
print_now("Finished");
return 0;
}
There are two slightly different ways of accessing the aai2 data. I also separated the array size (N = 2000) from the number of cycles in a single test (N_CYCLES = 1000). The timing results I got were:
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
I was getting similar patterns with (N_CYCLE = 2000), but it was taking twice as long to run — surprise, surprise.
I'm seeing a small but noticeable benefit (about 13% decrease) from the single allocation code, but no significant difference between the two timings for the 'aai2' tests.
Basic statistics:
# All data
# Count = 9
# Mean = 6.250000e+00
# Std Dev = 3.807230e-01
# aai1 only:
# Count = 3
# Mean = 6.756667e+00
# Std Dev = 4.041452e-02
# aai2 and aai3:
# Count = 6
# Mean = 5.996667e+00
# Std Dev = 1.505545e-02
# aai2 only:
# Count = 3
# Mean = 6.006667e+00
# Std Dev = 1.527525e-02
# aai3 only:
# Count = 3
# Mean = 5.986667e+00
# Std Dev = 5.773503e-03
Clearly, formally making sure the machine is otherwise unloaded, and running many more iterations of the test, and similar benchmarking steps might improve the data, but the single allocation aai2 mechanism performs better on this machine than the multi-allocation aai mechanism. (Tangential aside: why do people not put a suffix 1 on their first version when they have two or more versions of the code?)
Hardware: 17" Mac Book Pro, early-2011, 2.3 GHz Intel Core i7, 16 GiB 1333 MHz DDR3 RAM.
Related
I'm trying to solve a CodeWars problem called "Training on Binary Genetic Algorithms." There is a fitness function that is preloaded. When the program is run, a test function creates a random 35-bit string and it uses my run function which is supposed to return the same 35-bit string. This string is supposed to be found using a genetic algorithm.
Here is my code:
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
typedef double fitness_t (const char *, ...);
extern fitness_t fitness;
void generate (size_t length, char * s)
{
for (size_t i = 0; i < length; i++)
s[i] = rand() % 2 + 48;
}
double sum(size_t n, double ar[n])
{
double sum = 0;
for (int i = 0; i < n; i++)
sum += ar[i];
return sum;
}
void select (int size, char* population[size], double fitnesses[size])
{
double probabilities[size]; // normalized to 1
double r; // random number
int s1, s2;
int i;
for (i = 0; i < size; i++)
probabilities[i] = fitnesses[i] / sum(size, fitnesses);
// select first chromosome
r = (double)(rand() % 1000000) / 1000000; // generates a random float between 0 and 1
for (i = 0; i < size && r > 0; i++)
r -= probabilities[i];
s1 = i;
// select second chromosome
s2 = s1;
while (s2 == s1) // ensures the two chromosomes aren't the same
{
r = (double)(rand() % 1000000) / 1000000; // generates a random float between 0 and 1
for (i = 0; i < size && r > 0; i++)
r -= probabilities[i];
s2 = i;
}
// places these two chromosomes on top
char * temp = population[0];
population[0] = population[s1];
population[s1] = temp;
temp = population[1];
population[1] = population[s2];
population[s2] = temp;
}
void crossover (size_t n, char* s1, char* s2)
{
int r = rand() % n; // select a random bit to cross over at
char temp;
for (size_t i = r; i < n; i++) // swap every bit from bit r to bit n
{
temp = s1[i];
s1[i] = s2[i];
s2[i] = temp;
}
}
void mutate (size_t n, char* s, double p)
{
double r;
for (size_t i = 0; i < n; i++) // for each bit
{
r = (double)(rand() % 1000000) / 1000000; // random float between 0 and 1
if (r <= p) // if random number is less than probability
{
if (s[i] == '1') s[i] = '0'; // swap 0s and 1s
else s[i] = '1';
}
}
}
void bubbleSortPop(int size, char * population[size], double fitnesses[size])
{
int i, j;
char * temp_chrome;
double temp_fitness;
for (i = 0; i < size - 1; i++)
// Last i elements are already in place
for (j = 0; j < size - i - 1; j++)
if (fitnesses[j] < fitnesses[j + 1])
{
temp_chrome = population[j];
population[j] = population[j+1];
population[j+1] = temp_chrome;
temp_fitness = fitnesses[j];
fitnesses[j] = fitnesses[j+1];
fitnesses[j+1] = temp_fitness;
}
}
// this function changes the population.
// select, crossover, mutate
void evolve(fitness_t f, size_t size, int length, char * population[size],
double fitnesses[size], double p_c, double p_m)
{
char * s1, * s2;
double f1, f2;
char * temp_pop[size+2];
double temp_fit[size+2];
int i;
double r;
// moves two selected parents to the top
select(size, population, fitnesses);
// begin reproduction process; duplicate the chromosomes
s1 = population[0];
s2 = population[1];
// crossover
r = (double)(rand() % 1000000) / 1000000; // random float between 0 and 1
if (r < p_c) // probability of crossing over
crossover(length, s1, s2); // commences with crossover
// mutate
mutate(length, s1, p_m);
mutate(length, s2, p_m);
// calculate fitnesses
f1 = f(s1);
f2 = f(s2);
// merge fitneses
// copy original fitnesses into temp_fit
for (i = 0; i < size; i++)
temp_fit[i] = fitnesses[i];
// add new fitnesses
temp_fit[size] = f1;
temp_fit[size+1] = f2;
// merge children into population
// copy original population into temp_pop
for (i = 0; i < size; i++)
temp_pop[i] = population[i];
// add two children to temp_pop
temp_pop[size] = s1;
temp_pop[size+1] = s2;
// sort fitnesses and population
bubbleSortPop(size+2, temp_pop, temp_fit);
// add first 100 elements of temp_pop and fit to population and fitnesses
for (i = 0; i < size; i++)
{
population[i] = temp_pop[i];
fitnesses[i] = temp_fit[i];
}
}
char* runN (fitness_t f, int length, double p_c, double p_m, size_t iterations) {
}
char* run (fitness_t f, int length, double p_c, double p_m)
{
size_t size = 100;
char * population[size];
double fitnesses[size];
size_t i;
int r;
srand(time(0));
// initialize population array
for (i = 0; i < size; i++)
population[i] = malloc((length+1) * sizeof(char));
// generate original population
for (i = 0; i < size; i++)
{
generate(length, population[i]);
fitnesses[i] = f(population[i]);
printf("[%2d] %s %lf\n", i, population[i], fitnesses[i]);
}
// evolve the population
for (i = 0; i < 10; i++)
evolve(f, size, length, population, fitnesses, p_c, p_m);
// print result
printf("\nAFTER EVOLUTION\n");
for (i = 0; i < size; i++) // generates original population
printf("[%2d] %s %lf\n", i, population[i], fitnesses[i]);
// store best chromosome and free memory
char ret[length+1];
strcpy(ret, population[0]);
for (i = 0; i < size; i++)
free(population[i]);
return ret;
}
The issue is when I run my code, it nearly always comes out with a segfault at some point while printing the contents of population and fitness.
At least these problems:
Attempting to print a non-string with "%s"
Code uses "%s" and passes population[i] as if it points to a string. population[i] does not point to a string as it does not certainly have a null character. Result undefined behavior (UB). Perhaps attempting to access beyond allocated memory.
// Undefined behavior: population[i] not a string
printf("[%2d] %s %lf\n", i, population[i], fitnesses[i]);
Set the null character.
generate(length, population[i]);
population[i][length] = '\0'; // Add this here or equivalent in `generate()`.
Many compiler warnings (20+)
Enable all compiler warnings and fix those.
I found the solution. It was all the places where I tried to copy a string by making a string pointer and assigning it the same address as the pointer I wanted to copy. For example, in 'select', when I tried to move the two strings to the top, I did
char * temp = population[0];
population[0] = population[s1];
population[s1] = temp;
temp = population[1];
population[1] = population[s2];
population[s2] = temp;
I changed this to using strcpy(). I made the same mistake in 'evolve' where I tried to duplicate the chromosomes by just copying their address into variables, rather than the strings themselves:
char * s1, * s2;
// begin reproduction process; duplicate the chromosomes
s1 = population[0];
s2 = population[1];
I changed it to this:
char s1[length+1], s2[length+1];
strcpy(s1, population[0]);
strcpy(s2, population[1]);
After I made this change the segfault went away. Thanks for all your answers.
I'll get right into my problem. So basically what I want to do is to generate an array of random numbers of different amounts. So one with 10,000, 50,000, 100,000, 500,000, 600,000, etc. Then I would sort them using quicksort and print the sorted array to the screen. Additionally, the time taken for it to run would be recorded and printed as well. The only part I'm having problems with however is generating the array. For some reason generating past 500,000 random numbers does not work and returns this:
Process exited after 2.112 seconds with return value 3221225725
Press any key to continue . . .
([1]: https://i.stack.imgur.com/m83el.png)
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void randNums(int array[], int range) {
int i, num;
for (i = 0; i < range; i++) {
num = rand() % range;
array[i] = num;
}
}
//prints elements of given array
void display(int array[], int size) {
int i;
for (i = 0; i < size; i++) {
printf("#%d. %d\n", i, array[i]);
}
}
//displays time taken for sorting algorithm to run
void timeTaken(char sortingAlgo[], int size, clock_t start, clock_t end) {
double seconds = end - start;
double milliseconds = seconds / 1000;
printf("Time taken for %s Sort to sort %d numbers was %f milliseconds or %f seconds",
sortingAlgo, size, milliseconds, seconds);
}
//quick sort
void quickSort(int array[], int first, int last) {
int i, j, pivot, temp;
if (first < last) {
pivot = first;
i = first;
j = last;
while (i < j) {
while (array[i] <= array[pivot] && i < last)
i++;
while (array[j] > array[pivot])
j--;
if (i < j) {
temp = array[i];
array[i] = array[j];
array[j] = temp;
}
}
temp = array[pivot];
array[pivot] = array[j];
array[j] = temp;
quickSort(array, first, j - 1);
quickSort(array, j + 1, last);
}
}
int main() {
int size = 600000;
int myArray[size];
time_t end, start;
int first, last;
randNums(myArray, size);
first = myArray[0];
last = sizeof(myArray) / sizeof(myArray[0]);
time(&start);
quickSort(myArray, first, last);
time(&end);
display(myArray, size);
timeTaken("Quick", size, start, end);
return 0;
}
Any help would be greatly appreciated, Thank you!
There's a lot of little bugs in this code that aren't too difficult to resolve. I'll try and break it down here in this refactoring and cleanup:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void randNums(int* array, int range) {
// Declare iterator variables like `i` within the scope of the iterator.
for (int i = 0; i < range; i++) {
// No need for a single-use variable here, just assign directly.
array[i] = rand () % range;
}
}
void display(int* array, int size) {
// for is not a function, it's a control flow mechanism, so
// it is expressed as `for (...)` with a space. `for()` implies
// it is a function, which it isn't.
for (int i = 0; i < size; i++) {
printf("#%d. %d\n", i, array[i]);
}
}
void timeTaken(char* sortingAlgo, int size, clock_t start, clock_t end) {
// Time calculation here needs to account for the fact that clock_t
// does not use seconds as units, it must be converted
// https://en.cppreference.com/w/c/chrono/clock_t
printf(
"Time taken for %s Sort to sort %d numbers was %.6f seconds",
sortingAlgo,
size,
((double) (end - start)) / CLOCKS_PER_SEC
);
}
void quickSort(int* array, int first, int last) {
// Establish a guard condition. Rest of the function is no longer
// nested in a control flow structure, so it simplifies the code.
if (first >= last) {
return;
}
int pivot = first;
int i = first;
int j = last;
// Use `while (...)` as it's also a control flow structure.
while (i < j) {
// Adding space around operators improves clarity considerably. Unspaced
// elements like `a->b()` are supposed to stand out and not be confused
// with visually similar `a>>b()` which does something very different.
while (array[i] <= array[pivot] && i < last) {
i++;
}
// Use surrounding braces on all blocks, even single-line ones, as this
// can avoid a whole class of errors caused by flawed assumptions.
// while (...) { ... }
while (array[j] > array[pivot]) {
j--;
}
if (i < j) {
int temp = array[i];
array[i] = array[j];
array[j] = temp;
}
}
int temp = array[pivot];
array[pivot] = array[j];
array[j] = temp;
quickSort(array, first, j - 1);
quickSort(array, j + 1, last);
}
int main(int argc, char** argv) {
int size = 600000;
// If an argument was given...
if (argc > 1) {
// ...use that as the size parameter instead.
size = atol(argv[1]);
}
// Allocate an array of sufficient size
int* numbers = calloc(size, sizeof(int));
randNums(numbers, size);
// time_t has at best second-level precision, it's very inaccurate.
// Use clock_t which gives far more fidelity.
clock_t start = clock();
// This function takes *offsets*, not values.
quickSort(numbers, 0, size - 1);
clock_t end = clock();
display(numbers, size);
timeTaken("Quick", size, start, end);
free(numbers);
return 0;
}
The number one bug here was calling quickSort() incorrectly:
// Represents first *value* in the array
first = myArray[0]; // Should be: 0
// Rough calculation of the size of the array, but this is off by one
last = sizeof(myArray)/sizeof(myArray[0]); // Should be: size - 1
quickSort(myArray, first, last);
I'm a French student and trying to calculate the execution time of the Merge Sort algorithm for different size of array.
I also want to write the different execution time in a .csv file. But when my program tries to sort an array with 1 million elements the process returns -1073741571 (0xC00000FD) in Code::Blocks. So if you could point me to a way to find a solution I would be very grateful!
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
void genTab(int *tab, int n) {
int i;
for (i = 0; i < n; i++) {
tab[i] = rand() % 100;
}
}
void fusion(int *tab, int deb, int mid, int fin) {
int i = deb;
int j = mid + 1;
int k = deb;
int temp[fin + 1];
while ((i <= mid) && (j <= fin)) {
if (tab[i] <= tab[j]) {
temp[k] = tab[i];
i++;
} else {
temp[k] = tab[j];
j++;
}
k++;
}
while (i <= mid) {
temp[k] = tab[i];
i++;
k++;
}
while (j <= fin) {
temp[k] = tab[j];
k++;
j++;
}
for (i = deb; i <= fin; i++) {
tab[i] = temp[i];
}
}
void triFusion(int *tab, int i, int j) {
if (i < j) {
triFusion(tab, i, (int)((i + j) / 2));
triFusion(tab, (int)((i + j) / 2 + 1), j);
fusion(tab, i, (int)((i + j) / 2), j);
}
}
void reset(int *tab1, int *tab2, int n) {
for (int i = 0; i < n; i++) {
tab2[i] = tab1[i];
}
}
int main() {
srand(time(NULL));
clock_t start, end;
int nbrTest[15] = {
1000, 5000, 10000, 50000, 80000, 100000, 120000, 140000,
150000, 180000, 200000, 250000, 300000, 450000, 1000000
};
FILE *fp;
char *tpsExecution = "exeTime.csv";
fp = fopen(tpsExecution, "w");
fprintf(fp, "Array Size; Merge Time");
for (int i = 0; i < 15; i++) {
int n = nbrTest[i];
printf("Calculating time for an array of %d \n", n);
int *tab = malloc(sizeof(int) * n);
genTab(tab, n);
int *copie = malloc(sizeof(int) * n);
reset(tab, copie, n);
start = clock();
triFusion(tab, 0, n - 1);
end = clock();
float tpsFusion = (float)(end - start) / CLOCKS_PER_SEC;
reset(tab, copie, n);
printf("writing in the file\n");
fprintf(fp, "\n%d;%f", n, tpsFusion);
free(tab);
free(copie);
}
fclose(fp);
return 0;
}
int temp[fin+1]; may exceed the space limit for the stack. You should allocate it with malloc instead, and free it with free.
If you want to exclude malloc and free from the timed code, the allocation could be performed outside the timed code and passed in as work space.
(Note: posted after the answer from #Eric Postpischil).
The function
void fusion(int * tab, int deb, int mid, int fin)
Has the line
int temp[fin+1];
and the value of fin comes through another function from the number of elements n to be sorted
triFusion(tab, 0, n-1);
and as an automatic variable, breaks the stack when n is large.
I suggest replacing the line with
int *temp = malloc((fin+1) * sizeof *temp);
if(temp == NULL) {
puts("malloc");
exit(1);
}
// ...
free(temp);
fusion() is always allocating the full size of the array for temp, even when only a small fraction of temp is being used. You could change this to:
int k = 0;
...
int temp[fin+1-deb];
...
tab[i]=temp[i-deb];
still this will exceed stack space if n is large. So as suggested in the other answers:
int k = 0;
...
int *temp = malloc((fin+1-deb)*sizeof(int));
...
tab[i]=temp[i-deb];
...
free(temp)
or better still, do a one time allocation of a second array in main or in a "helper" function, the include a pointer to the second array in the merge sort functions.
I can get the random numbers into an array but I can't figure out how to check to make sure that they aren't repeating. I print out the code but there are no numbers in the array (prints out nothing).
//puts random numbers into an array
i = 0, j = 0;
srand(time(NULL));
for (i = 0; i < arrSize; i++)
{
randArr[i] = randNums(1,50);
}
i = 0;
for(i = 0; i < arrSize; i++)
{
printf("%d ", randArr[i]);
}
printf("\n\n");
//checks to make sure there are no duplicates
i = 0, j = 0, k = 0, temp = 0;
for (i = 0; i < arrSize; i++)
{
for (j = 1; j <= arrSize;)
{
if (randArr[j] == randArr[i])
{
for (k = j; k <= arrSize; k++)
{
temp = randNums(1,50);
randArr[k + 1] = temp;
}
arrSize--;
}
else
j++;
}
}
//generates random numbers between the inputed max and min
int randNums(int min, int max)
{
int result = 0, low = 0, high = 0;
if (min < max)
{
low = min;
high = max + 1;
}
else
{
low = max + 1;
high = min;
}
result = (rand() % (high - low)) + low;
return (result);
}
Beware! There are many different solutions to this problem and they all have one or another downside. If I was to quickly implement it, I would go for something like this (without too much C-magic going on):
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE (30)
#define RAND_MIN (1)
#define RAND_MAX (50)
static int randNums(int min, int max) {
// ...
}
int main(void) {
(void) srand(time(NULL));
int arr[SIZE];
int used = 0;
while (used < SIZE) {
int num = randNums(RAND_MIN, RAND_MAX);
bool exists = false;
for (int i = 0; i < used; ++i) {
if (arr[i] == num)
exists = true;
}
if (exists == false)
arr[used++] = num;
}
for (int i = 0; i < SIZE; ++i)
(void) printf("%d\n", arr[i]);
return EXIT_SUCCESS;
}
I hope it helps a bit :)
Like this answer, you can do rejection sampling, but the uniform distribution of a fixed number of samples is perfect for a very simple hash set. (Though the asymptotic runtime might be irrelevant for n=6.)
#include <stdlib.h> /* (s)rand */
#include <stdio.h> /* printf */
#include <time.h> /* clock */
#include <assert.h> /* assert */
/* Double-pointers are confusing. */
struct Reference { int *ref; };
/* Simple fixed hash set. */
static struct Reference bins[256];
static int nums[6];
static const size_t no_bins = sizeof bins / sizeof *bins,
no_nums = sizeof nums / sizeof *nums;
static size_t count_num;
/* Uniformly distributed numbers are great for hashing, but possibly clump
together under linear probing. */
static size_t hash(const int n) { return ((size_t)n * 21) % no_bins; }
/* Linear probing. */
static struct Reference *probe(const int n) {
size_t bin_index;
struct Reference *bin;
assert(sizeof bins > sizeof nums);
for(bin_index = hash(n); bin = bins + bin_index,
bin->ref && *bin->ref != n; bin_index = (bin_index + 1) % no_bins);
return bin;
}
/* Return whether it's a new value. */
static int put_in_set(const int n) {
struct Reference *bin = probe(n);
int *num;
assert(count_num < no_nums);
if(bin->ref) return 0; /* Already in hash. */
num = nums + count_num++;
*num = n;
bin->ref = num;
return 1;
}
/* http://c-faq.com/lib/randrange.html */
static int rand_range(const unsigned n) {
unsigned int x = (RAND_MAX + 1u) / n;
unsigned int y = x * n;
unsigned int r;
assert(n > 0);
do {
r = rand();
} while(r >= y);
return r / x;
}
/* Generates random numbers between the inputed max and min without
repetition; [min, max] inclusive. */
static int unique_uniform(const int min, const int max) {
int n;
assert(min <= max && (size_t)(max - min) >= count_num);
do { n = rand_range(max - min + 1) + min; } while(!put_in_set(n));
return n;
}
int main(void) {
int n = 6;
srand((int)clock()), rand(); /* My computer always picks the same first? */
while(n--) { printf("%d\n", unique_uniform(1, 50)); }
return EXIT_SUCCESS;
}
However, if the numbers are densely packed, (eg, unique_uniform(1, 6),) it's going to reject a lot of numbers. Another solution is to take a Poisson distributed numbers as a running sum, (recurrence T(n+1)=T(n)+\mu_{n+1},) where the expected value is the range of numbers divided by the total samples, then take a random permutation.
I'm writing a CUDA kernel and each thread has to complete the following task: suppose I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory, each thread has to find the array index i such that a[i] ≤ threadIdx.x and a[i + 1] > threadIdx.x.
A naive solution could be:
for (i = 0; i < n - 1; i++)
if (a[i + 1] > threadIdx.x) break;
but I suppose this is not the optimal way to do it... can anyone suggest anything better?
Like Robert, I was thinking that a binary search has got to be faster that a naïve loop -- the upper bound of operation count for a binary search is O(log(n)), compared to O(N) for the loop.
My extremely simple implementation:
#include <iostream>
#include <climits>
#include <assert.h>
__device__ __host__
int midpoint(int a, int b)
{
return a + (b-a)/2;
}
__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
int low = (A[i] <= val);
int high = (A[i+1] > val);
if (low && high) {
return 0;
} else if (low) {
return -1;
} else {
return 1;
}
}
__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
while (imax >= imin) {
int imid = midpoint(imin, imax);
int e = eval(A, imid, val, imin, imax);
if(e == 0) {
return imid;
} else if (e < 0) {
imin = imid;
} else {
imax = imid;
}
}
return -1;
}
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
int res = -1;
for(int i=imin; i<(imax-1); i++) {
if (A[i+1] > val) {
res = i;
break;
}
}
return res;
}
template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
extern __shared__ int buff[];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int val = INT_MAX;
if (tid < Nin) val = source[threadIdx.x];
buff[threadIdx.x] = val;
__syncthreads();
int res;
switch(version) {
case 0:
res = binary_search(buff, threadIdx.x, 0, blockDim.x);
break;
case 1:
res = linear_search(buff, threadIdx.x, 0, blockDim.x);
break;
}
if (tid < Nout) result[tid] = res;
}
int main(void)
{
const int inputLength = 128000;
const int isize = inputLength * sizeof(int);
const int outputLength = 256;
const int osize = outputLength * sizeof(int);
int * hostInput = new int[inputLength];
int * hostOutput = new int[outputLength];
int * deviceInput;
int * deviceOutput;
for(int i=0; i<inputLength; i++) {
hostInput[i] = -200 + 5*i;
}
cudaMalloc((void**)&deviceInput, isize);
cudaMalloc((void**)&deviceOutput, osize);
cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);
dim3 DimBlock(256, 1, 1);
dim3 DimGrid(1, 1, 1);
DimGrid.x = (outputLength / DimBlock.x) +
((outputLength % DimBlock.x > 0) ? 1 : 0);
size_t shmsz = DimBlock.x * sizeof(int);
for(int i=0; i<5; i++) {
search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
for(int i=0; i<5; i++) {
search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);
for(int i=0; i<outputLength; i++) {
int idx = hostOutput[i];
int tidx = i % DimBlock.x;
assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
}
cudaDeviceReset();
return 0;
}
gave about a five times speed up compared to the loop:
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
60.11 157.85us 1 157.85us 157.85us 157.85us [CUDA memcpy HtoD]
32.58 85.55us 5 17.11us 16.63us 19.04us void search<int=1>(int*, int*, int, int)
6.52 17.13us 5 3.42us 3.35us 3.73us void search<int=0>(int*, int*, int, int)
0.79 2.08us 1 2.08us 2.08us 2.08us [CUDA memcpy DtoH]
I'm sure that someoneclever could do a lot better than that. But perhaps this gives you at least a few ideas.
can anyone suggest anything better?
A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
unsigned lower = 0;
unsigned upper = len_a;
unsigned midpt;
while (lower < upper){
midpt = (lower + upper)>>1;
if (a[midpt] < key) lower = midpt +1;
else upper = midpt;
}
*idx = lower;
return;
}
__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx){
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
unsigned sp_a;
int val = idx+1;
bsearch_range(a, val, len_a, &sp_a);
my_idx[idx] = ((val-1) < a[sp_a]) ? sp_a:-1;
}
This is coded in browser, not tested. It's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x device).
This is actually searching on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1) You can change val to be anything you like.
You could also add an appropriate thread check, if the number of threads you intend to launch is greater than the length of your my_idx result vector.
I imagine there is a more clever approach that may use something akin to prefix sums.
This is the best algorithm so far. It's called: LPW Indexed Search
__global__ void find_position_lpw(int *a, int n)
{
int idx = threadIdx.x;
__shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];
aux[idx] = 0;
if (idx < n)
atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates
__syncthreads();
int tmp;
for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
{
if( idx >= j ) tmp = aux[idx - j];
__syncthreads();
if( idx >= j ) aux[idx] += tmp;
__syncthreads();
}
// result in "i"
int i = aux[idx] - 1;
// use "i" here...
// ...
}