I have to implement a clustering algorithm, after loading the dataset, I go to check for each point in which cluster it can be inserted. If points cannot be inserted into any cluster, I have to move them from the dataset and insert them into the retained set. Since I do not know a priori the size of the retained set, I allocate an area of memory initially equal to 0 and that is incremented by the bytes size needed to hold a point each time I have to insert a point into the retained set.
It works for some iterations (4 to be precise) and then stops
This is what I try:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <malloc/malloc.h>
#include <float.h>
#include <stdbool.h>
double **load_dataset(char *filename, int d, int chunk_size);
int assign_point_to_cluster(double **clusters, int **set, double **retained_set, double *point,double *standard_deviation,
int d, int k, int *chunk_size, int p_in_r);
int find_candidate_cluster(double **clusters, double *point, double *std_deviation, int d, int k);
double mean(const double *vector, int d);
double mahalanobis(const double *cluster, const double *point, const double *standard_deviation, int d);
void compute_std_dev(const double *cluster, double *standard_deviation_vector, int d);
int inizialize_cluster(double **dataset, int d, int k, double **clusters, int **set, int chunk_size, bool retain);
double compute_sum_of_euclidean_distance(double **center_points, double *point, int n, int d);
void feature_scaling(double **dataset, int d, int chunk_size);
int main(int argc, char **argv) {
if(argc < 6){
printf("Error parameters! Usage: ./main <input_file> <total number of point> <chunk_size> <points_dimension> <cluster_number>");
return 0;
}
char* filename = argv[1];
int d = atoi(argv[4]), k = atoi(argv[5]), chunk_size = atoi(argv[3]), total = atoi(argv[2]);
int k_compressed = 0;
printf("Path: %s\n", filename);
printf("Number of point in set %i\n", total);
printf("chunk size: %i\n", chunk_size);
printf("Dimension of points: %i\n", d);
printf("Number of cluster: %i\n", k);
printf("----------------\n");
double **clusters = malloc(k * sizeof(double *));
double *standard_deviation = malloc(d * sizeof(double));
int **discard_set = malloc(k * sizeof(int *));
double **retained_set = malloc(1);
double * cohesion = malloc(2 * sizeof(double));
double* radius = NULL;
double **mini_cluster = NULL;
double* temp_cluster = NULL;
int** compressed_set = NULL;
double** mini_cluster_temp = NULL;
int p_in_r = 0;
double **dataset = load_dataset(filename, d, chunk_size);
/**
* Rescaling of variables
*/
//feature_scaling(dataset, d, chunk_size); TODO: Something is wrong
/**
* Cluster initialization
*/
if(!clusters || !discard_set || !standard_deviation || !retained_set || !cohesion){
printf("Something went wrong in main(), memory allocation failed!");
exit(1);
}
chunk_size = inizialize_cluster(dataset, d, k, clusters, discard_set, chunk_size, false);
/**
* At this point we are only interested in keeping a "summary" of the data that will be placed within a cluster.
* In dataset we put the id of the points that are added to a cluster, while cluster contains the statistics
* useful to perform clustering
**/
/**
* We start processing the points the (CHUNK - 1)eighth point in the dataset is assigned to the cluster if the
* mahalanobis distance is less than a threshold and if it is the closest.
* Clusetering dataset -> discard_set
*/
while (chunk_size > 0) {
p_in_r += assign_point_to_cluster(clusters, discard_set, retained_set, dataset[chunk_size - 1], standard_deviation, d, k, &chunk_size, p_in_r);
/**
* always working on the last element of the dataset, it is not necessary to move the list of points,
* just delete the last element
*/
free(dataset[chunk_size]);
dataset[chunk_size] = NULL;
dataset = realloc(dataset, chunk_size * sizeof(double *));
if(dataset == NULL){
printf("Something went wrong in main(), memory allocation failed!");
exit(1);
}
}
free(dataset);
dataset = NULL;
return 0;
}
int inizialize_cluster(double **dataset, int d, int k, double ** clusters, int** set, int chunk_size, bool retain) {
double ** center_point = malloc(k * sizeof(double *));
for (int i = 0; i < k; i++) {
center_point[i] = malloc((d + 1) * sizeof(double));
if(center_point[i] == NULL){
printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
exit(1);
}
}
/**
* The point representing the center of the first cluster is chosen as the first point in the dataset
**/
memcpy(*center_point, *dataset, (d + 1) * sizeof(double));
/**
* The first point can be removed from the dataset or
* in case we are working on the retained set, move it to the end.
**/
chunk_size--;
if(retain){
double* temp = malloc(sizeof(double *));
memcpy(temp, dataset, sizeof(double *));
memcpy(dataset, dataset+1, chunk_size * sizeof(double *));
memcpy(dataset+chunk_size-1, temp, sizeof(double *));
/*for (int i = 0; i < CHUNK; ++i) {
printf("id[%i]: %f", dataset[i][0]);
}*/
}
else{
free(dataset[0]);
memcpy(dataset, dataset+1, chunk_size * sizeof(double *));
dataset[chunk_size] = NULL;
dataset = realloc(dataset, chunk_size * sizeof(double *));
if(dataset == NULL){
printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
exit(1);
}
}
/**
* The centers of the next clusters are chosen as those that are furthest apart
**/
double max;
int pos;
double distance;
for (int i = 1; i < k; i++) {
/**
* I choose the point that maximizes the sum of the distances from the centerpieces
*/
max = -1;
for (int j = 0; j < chunk_size; j++){
distance = compute_sum_of_euclidean_distance(center_point, dataset[j], i, d);
if (distance > max) {
pos = j;
max = distance;
}
}
memcpy(*(center_point + i), *(dataset + pos), (d + 1) * sizeof(double));
/**
* When a point is chosen as the center of a cluster, I remove it from the dataset
**/
chunk_size--;
if(retain){
double** temp = malloc(sizeof(double *));
memcpy(temp, dataset + pos, sizeof(double *));
memcpy(dataset + pos, dataset + pos + 1, (chunk_size - pos) * sizeof(double *));
memcpy(dataset + chunk_size - 1, temp, sizeof(double *));
}
else{
free(dataset[pos]);
memcpy(dataset + pos, dataset + pos + 1, (chunk_size - pos) * sizeof(double *));
dataset = realloc(dataset, chunk_size * sizeof(double *));
if(dataset == NULL){
printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
exit(1);
}
}
}
/**
* When I have found k points that can be used as the initial centres of the k clusters,
* I summarize them (calculate cluster statistics) and enter them into the discard set.
*/
for (int i = 0; i < k; i++) {
/**
* Cluster and discard set initialization
*/
clusters[i] = malloc(((2 * d) + 1) * sizeof(double));
set[i] = malloc(sizeof(int ));
if(clusters[i] == NULL || set[i] == NULL){
printf("Something went wrong in in inizialize_cluster(), memory allocation failed!");
exit(1);
}
clusters[i][0]=1;
set[i][0] = (int) center_point[i][0];
for (int j = 1; j < d + 1; j++) {
clusters[i][j] = center_point[i][j];
clusters[i][j + d] = pow(center_point[i][j], 2);
}
free(center_point[i]);
center_point[i] = NULL;
}
free(center_point);
center_point = NULL;
return chunk_size;
}
double **load_dataset(char *filename, int d, int chunk_size) {
double **dataset = malloc(chunk_size * sizeof(double *));
if(dataset == NULL){
printf("Something went wrong in load_dataset(), memory allocation failed!");
exit(1);
}
for (int i = 0; i < chunk_size; i++) {
dataset[i] = malloc((d + 1) * sizeof(double));
if(dataset[i] == NULL){
printf("Something went wrong in load_dataset(), memory allocation failed!");
exit(1);
}
}
FILE *file;
file=fopen(filename, "r");
if (file == NULL){
printf("Something went wrong in load_dataset(), file opening failed! (row 162)");
exit(1);
}
char *line = NULL, *token;
size_t len = 0;
int i = 0;
int j = 0;
int first_line = 0;
while ((getline(&line, &len, file)) != -1 && i < chunk_size) {
if(first_line != 0) {
while ((token = strsep(&line, ",")) != NULL) {
dataset[i][j] = atof(token);
j++;
}
j = 0;
i++;
} else{
first_line = 1;
}
}
fclose(file);
return dataset;
}
int assign_point_to_cluster(double **clusters, int **set, double **retained_set, double *point,double *standard_deviation,
int d, int k, int *chunk_size, int p_in_r) {
/**
* For each point I assess which cluster it can go into
*/
int candidate;
candidate = find_candidate_cluster(clusters, point, standard_deviation, d, k);
/**
* After identifying the candidate cluster (if there is one), I add the point to the discard set and update the
* cluster statistics otherwise I go ahead and put the point in the retained set
*/
(*chunk_size)--;
if(candidate > -1){
/**
* I add the point to the discard/compressed set
*/
clusters[candidate][0]++;
set[candidate] = realloc(set[candidate], (unsigned long)clusters[candidate][0] * sizeof(int));
if(set[candidate] == NULL){
printf("Something went wrong in in assign_point_to_cluster(), memory allocation failed!");
exit(1);
}
set[candidate][(int) clusters[candidate][0] - 1] = (int) point[0];
/**
* I update the cluster statistics
*/
for (int i = 1; i < d + 1; i++) {
clusters[candidate][i] += point[i];
clusters[candidate][i + d] += pow(point[i], 2);
}
}
else if(retained_set){
/**
* I insert the point in the retained set
*/
p_in_r++;
retained_set = realloc(retained_set, p_in_r * sizeof(double *));
retained_set[p_in_r - 1] = malloc((d + 1) * sizeof(double));
memcpy(*(retained_set + p_in_r - 1), point, (d + 1) * sizeof(double ));
return 1;
}
return 0;
}
int find_candidate_cluster(double **clusters, double *point, double *std_deviation, int d, int k) {
double actual = DBL_MAX;
int candidate = -1;
double threshold;
double distance;
for (int j = 0; j < k; j++) {
/**
* Calculation of varainza,threshold and mahalanobis' distance
*/
compute_std_dev(clusters[j], std_deviation, d);
//TODO: Would it be okay as a threshold? An alternative could be the module?
threshold = 3.5 * mean(std_deviation, d);
distance = mahalanobis(clusters[j], point, std_deviation, d);
if(distance < threshold && distance < actual){
/**
* the cluster is a candidate for the point
*/
candidate = j;
actual = distance;
}
}
return candidate;
}
double mean(const double *vector, int d) {
double sum = 0;
for (int i = 0; i < d; ++i) {
sum += vector[i];
}
return sum/d;
}
void compute_std_dev(const double *cluster, double *standard_deviation_vector, int d) {
double sigma;
/**
* Vector of the variances of the components of the cluster elements
*/
for (int i = 0; i < d; i++) {
sigma = sqrt(fabs(cluster[i + 1 + d]/cluster[0] - pow(cluster[i + 1]/cluster[0], 2)));
if( sigma == 0)
sigma = 1;
standard_deviation_vector[i] = sigma;
}
}
double mahalanobis(const double *cluster, const double *point, const double *standard_deviation, int d) {
double distance=0;
for (int i = 1; i < d; ++i) {
distance += pow((point[i] - cluster[i]) / standard_deviation[i - 1], 2);
}
return sqrt(distance)/d; //TODO: can it be okay? I thought so since the threshold is the average of the st.dev.
}
double compute_sum_of_euclidean_distance(double **center_points, double *point, int n, int d) {
double component_sum = 0;
double final_sum = 0;
for (int i = 0; i < n; i++) {
for (int j = 1; j < d + 1; j++){
component_sum += pow(center_points[i][j] - point[j], 2);
}
final_sum += sqrt(component_sum);
}
return final_sum;
}
void feature_scaling(double **dataset, int d, int chunk_size) {
/**
* We perform a Z-score Normalization
**/
double mean;
double sigma;
double sum;
double sumQ;
double variance;
/**
* We calculate mean and variance for each column
**/
for (int i = 1; i < d + 1; i++) {
sum = 0;
for (int j = 0; j < chunk_size; j++) {
sum += dataset[j][i];
}
mean = sum / chunk_size;
sumQ = 0;
for (int j = 0; j < chunk_size; j++) {
sumQ += pow((dataset[j][i] - mean), 2);
}
variance = sumQ / chunk_size;
sigma = sqrt(variance);
if( sigma == 0)
sigma = 1;
/**
* Feature scaling: (x-x_med)/sigma
**/
for (int j = 0; j < chunk_size; j++) {
dataset[j][i] = (dataset[j][i] - mean) / sigma;
}
}
}
The command I use when run is:
./main "db.csv" 100 35 4 3
It works if the 3rd argument is less then 34
The file db.csv contains:
CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),cluster
1,0,19,15,39,4
2,0,21,47,81,3
3,1,20,56,6,4
4,1,23,16,77,3
5,1,31,17,40,4
6,1,22,17,76,3
7,1,35,18,6,4
8,1,23,18,94,3
9,0,64,19,3,4
10,1,30,19,72,3
11,0,67,19,14,4
12,1,35,19,99,3
13,1,58,20,15,4
14,1,24,20,77,3
15,0,37,20,13,4
16,0,22,20,79,3
17,1,35,21,35,4
18,0,20,21,66,3
19,0,52,23,29,4
20,1,35,23,98,3
21,0,35,24,35,4
22,0,25,24,73,3
23,1,46,25,5,4
24,0,31,25,73,3
25,1,54,28,14,4
26,0,29,28,82,3
27,1,45,28,32,4
28,0,35,28,61,3
29,1,40,29,31,4
30,1,23,29,87,3
31,0,60,30,4,4
32,1,21,30,73,3
33,0,53,33,4,4
34,0,18,33,92,3
35,1,49,33,14,4
36,1,21,33,81,3
37,1,42,34,17,4
38,1,30,34,73,3
39,1,36,37,26,4
40,1,20,37,75,3
41,1,65,38,35,0
42,0,24,38,92,3
43,0,48,39,36,0
44,1,31,39,61,5
45,1,49,39,28,4
46,1,24,39,65,3
47,1,50,40,55,0
48,1,27,40,47,5
49,1,29,40,42,5
50,1,31,40,42,5
51,1,49,42,52,0
52,0,33,42,60,5
53,1,31,43,54,5
54,0,59,43,60,0
55,1,50,43,45,0
56,0,47,43,41,0
57,1,51,44,50,0
58,0,69,44,46,0
59,1,27,46,51,5
60,0,53,46,46,0
61,0,70,46,56,0
62,0,19,46,55,5
63,1,67,47,52,0
64,1,54,47,59,0
65,0,63,48,51,0
66,0,18,48,59,5
67,1,43,48,50,0
68,1,68,48,48,0
69,0,19,48,59,5
70,1,32,48,47,5
71,0,70,49,55,0
72,1,47,49,42,0
73,1,60,50,49,0
74,1,60,50,56,0
75,0,59,54,47,0
76,0,26,54,54,5
77,1,45,54,53,0
78,0,40,54,48,5
79,1,23,54,52,5
80,1,49,54,42,0
81,0,57,54,51,0
82,0,38,54,55,5
83,0,67,54,41,0
84,1,46,54,44,0
85,1,21,54,57,5
86,0,48,54,46,0
87,1,55,57,58,0
88,1,22,57,55,5
89,1,34,58,60,5
90,1,50,58,46,0
91,1,68,59,55,0
92,0,18,59,41,5
93,0,48,60,49,0
94,1,40,60,40,5
95,1,32,60,42,5
96,0,24,60,52,5
97,1,47,60,47,0
98,1,27,60,50,5
99,0,48,61,42,0
100,0,20,61,49,5
101,1,23,62,41,5
102,1,49,62,48,0
103,0,67,62,59,0
104,0,26,62,55,5
105,0,49,62,56,0
106,1,21,62,42,5
107,1,66,63,50,0
108,0,54,63,46,0
109,0,68,63,43,0
110,0,66,63,48,0
111,0,65,63,52,0
112,1,19,63,54,5
113,1,38,64,42,5
114,0,19,64,46,5
115,1,18,65,48,5
116,1,19,65,50,5
117,1,63,65,43,0
118,1,49,65,59,0
119,1,51,67,43,0
120,1,50,67,57,0
121,0,27,67,56,5
122,1,38,67,40,5
123,1,40,69,58,5
124,0,39,69,91,1
125,1,23,70,29,5
126,1,31,70,77,1
127,0,43,71,35,2
128,0,40,71,95,1
129,0,59,71,11,2
130,0,38,71,75,1
131,0,47,71,9,2
132,0,39,71,75,1
133,1,25,72,34,5
134,1,31,72,71,1
135,0,20,73,5,2
136,1,29,73,88,1
137,1,44,73,7,2
138,0,32,73,73,1
139,0,19,74,10,2
140,1,35,74,72,1
141,1,57,75,5,2
142,0,32,75,93,1
143,1,28,76,40,5
144,1,32,76,87,1
145,0,25,77,12,2
146,0,28,77,97,1
147,0,48,77,36,2
148,1,32,77,74,1
149,1,34,78,22,2
150,0,34,78,90,1
151,0,43,78,17,2
152,0,39,78,88,1
153,1,44,78,20,2
154,1,38,78,76,1
155,1,47,78,16,2
156,1,27,78,89,1
157,0,37,78,1,2
158,1,30,78,78,1
159,0,34,78,1,2
160,1,30,78,73,1
161,1,56,79,35,2
162,1,29,79,83,1
163,0,19,81,5,2
164,1,31,81,93,1
165,0,50,85,26,2
166,1,36,85,75,1
167,0,42,86,20,2
168,1,33,86,95,1
169,1,36,87,27,2
170,0,32,87,63,1
171,0,40,87,13,2
172,0,28,87,75,1
173,0,36,87,10,2
174,0,36,87,92,1
175,1,52,88,13,2
176,1,30,88,86,1
177,0,58,88,15,2
178,0,27,88,69,1
179,0,59,93,14,2
180,0,35,93,90,1
181,1,37,97,32,2
182,1,32,97,86,1
183,0,46,98,15,2
184,1,29,98,88,1
185,1,41,99,39,2
186,0,30,99,97,1
187,1,54,101,24,2
188,0,28,101,68,1
189,1,41,103,17,2
190,1,36,103,85,1
191,1,34,103,23,2
192,1,32,103,69,1
193,0,33,113,8,2
194,1,38,113,91,1
195,1,47,120,16,2
196,1,35,120,79,1
197,1,45,126,28,2
198,0,32,126,74,1
199,0,32,137,18,2
200,0,30,137,83,1
download it from mega: db.csv.
Originally found on Kaggle but I made some modifications.
Edit: I included the whole code
Edit: I alse get this error trying to see what is in retained_set : read memory from 0x3d2fdfcb8030 failed (0 of 8 bytes read)
Edit: I translate the comment in the code and added the file I use as input
assign_point_to_cluster has a local variable double **retained_set. This means that you cannot do retained_set = realloc(retained_set, ... or you will just change where that local variable points at, not where the pointer-to-pointer on the caller side points at. And because of that you also create a memory leak. See this FAQ: Dynamic memory access only works inside function
As for how to solve it, it appears that encapsulating all of this data into structs would simplify the program a lot. You could also implement it as an "opaque type" (How to do private encapsulation in C?) and get rid of the caller's responsibility to handle dynamic allocation.
Using 2D arrays instead of pointer-to-pointers might also simplify the program and improve performance. For example if you could use a "pointer to array pointer" parameter double (**retained_set)[x][y]) then you could do double (*tmp)[x][y] = realloc(*retained_set,...) and then *retained_set = tmp;, which would affect the caller. But structs would be easier to read so that should be the first option.
Also note that malloc.h has been obsolete since forever. difference between <stdlib.h> and <malloc.h>
I'm not C expert and I've read through the forum, but I still need some advice regarding a sorting problem on C.
I have 4 dynamic arrays of doubles in C. All of them are the same size, and lets say n. What I want to do is to sort all of them using one of the arrays as first order and a second array as my second order. So if the arrays are *x, *y, *w and *z. I want to sort them according to the values of *x, then *y.
I must do this efficiently because the arrays are quite large.
Any help will be much appreciated.
The easy way to do this would be to map your four separate arrays onto a single array of a struct type like
struct rec {
double x;
double y;
double w;
double z;
};
struct rec *arr = malloc( sizeof *arr * N ); // where N is the number of
// elements in each array
if ( !arr )
// malloc failed, handle error somehow
for ( size_t i = 0; i < N; i++ )
{
arr[i].x = x[i];
arr[i].y = y[i];
arr[i].w = w[i];
arr[i].z = z[i];
}
and then create a comparison function to pass to qsort:
int cmpRec( const void *lhs, const void *rhs )
{
struct rec *l = lhs;
struct rec *r = rhs;
if ( l->x < r->x )
return -1;
else if ( l->x > r->x )
return 1;
else
{
if ( l->y < r->y )
return -1;
else if ( l->y > r->y )
return 1;
else
return 0;
}
return 0;
}
Now you can use the qsort library function to sort that array of struct:
qsort( arr, N, sizeof *arr, cmpRec );
Once that array is sorted, you can map the results back onto your four original arrays.
Clearly, sorting this using standard qsort() is not going to work; there isn't a mechanism for passing four arrays.
Equally clearly, if the data were structured as an array of structures, then using qsort() would be feasible.
Question 1: Is it feasible to create an array of structures, load it, sort it, and then unload back into the original arrays?
Question 2: Another option is to sort an array of integers:
int indexes[n];
for (int i = 0; i < n; i++)
indexes[i] = i;
qsort(indexes, n, sizeof(indexes[0]), comparator);
The comparator function would have to be able to access the x and y arrays as file scope variables:
int comparator(void const *v1, void const *v2)
{
int i1 = *(int *)v1;
int i2 = *(int *)v2;
extern double *x, *y;
if (x[i1] > x[i2])
return +1;
else if (x[i1] < x[i2])
return -1;
else if (y[i1] > y[i2])
return +1;
else if (y[i1] < y[i2])
return -1;
else
return 0;
}
You'd then be able to access the arrays using x[indexes[i]] etc to access the ith element in sorted order.
Is that acceptable?
If that is not convenient either, then you will end up writing your own sort; it isn't horribly painful, but will require some care.
I spent some time adapting an existing sort test framework to this scenario. The full code is quite large because it includes a lot of testing support code. The core function (compare, swap, partition and quicksort) are here (122 lines, including comment and blank lines):
/* SO 20271977 - sort arrays x, y, z, w (type double, size n) in parallel based on values in x and y */
/*
** To apply this to the real code, where there are 4 arrays to be sorted
** in parallel, you might write:
**
** Array4 a;
** a.x = x;
** a.y = y;
** a.z = z;
** a.w = w;
** a.n = n;
** quicksort_random(&a);
**
** Or even:
**
** quicksort_random((Array4){ .n = n, .x = x, .y = y, .z = z, .w = w });
**
** combining designated initializers and compound literals. Or you could write a
** trivial wrapper so that you can call:
**
** quicksort_random_wrapper(n, x, y, z, w);
*/
/* SOF so-20271977.h */
#include <stddef.h>
typedef struct Array4
{
size_t n;
double *x;
double *y;
double *z;
double *w;
} Array4;
extern void quicksort_random(Array4 *A);
/* EOF so-20271977.h */
#include <assert.h>
#include <stdlib.h> /* lrand48() */
/*
** Note that a more careful implementation would use nrand48() instead
** of lrand48() to prevent its random number generation from interfering
** with other uses of the x-rand48() functions.
*/
typedef size_t (*Part)(Array4 *A, size_t p, size_t r);
static void quicksort_partition(Array4 *A, size_t p, size_t r, Part partition);
static size_t partition_random(Array4 *A, size_t p, size_t r);
/* Quick Sort Wrapper function - specifying random partitioning */
void quicksort_random(Array4 *A)
{
quicksort_partition(A, 0, A->n - 1, partition_random);
}
/* Main Quick Sort function */
static void quicksort_partition(Array4 *A, size_t p, size_t r, Part partition)
{
if (p < r)
{
size_t q = (*partition)(A, p, r);
assert(p <= q && q <= r);
if (q > 0)
quicksort_partition(A, p, q-1, partition);
quicksort_partition(A, q+1, r, partition);
}
}
static inline int compare(Array4 const *A, size_t p, size_t r)
{
if (A->x[p] < A->x[r])
return -1;
else if (A->x[p] > A->x[r])
return +1;
if (A->y[p] < A->y[r])
return -1;
else if (A->y[p] > A->y[r])
return +1;
else
return 0;
}
static inline size_t random_int(size_t p, size_t r)
{
return(lrand48() % (r - p + 1) + p);
}
static inline void swap(Array4 *A, size_t i, size_t j)
{
double d;
d = A->x[i];
A->x[i] = A->x[j];
A->x[j] = d;
d = A->y[i];
A->y[i] = A->y[j];
A->y[j] = d;
d = A->z[i];
A->z[i] = A->z[j];
A->z[j] = d;
d = A->w[i];
A->w[i] = A->w[j];
A->w[j] = d;
}
static size_t partition_random(Array4 *A, size_t p, size_t r)
{
size_t pivot = random_int(p, r);
swap(A, pivot, r);
size_t i = p-1;
size_t j = p;
while (j <= r)
{
if (compare(A, j, r) <= 0)
swap(A, j, ++i);
j++;
}
return i;
}
The test framework (quite ridiculously elaborate if it weren't that I already had a variant of it on hand) is 369 lines including blank lines and comment lines — and all the code above:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define FLTFMT "%13.6f"
typedef struct Array4
{
size_t n;
double *x;
double *y;
double *z;
double *w;
} Array4;
static int trace = 0;
static void *xmalloc(size_t size)
{
void *space = malloc(size);
if (space == 0)
{
fprintf(stderr, "Out of memory (%zu)\n", size);
exit(1);
}
return space;
}
void quicksort_last(Array4 *A);
void quicksort_random(Array4 *A);
void selectionsort(Array4 *A);
static inline int compare(Array4 const *A, size_t p, size_t r)
{
if (A->x[p] < A->x[r])
return -1;
else if (A->x[p] > A->x[r])
return +1;
if (A->y[p] < A->y[r])
return -1;
else if (A->y[p] > A->y[r])
return +1;
else
return 0;
}
static void dump_array(char const *tag, Array4 const *A)
{
printf("%s [%zu..%zu]:\n", tag, (size_t)0, A->n-1);
for (size_t i = 0; i < A->n; i++)
printf("(" FLTFMT ", " FLTFMT ", " FLTFMT ", " FLTFMT ")\n",
A->x[i], A->y[i], A->z[i], A->w[i]);
}
static void chk_sort(Array4 const *A)
{
for (size_t i = 0; i < A->n - 1; i++)
{
//if (compare(A, i, i+1) > 0)
{
if (A->x[i] > A->x[i+1])
{
printf("Out of order: A.x[%zu] = " FLTFMT ", A.x[%zu] = " FLTFMT "\n",
i, A->x[i], i+1, A->x[i+1]);
}
else if ((A->x[i] == A->x[i+1] && A->y[i] > A->y[i+1]))
{
printf("Out of order: A.x[%zu] = " FLTFMT ", A.x[%zu] = " FLTFMT ", "
"A.y[%zu] = " FLTFMT ", A.y[%zu] = " FLTFMT "\n",
i, A->x[i], i+1, A->x[i+1], i, A->y[i], i+1, A->y[i+1]);
}
}
}
}
static inline void set(Array4 *A, size_t p, double d)
{
A->x[p] = d;
A->y[p] = d + drand48() - 0.5;
A->z[p] = d / 2.0;
A->w[p] = d * 2.0;
}
static void load_random(Array4 *A)
{
size_t size = A->n;
for (size_t i = 0; i < size; i++)
{
A->x[i] = drand48() * size;
A->y[i] = drand48() * size + drand48() - 0.5;
A->z[i] = drand48() * size / 2.0;
A->w[i] = drand48() * size * 2.0;
}
}
static void load_ascending(Array4 *A)
{
for (size_t i = 0; i < A->n; i++)
set(A, i, i);
}
static void load_descending(Array4 *A)
{
for (size_t i = 0; i < A->n; i++)
set(A, i, A->n - i);
}
static void load_uniform(Array4 *A)
{
for (size_t i = 0; i < A->n; i++)
set(A, i, A->n);
}
static void load_organpipe(Array4 *A)
{
for (size_t i = 0; i <= A->n / 2; i++)
set(A, i, i);
for (size_t i = A->n / 2 + 1; i < A->n; i++)
set(A, i, A->n - i);
}
static void load_invorganpipe(Array4 *A)
{
size_t range = A->n / 2;
for (size_t i = 0; i < A->n / 2; i++)
set(A, i, range - i);
for (size_t i = A->n / 2 + 1; i < A->n; i++)
set(A, i, i - range);
}
typedef void (*Load)(Array4 *A);
typedef void (*Sort)(Array4 *A);
typedef size_t (*Part)(Array4 *A, size_t p, size_t r);
static void test_one_sort(Array4 *A, Sort sort, char const *s_tag,
char const *l_tag, char const *z_tag)
{
if (trace)
{
printf("%s-%s-%s:", z_tag, l_tag, s_tag);
dump_array("Before", A);
}
clock_t start = clock();
(*sort)(A);
clock_t finish = clock();
double sec = (finish - start) / (double)CLOCKS_PER_SEC;
printf("%s-%s-%s: %13.6f\n", z_tag, l_tag, s_tag, sec);
chk_sort(A);
if (trace)
{
printf("%s-%s-%s:", z_tag, l_tag, s_tag);
dump_array("After", A);
}
fflush(stdout);
}
static Array4 *alloc_array(size_t size)
{
Array4 *A = xmalloc(sizeof(*A));
A->n = size;
A->x = xmalloc(size * sizeof(A->x[0]));
A->y = xmalloc(size * sizeof(A->y[0]));
A->z = xmalloc(size * sizeof(A->z[0]));
A->w = xmalloc(size * sizeof(A->w[0]));
return A;
}
static Array4 *dup_array(Array4 *A)
{
size_t size = A->n;
Array4 *B = alloc_array(size);
if (B != 0)
{
B->n = size;
memmove(B->x, A->x, size * sizeof(A->x[0]));
memmove(B->y, A->y, size * sizeof(A->y[0]));
memmove(B->z, A->z, size * sizeof(A->z[0]));
memmove(B->w, A->w, size * sizeof(A->w[0]));
}
return B;
}
static void free_array(Array4 *A)
{
free(A->x);
free(A->y);
free(A->z);
free(A->w);
free(A);
}
static void test_set_sorts(Array4 *A, char const *l_tag, char const *z_tag)
{
struct sorter
{
Sort function;
char const *tag;
} sort[] =
{
{ quicksort_last, "QS.L" },
{ quicksort_random, "QS.R" },
{ selectionsort, "SS.N" },
};
enum { NUM_SORTS = sizeof(sort) / sizeof(sort[0]) };
for (int i = 0; i < NUM_SORTS; i++)
{
Array4 *B = dup_array(A);
test_one_sort(B, sort[i].function, sort[i].tag, l_tag, z_tag);
free(B);
}
}
static void test_set_loads(size_t size, char const *z_tag)
{
struct loader
{
Load function;
char const *tag;
} load[] =
{
{ load_random, "R" },
{ load_ascending, "A" },
{ load_descending, "D" },
{ load_organpipe, "O" },
{ load_invorganpipe, "I" },
{ load_uniform, "U" },
};
enum { NUM_LOADS = sizeof(load) / sizeof(load[0]) };
Array4 *A = alloc_array(size);
for (int i = 0; i < NUM_LOADS; i++)
{
load[i].function(A);
test_set_sorts(A, load[i].tag, z_tag);
}
free_array(A);
}
/* Main Quick Sort function */
static void quicksort_partition(Array4 *A, size_t p, size_t r, Part partition)
{
if (p < r)
{
size_t q = (*partition)(A, p, r);
assert(p <= q && q <= r);
if (q > 0)
quicksort_partition(A, p, q-1, partition);
quicksort_partition(A, q+1, r, partition);
}
}
static size_t partition_random(Array4 *A, size_t p, size_t r);
static size_t partition_last(Array4 *A, size_t p, size_t r);
/* Quick Sort Wrapper function - specifying random partitioning */
void quicksort_random(Array4 *A)
{
quicksort_partition(A, 0, A->n - 1, partition_random);
}
/* Quick Sort Wrapper function - specifying partitioning about last element */
void quicksort_last(Array4 *A)
{
quicksort_partition(A, 0, A->n - 1, partition_last);
}
static inline size_t random_int(size_t p, size_t r)
{
return(lrand48() % (r - p + 1) + p);
}
static inline void swap(Array4 *A, size_t i, size_t j)
{
double d;
d = A->x[i];
A->x[i] = A->x[j];
A->x[j] = d;
d = A->y[i];
A->y[i] = A->y[j];
A->y[j] = d;
d = A->z[i];
A->z[i] = A->z[j];
A->z[j] = d;
d = A->w[i];
A->w[i] = A->w[j];
A->w[j] = d;
}
static size_t partition_random(Array4 *A, size_t p, size_t r)
{
size_t pivot = random_int(p, r);
swap(A, pivot, r);
size_t i = p-1;
size_t j = p;
while (j <= r)
{
if (compare(A, j, r) <= 0)
swap(A, j, ++i);
j++;
}
return i;
}
static size_t partition_last(Array4 *A, size_t p, size_t r)
{
size_t i = p-1;
size_t j = p;
while (j <= r)
{
if (compare(A, j, r) <= 0)
swap(A, j, ++i);
j++;
}
return i;
}
/* Selection Sort algorithm */
void selectionsort(Array4 *A)
{
size_t r = A->n;
for (size_t p = 0; p < r; p++)
{
for (size_t i = p; i < r; i++)
{
if (compare(A, p, i) > 0)
swap(A, p, i);
}
}
}
/*
** To apply this to the real code, where there are 4 arrays to be sorted
** in parallel, you might write:
**
** Array4 a;
** a.x = x;
** a.y = y;
** a.z = z;
** a.w = w;
** a.n = n;
** quicksort_random(&a);
**
** Or even:
**
** quicksort_random((Array4){ .n = n, .x = x, .y = y, .z = z, .w = w });
**
** combining designated initializers and compound literals. Or you could write a
** trivial wrapper so that you can call:
**
** quicksort_random_wrapper(n, x, y, z, w);
*/
int main(void)
{
srand48((long)time(0));
for (size_t i = 10; i <= 40; i += 10)
{
char buffer[10];
snprintf(buffer, sizeof(buffer), "%zuK", i);
test_set_loads(1000*i, buffer);
}
return 0;
}
If you can't use qsort with
typedef struct Point {
double x;
double y;
double w;
double z;
} Point;
Use qsort with
typedef struct UglyThing {
double x;
int i;
} UglyThing;
Create an array of size n, fill x with x values, i with index.
Call qsort. At the end, i will store the permutation order.
Swap the three other arrays according to the permutation order.
Then do the same with little arrays ("with same x") in the y direction.
If this ugly trick is not possible, then I don't see any other solution than reinventing the wheel.
(edit : I have just seen Andrew said something very close to this answer...sorry!)
Bye,
Francis
#define MAXROWS 88
#define MAXSTATES 10
#define MAXPROBS 6
int obs[MAXROWS]= {0,5,2,3,0,5,2,3,2,4,0,3,5,1,4,3,1,5,2,0,4,4,1,5,3,3,1,4,0,5,1,2,3,0,2,0,5,2,0, 4,4,5,3,0,5,2,5,1,5,4,0,3,1,4,5,2,3,5,1,5,2,4,5,1,5,4,2,5,0,3,4,1,5,2,4,1,5,0,4,2,3,0,5,1,5,2,4,1};//{2,1,0} ;
int q[MAXROWS]= {1};
int s=MAXROWS, i=1,j=0;
double **A;
double **B;
double AD[MAXSTATES][MAXSTATES]={{0,1,0,0,0,0,0,0,0,0},{0,0,1,0,0,0,0,0,0,0},{0,0,0,1,0,0,0,0,0,0},{0,0,0,0,1,0,0,0,0,0},{0.8,0,0,0,0,0.2,0,0,0,0},{0,0,0,0,0,0,1,0,0,0},{0,0,0,0,0,0,0,1,0,0},{0,0,0,0,0,0,0,0,1,0},{0.2,0,0,0,0,0.8,0,0,0,0}};//{{.6,.4},{.3,.7}};//{ { .500, .375, .125 }, { .250,.125, .625 }, { .250,.375,.375 } };
double BD[MAXSTATES][MAXPROBS]={{.167,.167,.167,.167,.167,.167},{.167,.167,.167,.167,.167,.167},{.167,.167,.167,.167,.167,.167},{.167,.167,.167,.167,.167,.167},{.167,.167,.167,.167,.167,.167},{.4,.1125,.1125,.1125,.1125,.15},{.4,.1125,.1125,.1125,.1125,.15},{.4,.1125,.1125,.1125,.1125,.15},{.4,.1125,.1125,.1125,.1125,.15},{.4,.1125,.1125,.1125,.1125,.15}};//{{.1,.3,.6},{.5,.4,.1}};//{ { .60, .20 ,.15, .05}, { .25, .25, .25, .25 }, { .05,.10,.35,.50 } };
double *pi;
double pi2[MAXSTATES] = {1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};//{.4,.6};
double *poolA;
double *curPtrA;
double *poolB;
double *curPtrB;
double pproba=0;
double **delta;
double *pool;
double *curPtr;
int **psi;
int *poolpsi;
int *curPtrpsi;
HMM model;
A = (double** )calloc(MAXSTATES, sizeof(double* ));
poolA = (double *)calloc( MAXSTATES * MAXSTATES, sizeof(double));
curPtrA = poolA;
for( i = 0; i < MAXSTATES; i++)
{
*(A + i) = curPtrA;
curPtrA += MAXSTATES;
}
B = (double** )calloc(MAXSTATES, sizeof(double* ));
poolB = (double *)calloc( MAXSTATES * MAXPROBS, sizeof(double));
curPtrB = poolB;
for( i = 0; i < MAXSTATES; i++)
{
*(B + i) = curPtrB;
curPtrB += MAXPROBS;
}
for(i = 0; i <MAXSTATES; i++)
for(j=0; j< MAXPROBS; j++)
B[i][j] = BD[i][j];
for(i = 0; i < MAXSTATES; i++)
for(j=0; j< MAXSTATES; j++)
A[i][j] = AD[i][j];
pi = (double* )calloc(MAXSTATES, sizeof(double* ));
for(i = 0; i <MAXSTATES; i++)
pi[i] = pi2[i];
model.M=MAXPROBS;
model.N=MAXSTATES;
model.A= A;
model.B = B;
model.pi = pi;
//double delta[6][4];
psi = (int** )calloc(MAXROWS, sizeof(int* ));
poolpsi = (int *)calloc( MAXROWS*MAXSTATES, sizeof(int));
curPtrpsi = poolpsi;
for( i = 0; i < MAXROWS; i++)
{
*(psi + i) = curPtrpsi;
curPtrpsi += MAXSTATES;
}
I start getting error on psi = (int **) ... line about heaps which is like this:
"Windows has triggered a breakpoint in TestProj.exe.
This may be due to a corruption of the heap, which indicates a bug in TestProj.exe or any of the DLLs it has loaded.
This may also be due to the user pressing F12 while TestProj.exe has focus.
The output window may have more diagnostic information."
pi = (double* )calloc(MAXSTATES, sizeof(double* ));
i think you should have sizeof(double) not pointer since its an array of double values.
Allocating memory as follows avoids the need to repeat the name of the type all over the place, and thus you won't mismatch it.
double* pi = calloc(MAXSTATES, sizeof(*pi));
int **psi , i;
psi = malloc( MAXROWS * sizeof(int*));
for( i = 0 ; i < MAXROWS ; i++ )
{
psi[i] = calloc (MAXSTATES, sizeof(int));
}
Try to do that and a little check if psi is not null could be great ^^.