So I was trying to make a GPGPU emulator with C and pthreads, but ran into a rather strange problem, and I have no idea why it's occurring. The code is below:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <assert.h>
// simplifies malloc
#define MALLOC(a) (a *)malloc(sizeof(a))
// Index of x/y coordinate
#define x (0)
#define y (1)
// Defines size of a block
#define BLOCK_DIM_X (3)
#define BLOCK_DIM_Y (2)
// Defines size of the grid, i.e., how many blocks
#define GRID_DIM_X (5)
#define GRID_DIM_Y (7)
// Defines the number of threads in the grid
// (threads per block times blocks in the grid); sizes A, B, C and threads[]
#define GRID_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y * GRID_DIM_X * GRID_DIM_Y)
// Execution environment handed to one kernel thread.
typedef struct exec_env {
    // where this thread sits inside its block
    int threadIdx[2];
    // where the block sits inside the grid
    int blockIdx[2];
    // block extent (threads per block in x / y)
    int blockDim[2];
    // grid extent (blocks per grid in x / y)
    int gridDim[2];
    // input operands of the thread
    float *A;
    float *B;
    // destination of the result
    float *C;
} exec_env;
// kernel
void *kernel(void *arg)
exec_env *env = (exec_env *) arg;
// compute number of threads in a block
int sz = env->blockDim[x] * env->blockDim[y];
// compute the index of the first thread in the block
int k = sz * (env->blockIdx[y]*env->gridDim[x] + env->blockIdx[x]);
// compute the index of a thread inside a block
k = k + env->threadIdx[y]*env->blockDim[x] + env->threadIdx[x];
// check whether it is in range
assert(k >= 0 && k < GRID_SIZE && "Wrong index computation");
// print coordinates in block and grid and computed index
/*printf("tx:%d ty:%d bx:%d by:%d idx:%d\n",env->threadIdx[x],
env->blockIdx[y], k);
// retrieve two operands
float *A = &env->A[k];
float *B = &env->B[k];
printf("%f %f \n",*A, *B);
// retrieve pointer to result
float *C = &env->C[k];
// do actual computation here !!!
// For assignment replace the following line with
// the code to do matrix addition and multiplication.
*C = *A + *B;
// free execution environment (not needed anymore)
return NULL;
// main function
// Starts one kernel thread per grid element, joins them all and prints C.
// NOTE(review): this listing appears to have lost its braces, the terminator
// of the commented-out loop below, and the statements that fill in
// e->threadIdx, e->blockIdx, e->blockDim and e->gridDim before
// pthread_create (kernel reads all four) -- TODO confirm against the original.
int main(int argc, char **argv)
// only element 0 is set to -1 by these initialisers; the rest become 0
float A[GRID_SIZE] = {-1};
float B[GRID_SIZE] = {-1};
float C[GRID_SIZE] = {-1};
pthread_t threads[GRID_SIZE];
int i=0, bx, by, tx, ty;
// NOTE(review): i is both the counter of the loop below and the running index
// into threads[] in Step 1. Once the loop is enabled, i is GRID_SIZE when
// Step 1 starts, so threads[i++] indexes past the array unless i is reset to 0.
//Error location
/*for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
// Step 1: create execution environment for threads and create thread
for (bx=0;bx<GRID_DIM_X;bx++) {
for (by=0;by<GRID_DIM_Y;by++) {
for (tx=0;tx<BLOCK_DIM_X;tx++) {
for (ty=0;ty<BLOCK_DIM_Y;ty++) {
exec_env *e = MALLOC(exec_env);
assert(e != NULL && "memory exhausted");
// set parameters
e->A = A;
e->B = B;
e->C = C;
// create thread
pthread_create(&threads[i++],NULL,kernel,(void *)e);
// Step 2: wait for completion of all threads
for (i=0;i<GRID_SIZE;i++) {
pthread_join(threads[i], NULL);
// Step 3: print result
for (i=0;i<GRID_SIZE;i++) {
printf("%f ",C[i]);
return 0;
OK, this code runs fine, but as soon as I uncomment the "Error location" block (the for loop which assigns A[i] = i and B[i] = i + 1), I get a segmentation fault on Unix, and random 0s within C on Cygwin. I must admit my C fundamentals are pretty poor, so it is highly likely that I missed something. If someone can give me an idea of what's going wrong, it'd be greatly appreciated. Thanks.
It works when you comment that because i is still 0 when the 4 nested loops start.
You have this:
for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
/* What value is `i` now ? */
And then
pthread_create(&threads[i++],NULL,kernel,(void *)e);
So pthread_create will try to access some interesting indexes indeed.
I am trying to calculate the sum of two vectors a and b using pthreads in C. I am given a function that computes the sum in sequential form and another which does so in parallel form. My program is working properly but computing different sums when there are multiple threads. I have used proper thread synchronization on the critical area, but still cannot see where I am going wrong. I get the correct answer on the first thread since there is only one thread doing the job and then I get wrong answers on multiple threads. Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
// element type of the vectors
typedef short int value_t;
// type used for vector lengths and element indices
typedef long int index_t;
// binary operation that combines one element of each vector
typedef value_t (*function_t)(value_t x, value_t y);
// struct to store the respective values of the vectors a,b and c
// One instance is passed to each vector_sum thread.
typedef struct{
    index_t start;            // first index of the thread's range
    index_t end;              // end of the range (exclusive loop bound in vector_sum)
    value_t *arr;
    value_t *brr;
    value_t *crr;
    value_t *part_sum;        // accumulator the thread adds its result to
    pthread_mutex_t *mutex;
} arg_struct;
// function to combine two values
// Returns ((x+y)*(x-y)) % (x+1) + 27, converted to value_t.
// Precondition: x != -1, otherwise the modulus divisor (x + 1) is zero.
value_t add(const value_t x, const value_t y) {
    return ((x+y)*(x-y)) % ((int)x+1) +27;
}
// function to initialize the vectors a,b and c
// For every i in [0, n): a[i] = 2*i, b[i] = n-i (both converted to value_t)
// and c[i] = 0.
void vectorInit(index_t n, value_t a[n], value_t b[n], value_t c[n]) {
    for(index_t i=0; i<n; i++) {
        a[i] = (value_t)(2*i);
        b[i] = (value_t)(n-i);
        c[i] = 0;
    }
}
// function to count the sum of two variables sequentially
// Stores f(a[i], b[i]) in c[i] for every i in [0, n) and returns the sum of
// those results. The running sum is a value_t, so it is converted back to
// that width after every addition.
value_t vectorOperation(index_t n, value_t a[n], value_t b[n], value_t c[n], function_t f) {
    value_t sum = 0;
    for(index_t i=0; i<n; i++) {
        sum += (c[i] = f(a[i], b[i]));
    }
    return sum;
}
/* Thread function */
// Adds this thread's contribution to the shared total *param->part_sum.
// NOTE(review): as listed, both the per-index loop and the single call further
// down update *param->part_sum, no lock/unlock of param->mutex is visible and
// there is no return statement; the listing looks incomplete (braces and
// possibly other lines are missing) -- TODO confirm against the original.
// NOTE(review): vectorOperation always walks its arrays from element 0, and
// param->start is not applied as an offset to arr/brr/crr here.
void* vector_sum(void* arg)
arg_struct *param = (arg_struct*)arg;
for(index_t i= param->start; i<param->end; i++)
*param->part_sum += vectorOperation(i,param->arr,param->brr,param->crr,add);
// number of elements in this thread's range
index_t n = param->end - param->start;
// Each thread uses the vectorOperation function to calculate the sum sequentially(Also the critical area)
*param->part_sum = *param->part_sum + vectorOperation(n,param->arr,param->brr,param->crr,add);
//*param->part_sum += vectorOperation(param->end-param->start,param->arr,param->brr,param->crr,add);
// Sum of two vectors in parallel.
// Splits the n elements into p ranges of div = ceil(n/p) elements, starts one
// vector_sum thread per range and returns the shared total once all threads
// have been joined. f is accepted for symmetry with vectorOperation but is not
// forwarded (vector_sum uses add directly).
value_t vectorOperationParallel(index_t n, value_t a[n], value_t b[n], value_t c[n], function_t f, int p) {
    value_t sum = 0;
    pthread_t threads[p];
    arg_struct thread_args[p];
    pthread_mutex_t mutex;
    // one mutex shared by all workers, initialised once
    pthread_mutex_init(&mutex, NULL);
    // range length, rounded up
    index_t div = (n+p-1)/p;
    for(int i=0; i<p; i++)
    {
        thread_args[i].start = i*div;
        // NOTE(review): (i+1)*div exceeds n for the last range(s) whenever p
        // does not divide n
        thread_args[i].end = (i+1)*div;
        thread_args[i].arr = a;
        thread_args[i].brr = b;
        thread_args[i].crr = c;
        // NOTE(review): arr/brr/crr alias a/b/c, so this loop moves range i to
        // the front of the shared arrays in place rather than into a private copy
        for(int j =0; j<div; j++)
        {
            thread_args[i].arr[j] = a[thread_args[i].start+j];
            thread_args[i].brr[j] = b[thread_args[i].start+j];
            thread_args[i].crr[j] = c[thread_args[i].start+j];
        }
        thread_args[i].part_sum = &sum;
        thread_args[i].mutex = &mutex;
        pthread_create(&threads[i],NULL,vector_sum, (void*)&thread_args[i]);
    }
    // thread_args and sum live in this stack frame, so every worker must have
    // finished before the function returns
    for(int i=0; i<p; i++)
    {
        pthread_join(threads[i], NULL);
    }
    pthread_mutex_destroy(&mutex);
    return sum;
}
// Driver: computes the reference sum sequentially, then repeats the operation
// with 1..p threads and reports whether each parallel sum matches.
// Expects two arguments: the vector size and the maximum number of threads.
// NOTE(review): the braces and, presumably, the early exits after the three
// error messages below are missing from this listing -- TODO confirm.
int main(int argc, char **argv)
// check for correct argument count
if (argc != 3)
printf ("usage: %s vector_size n_threads\n", argv[0]);
// get arguments
// vector size
index_t n = (index_t)atol (argv[1]);
// number of threads
int p = atoi (argv[2]);
// check for plausible values
if((p < 1) || (p > 1000)) {
printf("illegal number of threads\n");
// allocate memory
value_t *a = malloc(n * sizeof(*a));
value_t *b = malloc(n * sizeof(*b));
value_t *c = malloc(n * sizeof(*c));
if((a == NULL) || (b == NULL) || (c == NULL)) {
printf("no more memory\n");
// initialize vectors a,b,c
vectorInit(n, a, b, c);
// work on vectors sequentially
value_t c1sum = vectorOperation(n, a, b, c, add);
// work on vectors parallel for all thread counts from 1 to p
for(int thr=1; thr<= p; thr++) {
// do operation
value_t c2sum = vectorOperationParallel(n, a, b, c, add, thr);
// check result
if(c1sum != c2sum) {
printf("!!! error: vector results are not identical !!!\nsum1=%ld, sum2=%ld\n", (long)c1sum, (long)c2sum);
printf("The results are equal: sum1=%ld, sum2=%ld\n",(long)c1sum, (long)c2sum);
Okay I am not sure but this seems to be what is wrong.
At first the names for the variables are horrible.
then n.m. commented:
pthread_mutex_init in a loop is probably a bad idea
you calculate index_t div = (elements_in_vector + num_of_threads - 1) / num_of_threads;
And later you use div * num_of_threads to distribute the elements. This way you may try to access more elements than are available.
index_t div = (elements_in_vector + num_of_threads - 1) / num_of_threads;
// e.g. 13 elements and 5 threads: (13 + 5 - 1) / 5 = 3
thread_args[i].end = (i + 1) * div; // for the last i ( = 4)
// (4 + 1) * 3 = 15, which is past the 13 elements that exist
As soon as you access i >= 13 you get garbage values (undefined behaviour)
Then you make a copy of parts of your original array (I would assume this is slower than just passing a reference to the original).
You don't seem to use the result array *thread_args[i].crr at all.
You only need the mutex for the sum of all values as you have dedicated memory for every array you pass in the thread. You could even pass pointers of the original arrays to the threads without a mutex if you would not use the sum variable in all of them. Because as every addition is self contained and does not access memory of another addition, no mutex is needed.
To calculate the sum of all value you could just use the return value of the thread instead of a reference to a value you pass to every one. This way it would be much faster.
I am not sure if I found everything, but this may help you improve this a good bit.
I am using GeForce GT 520 (compute capablility v2.1) to run a program that performs the scan operation on an array of int elements. Here's the code:
This is an implementation of the parallel scan algorithm.
Only a single block of threads is used. Maximum array size = 2048
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Wraps a CUDA runtime call and reports the call site if it did not succeed.
#define errorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string together with file/line to stderr when code is
// not cudaSuccess and, if abort is true (the default), exits with that code.
// file is const because it receives the __FILE__ string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s, file: %s line: %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Exclusive prefix sum of d_in[0..n) into d_out[0..n) using an up-sweep /
// down-sweep over dynamically sized shared memory.
// Each thread loads and stores two elements, so the launch in main uses a
// single block of n/2 threads and n * sizeof(int) bytes of shared memory.
// NOTE(review): the tree indexing (d = n>>1 halving, offset doubling) assumes
// n is a power of two -- confirm callers guarantee this.
__global__ void blelloch_scan(int* d_in, int* d_out, int n)
{
    extern __shared__ int temp[];// allocated on invocation
    int thid = threadIdx.x;
    int offset = 1;
    temp[2*thid] = d_in[2*thid]; // load input into shared memory
    temp[2*thid+1] = d_in[2*thid+1];
    // build sum in place up the tree
    for (int d = n>>1; d > 0; d >>= 1)
    {
        // every level reads sums written by other threads on the previous level
        __syncthreads();
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    // clear the last element
    if (thid == 0)
    {
        temp[n - 1] = 0;
    }
    // traverse down tree & build scan
    for (int d = 1; d < n; d *= 2)
    {
        offset >>= 1;
        __syncthreads();
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    // make the final level visible before each thread copies its two results
    __syncthreads();
    d_out[2*thid] = temp[2*thid]; // write results to device memory
    d_out[2*thid+1] = temp[2*thid+1];
}
// Host driver: reads the element count from argv[1], fills h_in with 0..n-1,
// runs blelloch_scan in one block of n/2 threads and prints input and output
// side by side.
int main(int argc, char **argv)
{
    if(argc != 2)
    {
        printf("Input Syntax: ./a.out <number-of-elements>\nProgram terminated.\n");
        exit (1);
    }
    int ARRAY_SIZE = (int) atoi(*(argv+1));
    if (ARRAY_SIZE <= 0)
    {
        printf("Number of elements must be positive.\nProgram terminated.\n");
        exit (1);
    }
    int *h_in, *h_out, *d_in, *d_out, i;
    h_in = (int *) malloc(sizeof(int) * ARRAY_SIZE);
    h_out = (int *) malloc(sizeof(int) * ARRAY_SIZE);
    if (h_in == NULL || h_out == NULL)
    {
        printf("Host allocation failed.\nProgram terminated.\n");
        free(h_in);
        free(h_out);
        exit (1);
    }
    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, 0) == 0)
    {
        printf("Using device %d:\n", 0);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (int)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               (int)devProps.clockRate);
    }
    for(i = 0; i < ARRAY_SIZE; i++)
    {
        h_in[i] = i;
    }
    errorCheck(cudaMalloc((void **) &d_in, sizeof(int) * ARRAY_SIZE));
    errorCheck(cudaMalloc((void **) &d_out, sizeof(int) * ARRAY_SIZE));
    errorCheck(cudaMemcpy(d_in, h_in, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice));
    blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>> (d_in, d_out, ARRAY_SIZE);
    // reports an invalid launch configuration right away; faults raised while
    // the kernel runs surface at the synchronising copy below
    errorCheck(cudaGetLastError());
    errorCheck(cudaMemcpy(h_out, d_out, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
    for(i = 0; i < ARRAY_SIZE; i++)
    {
        printf("h_in[%d] = %d, h_out[%d] = %d\n", i, h_in[i], i, h_out[i]);
    }
    errorCheck(cudaFree(d_in));
    errorCheck(cudaFree(d_out));
    free(h_in);
    free(h_out);
    return 0;
}
On compiling using nvcc -arch=sm_21 -o parallel-scan, I get an error:
GPUassert: unspecified launch failure, file: line: 106
Line 106 is the line after kernel launch when we check for errors using errorCheck.
This is what I am planning to implement:
From the kernel, it can be seen that if a block has 1000 threads, it can operate on 2000 elements. Therefore, blockSize = ARRAY_SIZE / 2.
And, shared memory = sizeof(int) * ARRAY_SIZE
Everything is loaded into shared mem. Then, up sweep is done, with last element being set to 0. Finally, down sweep is done to give an exclusive scan of the elements.
I have used this file as the reference to write this code. I do not understand what's the mistake in my code. Any help would be greatly appreciated.
You are launching the kernel like so
blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>>
meaning that within the kernel 0 <= thid < int(ARRAY_SIZE/2).
However, your kernel requires a minimum of (2 * int(ARRAY_SIZE/2)) + 1 words of available shared memory to work correctly, otherwise this:
temp[2*thid+1] = d_in[2*thid+1];
will produce an out-of-bounds shared memory access.
If my integer mathematical skillz are not too rusty, this should mean that the code will be safe if ARRAY_SIZE is odd, because ARRAY_SIZE == (2 * int(ARRAY_SIZE/2)) + 1 for any odd integer. However, if ARRAY_SIZE is even, then ARRAY_SIZE < (2 * int(ARRAY_SIZE/2)) + 1 and you have a problem.
It might be that shared memory page size granularity saves you for some even values of ARRAY_SIZE which should theoretically fail, because the hardware will always round up the dynamic shared memory allocation to the next page size larger than the request size. But there should be a number of even values of ARRAY_SIZE for which this fails.
I can't comment on whether the rest of the kernel is correct or not, but using a shared memory size of sizeof(int) * size_t(1 + ARRAY_SIZE) should make this particular problem go away.
The code line: gsl_blas_daxpy(-a,&gsl_matrix_column(D, q).vector,y);
cause the error
error C2102: '&' requires l-value
, now the problem is that I have no control of the GSL functions so I don't know how to figure this out (removing the "&" didn't work)
afterwards i get
error C2198: 'gsl_blas_daxpy' : too few arguments for call
I'm using Visual studio 2010.
GSL_EXPORT int gsl_blas_daxpy (double alpha,
const gsl_vector * X,
gsl_vector * Y);
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_blas.h>
// Problem dimensions and solver settings shared by the whole program.
#define M (10) // Number of columns in dictionary */
#define N ((int)(M/2)) // Number of rows in dictionary */
// NOTE(review): with M = 10 the product 0.07*M is 0.7, which becomes 0 when
// converted to int, so K is 0 here.
int K = 0.07*M; //Number of non-zero elements in signal - the sparsity
int P=1; //number of signals
double epsilon = 1.0e-7; // Residual error
int numOfIterations = N; /* Max num of iterations - same as num of elements in signal */
// Sign function: -1 for negative x, +1 for x >= 0, and 0 when neither
// comparison holds.
double sign(double x)
{
    if (x < 0)
        return -1;
    if (x >= 0)
        return 1;
    return 0;
}
// Builds a random N x M dictionary D and a sparse signal x, encodes y = D*x,
// then iteratively recovers an estimate z of x from y and prints timings and
// residual errors.
// NOTE(review): braces and several statements appear to be missing from this
// listing (see the notes below) -- TODO confirm against the original program.
int main(int argc, char** argv)
int n, m, k, iter, q;
double normi, normf, tmp , norm=sqrt(N), htime;
gsl_matrix *D; // A random dictionary used for encoding the sparse signal NxM
gsl_vector *x; // Sparse info signal (encoder input) MxP
gsl_vector *z; // Evaluated Sparse info signal (decoder output) MxP
gsl_vector *r; // Residual error vector MxP
gsl_vector *y; // Sparse representation of signal (encoder output) NxP
gsl_vector_view v;
clock_t start; //for measuring performance
printf("\nDictionary is:NxM=%dx%d,and the signal sparsity is K=%d", N, M, K);
srand(time(NULL)); //Initialize srand
start =clock(); //Initialize clock
/* Initiallize D as a Bernoulli random dictionary */
D = gsl_matrix_alloc (N, M);
for(m=0; m<M; m++)
for(n=0; n<N; n++)
// NOTE(review): tmp has not been assigned at this point in the listing; the
// statement that draws the random entry (presumably using sign() and norm) seems
// to be missing.
gsl_matrix_set (D, n, m, tmp); //D[n,m]=tmp
/* Create a random K-sparse info signal */
// gsl_vector_alloc leaves the elements unset; only the positions written in
// the loop below receive a value
x = gsl_vector_alloc(M);
for(k=0; k<K; k++)
gsl_vector_set(x, rand()%M, 2.0*rand()/(float)RAND_MAX - 1.0); //put random values at k random positions
/* Allocate memory for solution (evaluated signal) */
z = gsl_vector_calloc(M);
/* Allocate memory for residual vector */
r = gsl_vector_calloc(M);
/* Allocate memory for the encoded signal vector (its representation) */
y = gsl_vector_alloc(N);
// NOTE(review): htime is printed here and after the encoding step without a
// visible assignment; the only visible one is after the decoding loop.
printf("\nTime data allocation: %f", htime);
/* Encoding the signal (x to y) */
start = clock();
gsl_blas_dgemv(CblasNoTrans, 1, D, x, 0, y); // y = Dx
printf("\nTime for encoding: %f", htime);
/* Decoding the signal */
start = clock();
normi = gsl_blas_dnrm2(y); // ||y|| (L2 norm)
epsilon = sqrt(epsilon * normi);
normf = normi;
iter = 0;
/*iterate till the computational error is small enough*/
// NOTE(review): no increment of iter is visible in the loop body below.
while(normf > epsilon && iter < numOfIterations)
gsl_blas_dgemv(CblasTrans, 1, D, y, 0, r); // r=D'*y
q = gsl_blas_idamax(r); //index of max element in residual vector
tmp = gsl_vector_get(r, q); //the max element in r
gsl_vector_set(z, q, gsl_vector_get(z, q)+tmp); // z[q]=z[q]+ tmp
// the view is stored in the named variable v so that &v.vector can be taken
v=gsl_matrix_column(D, q); // choose the dictrionary's atom (coloum) with the index of largest element in r
gsl_blas_daxpy(-tmp,&v.vector,y); // y = y-tmp*v
normf = gsl_blas_dnrm2(y); // ||y|| (L2 norm)
htime = ((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime for decoding: %f", htime);
tmp = 100.0*(normf*normf)/(normi*normi); // the error at end of algorithm
printf("\nComputation residual error: %f",tmp);
/* Check the solution (evaluated signal) against the original signal */
printf("\nSolution (first column),Reference (second column):");
getchar(); // wait for pressing a key
for(m=0; m<M; m++)
printf("\n%.3f\t%.3f", gsl_vector_get(x, m),gsl_vector_get(z, m));
normi = gsl_blas_dnrm2(x);
gsl_blas_daxpy(-1.0, x, z); // z = z-x
normf = gsl_blas_dnrm2(z); // ||z|| (L2 norm)
tmp = 100.0*(normf*normf)/(normi*normi); //final error
printf("\nSolution residual error: %f\n",tmp);
/* Memory clean up and shutdown*/
// NOTE(review): no release of D is visible in this listing.
gsl_vector_free(y); gsl_vector_free(r);
gsl_vector_free(z); gsl_vector_free(x);
gsl_matrix_column(D, q).vector is an R-value. You can't take its address. You need an L-value, so assign it to a named variable first, then pass the address of that variable to the function.
If you make a more permanent home for the return value of gsl_matrix_column, (this particular) problem will go away.
Here is some simplified code that illustrates how one might capture a return value in an addressable slot:
// Illustration: the member of a struct returned by value is not an lvalue, so
// its address cannot be taken until the result is stored in a named variable.
// NOTE(review): the closing braces of these definitions are missing from the
// listing, and qux is deliberately invalid code.
struct _foo {
int i;
// returns a struct by value
struct _foo bar () {
struct _foo result = { 5 };
return result;
/* won't compile; 'lvalue required as unary & operand */
void qux () {
int *j = &bar().i;
/* compiles OK */
void qal () {
// the named copy gives the member an address
struct _foo result = bar();
int* j = &result.i;
gsl_vector_view c=gsl_matrix_column(D, q);
I think introducing a temporary variable lets you pass a pointer to it to the function.
EDIT: Well, trying to understand the problem, I wanted to know what the function expect:
int gsl_blas_daxpy (double alpha, const gsl_vector * x, gsl_vector * y)
gsl_vector_view gsl_matrix_column (gsl_matrix * m, size_t j)
with some explanation:
A vector view can be passed to any subroutine which takes a vector
argument just as a directly allocated vector would be, using
and an example:
for (j = 0; j < 10; j++)
gsl_vector_view column = gsl_matrix_column (m, j);
double d;
d = gsl_blas_dnrm2 (&column.vector);
printf ("matrix column %d, norm = %g\n", j, d);
Now we have another problem:
Here another answer:
Are you aware that int K = 0.07*M gives K = 0 (0.7 truncated to an int)?
#define M (10) // Number of columns in dictionary */
int K = 0.07*M; //Number of non-zero elements in signal - the sparsity
alloc does not initialize the vector x, so x will contain garbage values, not 0. Did you mean x = gsl_vector_calloc(M); with a c? It will set x to 0.
/* Create a random K-sparse info signal */
x = gsl_vector_alloc(M);
for(k=0; k<K; k++) // K=0, for get skiped and x not modified.
gsl_vector_set(x, rand()%M, 2.0*rand()/(float)RAND_MAX - 1.0); //put random values at k random positions
(And here you will have at most K random values, but possibly fewer.)
I'm new to multithreading and had my first lesson yesterday. So I've written a program to get the average of 4 big arrays: each array is handled by its own thread, and main waits for all the threads and gives the average of the 4 arrays. This is possible because each thread gives the average of one array. Each array is just a header file with a float array.
It compiles but gives me a segmentation error and I don't see why.
#include "gemiddelde.h"
#include <stdlib.h>
#include <stdio.h>
// Thread body: sums the 100000 floats that arg points to, divides by 100000
// and returns the pointer the result was written through.
// NOTE(review): som is never pointed at any storage before it is dereferenced,
// and the braces are missing from this listing.
float *gemiddelde(void *arg)
float *a;
int i;
// view the untyped argument as the float array to average
a = (float *)arg;
float * som;
for( i = 0; i < 100000; i++)
*som += a[i];
*som = *som / 100000;
return som;
// Adds up the four per-array results and prints the total.
// NOTE(review): the statements that create and join the four threads (and so
// would set som1..som4) are not present in this listing, and result is
// dereferenced without pointing at any storage.
int main()
pthread_t t1,t2,t3,t4;
float * som1, * som2, * som3, * som4, *result;
*result = *som1 + *som2 + *som3 + *som4;
printf("Gemiddelde is: %f ", *result);
return 0;
Can someone help me?
Kind regards,
*result = *som1 + *som2 + *som3 + *som4;
result is used uninitialized. Make it a plain float instead of a pointer.
From your current code, segfault occurs because som* aren't initialized -- they are dangling pointers.
Your code is very problematic, because the thread code requires memory to store the result, and as it stands your code is plain wrong because it doesn't have any memory and just dereferences a dangling pointer. But even allocating memory inside the thread is not a great idea, because it's not clear who is responsible for it and who will clean it up. So it's much better to allocate all your required memory in the main function. First some boiler plate to set up the thread argument data:
// Argument/result record handed to one worker thread.
typedef struct thread_arg_type_
{
    float * data;   // input values; owned by the record
    size_t len;     // number of elements in data
    float retval;   // result slot written by the thread
} thread_arg_type;

// Allocates a record with room for n floats (zero-filled) and retval = 0.
// Returns NULL if either allocation fails; nothing is leaked in that case.
// The caller releases the record with free_thread_arg.
thread_arg_type * create_thread_arg(size_t n)
{
    thread_arg_type * result = malloc(sizeof *result);
    if (!result) return NULL;
    // calloc checks the n * sizeof multiplication for overflow
    float * const p = calloc(n, sizeof *p);
    if (!p)
    {
        free(result);
        return NULL;
    }
    result->len = n;
    result->data = p;
    result->retval = 0;
    return result;
}

// Releases a record obtained from create_thread_arg, including its data
// buffer. Passing NULL is allowed and does nothing.
void free_thread_arg(thread_arg_type * r)
{
    if (r) free(r->data);
    free(r);
}
Now here's how we use it:
int main()
thread_arg_type * arg;
pthread_t t;
arg = create_thread_arg(array1_size);
pthread_create(&t, NULL, getmiddle, arg);
// ...
pthread_join(t, NULL);
printf("The result is: %f.\n", arg->retval);
And finally we must adapt getmiddle:
void * getmiddle(thread_arg_t * arg)
arg->retval = 0;
for(unsigned int i = 0; i != arg->len; ++i)
arg->retval += arg->data[i];
arg->retval /= arg->len;
return NULL;
I am trying to create an array of size n (where n is user's input) and when the user runs the program, the array elements should be set to 1 (each in a separate thread). Here is what I have done so far:
#include <windows.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <conio.h>
#include <process.h>
// array shared by main and the worker threads; allocated in main
int *x;
// NOTE(review): the header of the thread function (main passes it to
// CreateThread as init_X) and its braces are missing from this listing.
// param points to an int holding the index of the element to set to 1.
int index = *(int *) param;
x[index] = 1;
return 0;
// Allocates x with n elements (n taken from argv[1]), starts one thread per
// element, waits for all of them and prints the array.
// NOTE(review): braces are missing from this listing, and the loop announced
// as closing the thread handles has no visible body.
int main(int argc, char *argv[])
int n = atoi(argv[1]);
int i; // counter.
HANDLE THandles[n];
x = malloc(n * sizeof (int));
for(i = 0; i < n; i++)
// every thread receives the address of the same counter i, which this loop
// keeps changing while the threads start
THandles[i] = CreateThread(NULL, 0, init_X, &i, 0, NULL);
// Now wait for threads to finish
WaitForMultipleObjects(n, THandles, TRUE, INFINITE);
// Close the thread handle
for(i = 0; i < n; i++)
printf("After initialization x = ");
for(i = 0; i < n; i++)
printf("%d ", x[i]);
if(i < n - 1) printf(" ");
// ...
return 0;
I run this program and I got wrong outputs:
> Test.exe 3
After initialization x = 11611536 11600064 50397186
It should be After initialization x = 1 1 1 though. I am not sure how I can I fix this, but I am sure its something related to the pointers.
P.S: I'm Java programmer so I'm not familiar with pointers.
The value you are passing as your array index will more than likely be invalid by the time the thread runs, as there is no guarantee that the thread runs immediately after the call to CreateThread.
You have two solutions, either pass by value (simple & easy, but not always safe) or allocate a temporary buffer for the value that will be freed by the thread when its used.
Minor Update:
In fact, a better way would be to pass &x[i], then you can just do *(int*)param = 1;
You are passing i by pointer to the thread, so the value each thread gets will depend on when int index = *(int *) param; actually executes, and it will be something between 0 and n. You can just pass i by value (cast to a pointer) to avoid this.