CUDA program for batched matrix multiplication - C

I am a novice in CUDA programming and I am trying to replicate the function of cublasSgemmBatched, which means that I want to perform the matrix-matrix multiplication of a batch of matrices. I tried to implement my idea as the following code.
#include <stdio.h>
__global__ void BatchMulCUDA(float* array1, float* array2, int narray1, int dim, float* result)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tx < narray1 * dim)
    {
        float temp = 0;
        int index = tx / dim;
#pragma
        for (int i = 0; i < dim; i++)
        {
            temp += array1[tx * dim + i] * array2[index * dim + i];
        }
        result[tx] = temp;
    }
}
void BatchMulGPU(float* array1, float* array2, int narray1, int dim, float* result)
{
    dim3 threads(1024, 1);
    dim3 grid(narray1 / 1024 + 1, 1);
    int threadsPerBlock = threads.x * threads.y;
    int blocksPerGrid = grid.x * grid.y;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    BatchMulCUDA<<<grid, threads>>>(array1, array2, narray1, dim, result);
}
However, strangely, I found that I get the right output up to index 19730. From element 19730 onward, the output of the GPU is always 0. I do not know what the problem is. The CPU version of my code and the test function are as follows. Is there any hardware limitation that I have not realized?
#include "kernel.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
#include <math.h>
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double) tp.tv_sec + (double)tp.tv_usec*1e-6);
}
void BatchMulCPU(float* array1, float* array2, int narray1, int dim, float* result)
{
for (int i = 0; i < narray1 * dim; i++)
{
float temp = 0;
int index = i / dim;
for (int j = 0; j < dim; j++)
{
temp += array1[i * dim + j] * array2[index * dim + j];
}
result[i] = temp;
}
}
int main(int argc, char** argv)
{
int narray1 = 6980;
int dim = 4;
float* array1 = new float[narray1 * dim * dim];
float* array2 = new float[narray1 * dim];
float* resultGPU = new float[narray1 * dim];
float* resultCPU = new float[narray1 * dim];
float* d_array1;
float* d_array2;
float* d_result;
for (int i = 0; i < narray1 * dim * dim; i++)
{
array1[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
}
for (int i = 0; i < narray1 * dim; i++)
{
array2[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
}
cudaError_t err;
double iStart = cpuSecond();
err = cudaMalloc((void**)&d_array1, narray1 * dim * dim * sizeof(float));
err = cudaMalloc((void**)&d_array2, narray1 * dim * sizeof(float));
err = cudaMalloc((void**)&d_result, narray1 * dim * sizeof(float));
err = cudaMemcpy(d_array1, array1, narray1 * dim * dim * sizeof(float), cudaMemcpyHostToDevice);
err = cudaMemcpy(d_array2, array2, narray1 * dim * sizeof(float), cudaMemcpyHostToDevice);
BatchMulGPU(d_array1, d_array2, narray1, dim, d_result);
err = cudaMemcpy(resultGPU, d_result, narray1 * dim * sizeof(float), cudaMemcpyDeviceToHost);
double iElaps = cpuSecond() - iStart;
printf("Total GPU computation time is %lf \n" , iElaps);
iStart = cpuSecond();
BatchMulCPU(array1, array2, narray1, dim, resultCPU);
iElaps = cpuSecond() - iStart;
printf("Total CPU computation time is %lf \n" , iElaps);
float error = 0;
float temp = 0;
for (long i = 0; i < narray1 * dim; i++)
{
// temp = abs(resultCPU[i] - resultGPU[i]);
// if (temp > 0.5)
// {
// std::cout << i << std::endl;
// }
error += abs(resultCPU[i] - resultGPU[i]);
}
printf("Error is %f \n", error);
// for (int i = 19730; i < 19750; i++)
// {
// std::cout << "GPU " << resultGPU[i] << std::endl;
// std::cout << "CPU " << resultCPU[i] << std::endl;
// }
cudaFree(d_array1);
cudaFree(d_array2);
cudaFree(d_result);
return 0;
}

Apart from the possibility of a WDDM TDR timeout as discussed in the comments, the code has an error.
It's evident that the kernel design expects that a total grid size (total number of threads) will be launched that is equal to or greater than the number of arrays times the side dimension:
int tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx < narray1 * dim)
i.e. narray1 * dim is the needed number of threads.
However, the number being launched is only (approximately) narray1:
dim3 threads(1024, 1);
dim3 grid(narray1 / 1024 + 1, 1);
If we change the last line above to:
dim3 grid((narray1*dim) / 1024 + 1, 1);
this code design error will be addressed.
The reason the code works correctly for a small number of matrices (anything up to 256) is the rounding up in the grid sizing to a minimum of one block of 1024 threads, which covers 256 * 4 (narray1 * dim) elements.
As an aside, this code is not functionally similar to cublasSgemmBatched from what I can see. I don't recognize this code as being any matrix multiplication (matrix dot product) that I am familiar with.
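As a side note (an illustration, not part of the original answer): a grid-stride loop is a common way to make this kind of kernel's correctness independent of the exact grid sizing, so a launch-configuration mismatch like the one above cannot silently skip elements. A minimal sketch of the poster's kernel rewritten that way:

__global__ void BatchMulCUDA(float* array1, float* array2, int narray1, int dim, float* result)
{
    // Each thread processes elements tx, tx + gridSize, tx + 2*gridSize, ...
    for (int tx = blockIdx.x * blockDim.x + threadIdx.x;
         tx < narray1 * dim;
         tx += gridDim.x * blockDim.x)
    {
        float temp = 0;
        int index = tx / dim;
        for (int i = 0; i < dim; i++)
        {
            temp += array1[tx * dim + i] * array2[index * dim + i];
        }
        result[tx] = temp;
    }
}

With this pattern, any launch configuration (including the original narray1 / 1024 + 1 blocks) covers all narray1 * dim outputs, at the cost of some threads handling more than one element.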


double** pointer being realloc'd was not allocated

I have to implement a clustering algorithm. After loading the dataset, I check for each point which cluster it can be inserted into. Points that cannot be inserted into any cluster are moved from the dataset into the retained set. Since I do not know the size of the retained set a priori, I initially allocate a zero-sized area of memory and grow it by the byte size of one point each time I have to insert a point into the retained set.
It works for some iterations (4 to be precise) and then stops.
This is what I try:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <malloc/malloc.h>
#include <float.h>
#include <stdbool.h>
double **load_dataset(char *filename, int d, int chunk_size);
int assign_point_to_cluster(double **clusters, int **set, double **retained_set, double *point, double *standard_deviation,
                            int d, int k, int *chunk_size, int p_in_r);
int find_candidate_cluster(double **clusters, double *point, double *std_deviation, int d, int k);
double mean(const double *vector, int d);
double mahalanobis(const double *cluster, const double *point, const double *standard_deviation, int d);
void compute_std_dev(const double *cluster, double *standard_deviation_vector, int d);
int inizialize_cluster(double **dataset, int d, int k, double **clusters, int **set, int chunk_size, bool retain);
double compute_sum_of_euclidean_distance(double **center_points, double *point, int n, int d);
void feature_scaling(double **dataset, int d, int chunk_size);
int main(int argc, char **argv) {
    if (argc < 6) {
        printf("Error parameters! Usage: ./main <input_file> <total number of point> <chunk_size> <points_dimension> <cluster_number>");
        return 0;
    }
    char *filename = argv[1];
    int d = atoi(argv[4]), k = atoi(argv[5]), chunk_size = atoi(argv[3]), total = atoi(argv[2]);
    int k_compressed = 0;
    printf("Path: %s\n", filename);
    printf("Number of point in set %i\n", total);
    printf("chunk size: %i\n", chunk_size);
    printf("Dimension of points: %i\n", d);
    printf("Number of cluster: %i\n", k);
    printf("----------------\n");
    double **clusters = malloc(k * sizeof(double *));
    double *standard_deviation = malloc(d * sizeof(double));
    int **discard_set = malloc(k * sizeof(int *));
    double **retained_set = malloc(1);
    double *cohesion = malloc(2 * sizeof(double));
    double *radius = NULL;
    double **mini_cluster = NULL;
    double *temp_cluster = NULL;
    int **compressed_set = NULL;
    double **mini_cluster_temp = NULL;
    int p_in_r = 0;
    double **dataset = load_dataset(filename, d, chunk_size);
    /**
     * Rescaling of variables
     */
    //feature_scaling(dataset, d, chunk_size); TODO: Something is wrong
    /**
     * Cluster initialization
     */
    if (!clusters || !discard_set || !standard_deviation || !retained_set || !cohesion) {
        printf("Something went wrong in main(), memory allocation failed!");
        exit(1);
    }
    chunk_size = inizialize_cluster(dataset, d, k, clusters, discard_set, chunk_size, false);
    /**
     * At this point we are only interested in keeping a "summary" of the data that will be placed within a cluster.
     * In dataset we put the id of the points that are added to a cluster, while cluster contains the statistics
     * useful to perform clustering.
     **/
    /**
     * We start processing the points. The (CHUNK - 1)th point in the dataset is assigned to a cluster if the
     * Mahalanobis distance is less than a threshold and if it is the closest.
     * Clustering dataset -> discard_set
     */
    while (chunk_size > 0) {
        p_in_r += assign_point_to_cluster(clusters, discard_set, retained_set, dataset[chunk_size - 1], standard_deviation, d, k, &chunk_size, p_in_r);
        /**
         * Always working on the last element of the dataset, it is not necessary to move the list of points,
         * just delete the last element.
         */
        free(dataset[chunk_size]);
        dataset[chunk_size] = NULL;
        dataset = realloc(dataset, chunk_size * sizeof(double *));
        if (dataset == NULL) {
            printf("Something went wrong in main(), memory allocation failed!");
            exit(1);
        }
    }
    free(dataset);
    dataset = NULL;
    return 0;
}
int inizialize_cluster(double **dataset, int d, int k, double **clusters, int **set, int chunk_size, bool retain) {
    double **center_point = malloc(k * sizeof(double *));
    for (int i = 0; i < k; i++) {
        center_point[i] = malloc((d + 1) * sizeof(double));
        if (center_point[i] == NULL) {
            printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
            exit(1);
        }
    }
    /**
     * The point representing the center of the first cluster is chosen as the first point in the dataset
     **/
    memcpy(*center_point, *dataset, (d + 1) * sizeof(double));
    /**
     * The first point can be removed from the dataset or,
     * in case we are working on the retained set, moved to the end.
     **/
    chunk_size--;
    if (retain) {
        double *temp = malloc(sizeof(double *));
        memcpy(temp, dataset, sizeof(double *));
        memcpy(dataset, dataset + 1, chunk_size * sizeof(double *));
        memcpy(dataset + chunk_size - 1, temp, sizeof(double *));
        /*for (int i = 0; i < CHUNK; ++i) {
            printf("id[%i]: %f", dataset[i][0]);
        }*/
    }
    else {
        free(dataset[0]);
        memcpy(dataset, dataset + 1, chunk_size * sizeof(double *));
        dataset[chunk_size] = NULL;
        dataset = realloc(dataset, chunk_size * sizeof(double *));
        if (dataset == NULL) {
            printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
            exit(1);
        }
    }
    /**
     * The centers of the next clusters are chosen as those that are furthest apart
     **/
    double max;
    int pos;
    double distance;
    for (int i = 1; i < k; i++) {
        /**
         * I choose the point that maximizes the sum of the distances from the centers
         */
        max = -1;
        for (int j = 0; j < chunk_size; j++) {
            distance = compute_sum_of_euclidean_distance(center_point, dataset[j], i, d);
            if (distance > max) {
                pos = j;
                max = distance;
            }
        }
        memcpy(*(center_point + i), *(dataset + pos), (d + 1) * sizeof(double));
        /**
         * When a point is chosen as the center of a cluster, I remove it from the dataset
         **/
        chunk_size--;
        if (retain) {
            double **temp = malloc(sizeof(double *));
            memcpy(temp, dataset + pos, sizeof(double *));
            memcpy(dataset + pos, dataset + pos + 1, (chunk_size - pos) * sizeof(double *));
            memcpy(dataset + chunk_size - 1, temp, sizeof(double *));
        }
        else {
            free(dataset[pos]);
            memcpy(dataset + pos, dataset + pos + 1, (chunk_size - pos) * sizeof(double *));
            dataset = realloc(dataset, chunk_size * sizeof(double *));
            if (dataset == NULL) {
                printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
                exit(1);
            }
        }
    }
    /**
     * When I have found k points that can be used as the initial centers of the k clusters,
     * I summarize them (calculate cluster statistics) and enter them into the discard set.
     */
    for (int i = 0; i < k; i++) {
        /**
         * Cluster and discard set initialization
         */
        clusters[i] = malloc(((2 * d) + 1) * sizeof(double));
        set[i] = malloc(sizeof(int));
        if (clusters[i] == NULL || set[i] == NULL) {
            printf("Something went wrong in inizialize_cluster(), memory allocation failed!");
            exit(1);
        }
        clusters[i][0] = 1;
        set[i][0] = (int) center_point[i][0];
        for (int j = 1; j < d + 1; j++) {
            clusters[i][j] = center_point[i][j];
            clusters[i][j + d] = pow(center_point[i][j], 2);
        }
        free(center_point[i]);
        center_point[i] = NULL;
    }
    free(center_point);
    center_point = NULL;
    return chunk_size;
}
double **load_dataset(char *filename, int d, int chunk_size) {
    double **dataset = malloc(chunk_size * sizeof(double *));
    if (dataset == NULL) {
        printf("Something went wrong in load_dataset(), memory allocation failed!");
        exit(1);
    }
    for (int i = 0; i < chunk_size; i++) {
        dataset[i] = malloc((d + 1) * sizeof(double));
        if (dataset[i] == NULL) {
            printf("Something went wrong in load_dataset(), memory allocation failed!");
            exit(1);
        }
    }
    FILE *file;
    file = fopen(filename, "r");
    if (file == NULL) {
        printf("Something went wrong in load_dataset(), file opening failed! (row 162)");
        exit(1);
    }
    char *line = NULL, *token;
    size_t len = 0;
    int i = 0;
    int j = 0;
    int first_line = 0;
    while ((getline(&line, &len, file)) != -1 && i < chunk_size) {
        if (first_line != 0) {
            while ((token = strsep(&line, ",")) != NULL) {
                dataset[i][j] = atof(token);
                j++;
            }
            j = 0;
            i++;
        } else {
            first_line = 1;
        }
    }
    fclose(file);
    return dataset;
}
int assign_point_to_cluster(double **clusters, int **set, double **retained_set, double *point, double *standard_deviation,
                            int d, int k, int *chunk_size, int p_in_r) {
    /**
     * For each point I assess which cluster it can go into
     */
    int candidate;
    candidate = find_candidate_cluster(clusters, point, standard_deviation, d, k);
    /**
     * After identifying the candidate cluster (if there is one), I add the point to the discard set and update the
     * cluster statistics; otherwise I go ahead and put the point in the retained set
     */
    (*chunk_size)--;
    if (candidate > -1) {
        /**
         * I add the point to the discard/compressed set
         */
        clusters[candidate][0]++;
        set[candidate] = realloc(set[candidate], (unsigned long) clusters[candidate][0] * sizeof(int));
        if (set[candidate] == NULL) {
            printf("Something went wrong in assign_point_to_cluster(), memory allocation failed!");
            exit(1);
        }
        set[candidate][(int) clusters[candidate][0] - 1] = (int) point[0];
        /**
         * I update the cluster statistics
         */
        for (int i = 1; i < d + 1; i++) {
            clusters[candidate][i] += point[i];
            clusters[candidate][i + d] += pow(point[i], 2);
        }
    }
    else if (retained_set) {
        /**
         * I insert the point in the retained set
         */
        p_in_r++;
        retained_set = realloc(retained_set, p_in_r * sizeof(double *));
        retained_set[p_in_r - 1] = malloc((d + 1) * sizeof(double));
        memcpy(*(retained_set + p_in_r - 1), point, (d + 1) * sizeof(double));
        return 1;
    }
    return 0;
}
int find_candidate_cluster(double **clusters, double *point, double *std_deviation, int d, int k) {
    double actual = DBL_MAX;
    int candidate = -1;
    double threshold;
    double distance;
    for (int j = 0; j < k; j++) {
        /**
         * Calculation of variance, threshold and Mahalanobis distance
         */
        compute_std_dev(clusters[j], std_deviation, d);
        //TODO: Would it be okay as a threshold? An alternative could be the module?
        threshold = 3.5 * mean(std_deviation, d);
        distance = mahalanobis(clusters[j], point, std_deviation, d);
        if (distance < threshold && distance < actual) {
            /**
             * The cluster is a candidate for the point
             */
            candidate = j;
            actual = distance;
        }
    }
    return candidate;
}

double mean(const double *vector, int d) {
    double sum = 0;
    for (int i = 0; i < d; ++i) {
        sum += vector[i];
    }
    return sum / d;
}
void compute_std_dev(const double *cluster, double *standard_deviation_vector, int d) {
    double sigma;
    /**
     * Vector of the variances of the components of the cluster elements
     */
    for (int i = 0; i < d; i++) {
        sigma = sqrt(fabs(cluster[i + 1 + d] / cluster[0] - pow(cluster[i + 1] / cluster[0], 2)));
        if (sigma == 0)
            sigma = 1;
        standard_deviation_vector[i] = sigma;
    }
}

double mahalanobis(const double *cluster, const double *point, const double *standard_deviation, int d) {
    double distance = 0;
    for (int i = 1; i < d; ++i) {
        distance += pow((point[i] - cluster[i]) / standard_deviation[i - 1], 2);
    }
    return sqrt(distance) / d; //TODO: can it be okay? I thought so since the threshold is the average of the st.dev.
}

double compute_sum_of_euclidean_distance(double **center_points, double *point, int n, int d) {
    double component_sum = 0;
    double final_sum = 0;
    for (int i = 0; i < n; i++) {
        for (int j = 1; j < d + 1; j++) {
            component_sum += pow(center_points[i][j] - point[j], 2);
        }
        final_sum += sqrt(component_sum);
    }
    return final_sum;
}
void feature_scaling(double **dataset, int d, int chunk_size) {
    /**
     * We perform a Z-score normalization
     **/
    double mean;
    double sigma;
    double sum;
    double sumQ;
    double variance;
    /**
     * We calculate mean and variance for each column
     **/
    for (int i = 1; i < d + 1; i++) {
        sum = 0;
        for (int j = 0; j < chunk_size; j++) {
            sum += dataset[j][i];
        }
        mean = sum / chunk_size;
        sumQ = 0;
        for (int j = 0; j < chunk_size; j++) {
            sumQ += pow((dataset[j][i] - mean), 2);
        }
        variance = sumQ / chunk_size;
        sigma = sqrt(variance);
        if (sigma == 0)
            sigma = 1;
        /**
         * Feature scaling: (x - x_mean) / sigma
         **/
        for (int j = 0; j < chunk_size; j++) {
            dataset[j][i] = (dataset[j][i] - mean) / sigma;
        }
    }
}
The command I use to run it is:
./main "db.csv" 100 35 4 3
It works if the 3rd argument is less than 34.
The file db.csv contains:
CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),cluster
1,0,19,15,39,4
2,0,21,47,81,3
3,1,20,56,6,4
4,1,23,16,77,3
5,1,31,17,40,4
6,1,22,17,76,3
7,1,35,18,6,4
8,1,23,18,94,3
9,0,64,19,3,4
10,1,30,19,72,3
11,0,67,19,14,4
12,1,35,19,99,3
13,1,58,20,15,4
14,1,24,20,77,3
15,0,37,20,13,4
16,0,22,20,79,3
17,1,35,21,35,4
18,0,20,21,66,3
19,0,52,23,29,4
20,1,35,23,98,3
21,0,35,24,35,4
22,0,25,24,73,3
23,1,46,25,5,4
24,0,31,25,73,3
25,1,54,28,14,4
26,0,29,28,82,3
27,1,45,28,32,4
28,0,35,28,61,3
29,1,40,29,31,4
30,1,23,29,87,3
31,0,60,30,4,4
32,1,21,30,73,3
33,0,53,33,4,4
34,0,18,33,92,3
35,1,49,33,14,4
36,1,21,33,81,3
37,1,42,34,17,4
38,1,30,34,73,3
39,1,36,37,26,4
40,1,20,37,75,3
41,1,65,38,35,0
42,0,24,38,92,3
43,0,48,39,36,0
44,1,31,39,61,5
45,1,49,39,28,4
46,1,24,39,65,3
47,1,50,40,55,0
48,1,27,40,47,5
49,1,29,40,42,5
50,1,31,40,42,5
51,1,49,42,52,0
52,0,33,42,60,5
53,1,31,43,54,5
54,0,59,43,60,0
55,1,50,43,45,0
56,0,47,43,41,0
57,1,51,44,50,0
58,0,69,44,46,0
59,1,27,46,51,5
60,0,53,46,46,0
61,0,70,46,56,0
62,0,19,46,55,5
63,1,67,47,52,0
64,1,54,47,59,0
65,0,63,48,51,0
66,0,18,48,59,5
67,1,43,48,50,0
68,1,68,48,48,0
69,0,19,48,59,5
70,1,32,48,47,5
71,0,70,49,55,0
72,1,47,49,42,0
73,1,60,50,49,0
74,1,60,50,56,0
75,0,59,54,47,0
76,0,26,54,54,5
77,1,45,54,53,0
78,0,40,54,48,5
79,1,23,54,52,5
80,1,49,54,42,0
81,0,57,54,51,0
82,0,38,54,55,5
83,0,67,54,41,0
84,1,46,54,44,0
85,1,21,54,57,5
86,0,48,54,46,0
87,1,55,57,58,0
88,1,22,57,55,5
89,1,34,58,60,5
90,1,50,58,46,0
91,1,68,59,55,0
92,0,18,59,41,5
93,0,48,60,49,0
94,1,40,60,40,5
95,1,32,60,42,5
96,0,24,60,52,5
97,1,47,60,47,0
98,1,27,60,50,5
99,0,48,61,42,0
100,0,20,61,49,5
101,1,23,62,41,5
102,1,49,62,48,0
103,0,67,62,59,0
104,0,26,62,55,5
105,0,49,62,56,0
106,1,21,62,42,5
107,1,66,63,50,0
108,0,54,63,46,0
109,0,68,63,43,0
110,0,66,63,48,0
111,0,65,63,52,0
112,1,19,63,54,5
113,1,38,64,42,5
114,0,19,64,46,5
115,1,18,65,48,5
116,1,19,65,50,5
117,1,63,65,43,0
118,1,49,65,59,0
119,1,51,67,43,0
120,1,50,67,57,0
121,0,27,67,56,5
122,1,38,67,40,5
123,1,40,69,58,5
124,0,39,69,91,1
125,1,23,70,29,5
126,1,31,70,77,1
127,0,43,71,35,2
128,0,40,71,95,1
129,0,59,71,11,2
130,0,38,71,75,1
131,0,47,71,9,2
132,0,39,71,75,1
133,1,25,72,34,5
134,1,31,72,71,1
135,0,20,73,5,2
136,1,29,73,88,1
137,1,44,73,7,2
138,0,32,73,73,1
139,0,19,74,10,2
140,1,35,74,72,1
141,1,57,75,5,2
142,0,32,75,93,1
143,1,28,76,40,5
144,1,32,76,87,1
145,0,25,77,12,2
146,0,28,77,97,1
147,0,48,77,36,2
148,1,32,77,74,1
149,1,34,78,22,2
150,0,34,78,90,1
151,0,43,78,17,2
152,0,39,78,88,1
153,1,44,78,20,2
154,1,38,78,76,1
155,1,47,78,16,2
156,1,27,78,89,1
157,0,37,78,1,2
158,1,30,78,78,1
159,0,34,78,1,2
160,1,30,78,73,1
161,1,56,79,35,2
162,1,29,79,83,1
163,0,19,81,5,2
164,1,31,81,93,1
165,0,50,85,26,2
166,1,36,85,75,1
167,0,42,86,20,2
168,1,33,86,95,1
169,1,36,87,27,2
170,0,32,87,63,1
171,0,40,87,13,2
172,0,28,87,75,1
173,0,36,87,10,2
174,0,36,87,92,1
175,1,52,88,13,2
176,1,30,88,86,1
177,0,58,88,15,2
178,0,27,88,69,1
179,0,59,93,14,2
180,0,35,93,90,1
181,1,37,97,32,2
182,1,32,97,86,1
183,0,46,98,15,2
184,1,29,98,88,1
185,1,41,99,39,2
186,0,30,99,97,1
187,1,54,101,24,2
188,0,28,101,68,1
189,1,41,103,17,2
190,1,36,103,85,1
191,1,34,103,23,2
192,1,32,103,69,1
193,0,33,113,8,2
194,1,38,113,91,1
195,1,47,120,16,2
196,1,35,120,79,1
197,1,45,126,28,2
198,0,32,126,74,1
199,0,32,137,18,2
200,0,30,137,83,1
Originally found on Kaggle but I made some modifications.
Edit: I included the whole code
Edit: I also get this error when trying to see what is in retained_set: read memory from 0x3d2fdfcb8030 failed (0 of 8 bytes read)
Edit: I translated the comments in the code and added the file I use as input
assign_point_to_cluster has a local variable double **retained_set. This means that you cannot do retained_set = realloc(retained_set, ...), or you will just change where that local variable points, not where the pointer-to-pointer on the caller's side points. And because of that, you also create a memory leak. See this FAQ: Dynamic memory access only works inside function
As for how to solve it, it appears that encapsulating all of this data into structs would simplify the program a lot. You could also implement it as an "opaque type" (How to do private encapsulation in C?) and get rid of the caller's responsibility to handle dynamic allocation.
Using 2D arrays instead of pointer-to-pointers might also simplify the program and improve performance. For example, if you could use a "pointer to array pointer" parameter double (**retained_set)[x][y], then you could do double (*tmp)[x][y] = realloc(*retained_set, ...) and then *retained_set = tmp;, which would affect the caller. But structs would be easier to read, so that should be the first option. A sketch of the extra-indirection idea applied to the code as written follows below.
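As a minimal sketch of that idea applied to the poster's pointer-to-pointer layout (grow_retained_set is a hypothetical helper, not something from the program), note how realloc's result is written through the extra level of indirection so the caller's pointer is updated:

#include <stdlib.h>
#include <string.h>

/* Append one point (point_dim + 1 doubles, id included) to the retained set.
   Returns 0 on success, -1 on allocation failure. */
int grow_retained_set(double ***retained_set, int *count, const double *point, int point_dim)
{
    double **tmp = realloc(*retained_set, (*count + 1) * sizeof *tmp);
    if (tmp == NULL)
        return -1;          /* *retained_set is still valid, nothing leaked */
    *retained_set = tmp;    /* this assignment is what the caller actually sees */
    tmp[*count] = malloc((point_dim + 1) * sizeof(double));
    if (tmp[*count] == NULL)
        return -1;
    memcpy(tmp[*count], point, (point_dim + 1) * sizeof(double));
    (*count)++;
    return 0;
}

assign_point_to_cluster would then take a double ***retained_set parameter, and main() would pass &retained_set, so the reallocation is visible after the call returns.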
Also note that malloc.h has been obsolete since forever. difference between <stdlib.h> and <malloc.h>

parallelizing Mandelbrot using MPI

I am trying to parallelize the Mandelbrot area computation. The correct output should be around 1.510659; however, I am not getting that correctly.

** PROGRAM: Mandelbrot area
**
** PURPOSE: Program to compute the area of a Mandelbrot set.
** The correct answer should be around 1.510659.
**
** USAGE: Program runs without input ... just run the executable
**
** (reduction for numoutside)

This is my parallelized code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

#define NPOINTS 1000
#define MAXITER 1000

int P = 1;

struct d_complex
{
    double r;
    double i;
};

int testpoint(struct d_complex);

struct d_complex c;
struct d_complex cPart;
int numoutside = 1;

int main()
{
    int i, j, row;
    int res;
    double area, error, eps = 1.0e-5;
    int myrank, mysize;
    double stsec, ensec, commtime, maxcommtime;
    MPI_Status status;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &mysize);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    stsec = MPI_Wtime();
    // Loop over grid of points in the complex plane which contains the Mandelbrot set,
    // testing each point to see whether it is inside or outside the set.
    /*for (i = 0; i < NPOINTS; i++)
    {
        for (j = 0; j < NPOINTS; j++)
        {
            c.r = -2.0 + 2.5 * (double)(i) / (double)(NPOINTS) + eps;
            c.i = 1.125 * (double)(j) / (double)(NPOINTS) + eps;
            testpoint(c);
        }
    }*/
    if (myrank == 0)
    {
        /* Begin User Program - the master */
        int outsum, nb_pixel = NPOINTS * NPOINTS;
        for (i = 0; i < nb_pixel; i++)
        {
            MPI_Recv(&res, 1, MPI_INT, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
            // printf("Slave id %d has send : %d \n", status.MPI_SOURCE, data[2]);
            // printf("%d: [%d,%d] -> [%d,%d] = %d\n", status.MPI_SOURCE, data[0], data[1], data[0] + MAXX, data[1] + MAXY, data[2]);
            res += numoutside;
        }
        area = 2.0 * 2.5 * 1.125 * (double)(NPOINTS * NPOINTS - res) / (double)(NPOINTS * NPOINTS);
        error = area / (double)NPOINTS;
        printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n", area, error);
        printf("Finish.\n");
    }
    else
    {
        for (i = myrank; i < NPOINTS; i += mysize)
        {
            for (j = 0; j < NPOINTS; j++)
            {
                c.r = -2.0 + 2.5 * (double)(i) / (double)(NPOINTS) + eps;
                c.i = 1.125 * (double)(j) / (double)(NPOINTS) + eps;
                res = testpoint(c);
                MPI_Send(&res, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
            }
        }
    }
    // Calculate area of set and error estimate and output the results
    MPI_Finalize();
    ensec = MPI_Wtime();
    commtime = ensec - stsec;
    // area = 2.0 * 2.5 * 1.125 * (double)(NPOINTS * NPOINTS - numoutside) / (double)(NPOINTS * NPOINTS);
    // error = area / (double)NPOINTS;
    printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n", area, error);
    if (myrank == 0)
    {
        printf("%.3f\n", commtime);
    }
}

int testpoint(struct d_complex c)
{
    // Does the iteration z = z*z + c, until |z| > 2 when point is known to be outside set.
    // If loop count reaches MAXITER, point is considered to be inside the set.
    struct d_complex z;
    int iter;
    double temp;
    z = c;
    for (iter = 0; iter < MAXITER; iter++)
    {
        temp = (z.r * z.r) - (z.i * z.i) + c.r;
        z.i = z.r * z.i * 2 + c.i;
        z.r = temp;
        if ((z.r * z.r + z.i * z.i) > 4.0)
        {
            // MPI_Send(&numoutside, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
            return numoutside;
        }
    }
    return 0;
}
The expectation is to get around 1.510659 when running the code with NPOINTS = 1000 or 2000, and 2 or 4 processors.
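For reference, a reduction over per-rank counts is usually expressed with a single collective rather than one message per point. A minimal sketch (not the poster's code, and assuming each rank accumulates local_outside over its strided share of the grid):

/* Inside main(), after every rank has tested its share of the grid: */
int local_outside = 0;   /* incremented by this rank for each point found outside */
int total_outside = 0;
/* ... per-rank point-testing loop updates local_outside ... */
MPI_Reduce(&local_outside, &total_outside, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (myrank == 0) {
    double area = 2.0 * 2.5 * 1.125
        * (double)((long)NPOINTS * NPOINTS - total_outside)
        / (double)((long)NPOINTS * NPOINTS);
    printf("Area of Mandelbrot set = %12.8f\n", area);
}

This would also remove the mismatch between the NPOINTS * NPOINTS receives the master posts and the smaller number of sends the workers actually perform in the code above.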

PThreads is not providing a program speedup over serial code

I am creating 2 programs to test the differences in run time of serial matrix multiplication versus parallel matrix multiplication. The parallel code that I have written actually runs slower than the serial code, and enabling additional cores provides no speedup at all; using more cores actually seems to slow the parallel program down.
What is going on here? This is my parallel code: to use it, pass in the matrix size and thread number (see my usage below).
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
#include <pthread.h>

// Time struct + prototypes
struct timespec time1, time2, diffTime;
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** reserveMatrix(int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
void* matMult(void* arg);

// Argstruct
typedef struct {
    double** result;
    int tid;
    int size;
    int s;
    int e;
} argStr;

// Global variables for use by all threads
int size;       // Size of a row and column.
int numThreads; // Number of pThreads to do work
double** mat1;
double** mat2;
double** mat3;

// Main function
int main(int argc, char *argv[]) {
    size = atoi(argv[1]);
    numThreads = atoi(argv[2]);
    mat1 = reserveMatrix(size, size);
    mat2 = reserveMatrix(size, size);
    mat3 = reserveMatrix(size, size);
    if (size == 0) {
        //printf("Matrix cannot be size 0\n");
        return -1;
    }
    // Start timer
    clock_gettime(CLOCK_MONOTONIC, &time1);
    // *********** Begin main operation *********** //
    // Declare necessary local variables
    pthread_t theThreads[numThreads];
    argStr data[numThreads]; // Create numThreads # of argStr objects
    for (int i = 0; i < numThreads; i++) {
        data[i].result = reserveMatrix(size, size);
        data[i].tid = i;     // Self-assigned threadID
        data[i].size = size; // Size of a block
        data[i].s = size * i / numThreads;
        data[i].e = size * (i + 1) / numThreads - 1;
        //printf("I handle operations from %d to %d\n", data[i].s, data[i].e);
    }
    // Start the threads
    for (int i = 0; i < numThreads; i++) {
        pthread_create(&theThreads[i], NULL, matMult, (void*) (&data[i]));
    }
    // Await all threads being done
    for (int i = 0; i < numThreads; i++) {
        pthread_join(theThreads[i], NULL);
    }
    // Rejoin received data
    //printMat(data[1].result, size, size);
    // *********** End main operation *********** //
    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    // Print time
    printf("Pthread Matrix Multiply, %d, %d, %lf\n", size, numThreads, cpuTimeUsed);
}

// Struct timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0) {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}

// Reserve matrix function
double** reserveMatrix(int nRows, int nCols) {
    double** matrix1 = (double**)malloc(nRows * sizeof(double*));
    matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
    // Assign row pointers to "segment" out the data
    for (int r = 1; r < nRows; ++r) {
        matrix1[r] = &(matrix1[0][r * nCols]);
    }
    // Give values to the array
    for (int i = 0; i < nRows * nCols; i++) {
        matrix1[0][i] = i;
    }
    return matrix1;
}

// Print matrix function
void printMat(double** mat1, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%f, ", mat1[i][j]);
        }
        printf("\n");
    }
    printf("End of array print\n");
}

void* matMult(void* arg) {
    //printf("Begin an operation\n");
    argStr* args = (argStr*)arg;
    double** result = args->result;
    int tid = args->tid;
    int size = args->size; // Size of the matrix
    long s = args->s;      // Start
    long e = args->e;      // End
    // Print message to confirm data is getting stored
    //printf("Hello from operation %d! \n", tid);
    //printf("I am working from number %ld to %ld\n", s, e);
    for (int r = s; r <= e; r++) { // May need to declare out of loop
        for (int c = 0; c < size; c++) {
            result[r][c] = 0.0;
            for (int i = 0; i < size; i++) {
                result[r][c] += mat1[r][i] * mat2[i][c];
            }
        }
    }
    // Print multiplied matrix values
    //printMat(mat3, size, size);
    return NULL;
}
This is my serial code: to use it, pass in the same-sized row and column counts (see my usage below).
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>

// Matrix multiply code
// **** Time struct **** //
struct timespec time1, time2, diffTime;

// Prototypes
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** reserveMatrix(int nRows, int nCols);
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);

// Begin main
int main(int argc, char *argv[])
{
    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);
    // Declare the arrays and populate them
    double** arr1 = reserveMatrix(rows, cols);
    double** arr2 = reserveMatrix(rows, cols);
    double** arr3 = reserveMatrix(rows, cols);
    double** arr4 = reserveMatrix(rows, cols);
    double prod1 = matrixProduct(arr1, arr2, rows, cols);
    // Start clock
    clock_gettime(CLOCK_MONOTONIC, &time1);
    arr3 = matrixMultiply(arr1, arr2, arr3, rows, cols);
    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    // Print time
    printf("Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
    // Print input matrix values. Used to test that matrix multiply works - it does
    // Perform a transposition of matrix 2
    for (int r = 0; r < rows; ++r) {
        for (int c = r + 1; c < cols; ++c) {
            double val = arr2[r][c];
            arr2[r][c] = arr2[c][r];
            arr2[c][r] = val;
        }
    }
    // Run matrix multiply again on the newly transposed data
    // Start clock
    clock_gettime(CLOCK_MONOTONIC, &time1);
    arr4 = transMatrixMultiply(arr1, arr2, arr4, rows, cols);
    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    // Print time
    printf("Trans Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
    //double prod2 = matrixProduct(arr3, arr4, rows, cols);
    //printf("The matrix product of m3 and m4 is: %f\n", prod2);
    //printMat(mat3, rows, cols);
    return 0;
}

// Struct timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0) {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}

// Standard matrix multiply
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
    for (int r = 0; r < nRows; ++r) {
        for (int c = 0; c < nCols; ++c) {
            result[r][c] = 0.0;
            for (int i = 0; i < nRows; ++i) {
                result[r][c] += matrix1[r][i] * matrix2[i][c];
            }
        }
    }
    return result;
}

// Transpose matrix multiply
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
    for (int c = 0; c < nCols; ++c) {
        for (int r = 0; r < nRows; ++r) {
            result[c][r] = 0.0;
            for (int i = 0; i < nCols; ++i) {
                result[c][r] += matrix1[c][i] * matrix2[r][i];
            }
        }
    }
    return result;
}

// Reserve data function. Reserves and populates array data
double** reserveMatrix(int nRows, int nCols) {
    double** matrix1 = (double**)malloc(nRows * sizeof(double*));
    matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
    // Assign row pointers to "segment" out the data
    for (int r = 1; r < nRows; ++r) {
        matrix1[r] = &(matrix1[0][r * nCols]);
    }
    // Give values to the array
    for (int i = 0; i < nRows * nCols; i++) {
        matrix1[0][i] = i;
    }
    return matrix1;
}

// Check that matrix1 and matrix2 are the same
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols) {
    double sum = 0.0;
    for (int i = 0; i < nRows * nCols; i++) {
        sum += (mat1[0][i] - mat2[0][i]) * (mat1[0][i] - mat2[0][i]);
        //printf("matrix product pos: %i, sum: %f\n", i, sum);
    }
    return sum;
}

// Print matrix function
void printMat(double** mat1, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%f, ", mat1[i][j]);
        }
        printf("\n");
    }
    printf("End of array print\n");
}
Here is the Linux output of me compiling and running this code. At matrix size 1200 x 1200 the run-time differences are not that pronounced, but the serial code ends up being significantly faster than the parallel code at sizes above 1500 x 1500:
MYPC:~/Projects/matrixMultiply/phase3$ gcc matrixMult.c -o MM
MYPC:~/Projects/matrixMultiply/phase3$ gcc pMatMult.c -lpthread -o PMM
MYPC:~/Projects/matrixMultiply/phase3$ ./MM 1200 1200
Matrix Multiply, 1200, 25.487388
Trans Matrix Multiply, 1200, 16.452777
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 2
Pthread Matrix Multiply, 1200, 2, 22.495115
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 4
Pthread Matrix Multiply, 1200, 4, 22.181686
The timing lines contain the meaningful output. Each reads:
name of the process
matrix size
number of threads spawned (in pThread program only)
run time
Any help would be appreciated. I will be instantly replying to questions for the next 2 hours.
The solution was to terminate the extra processes that were running on my Ubuntu machine. The code worked perfectly fine, as a few users pointed out. Killing all other processes on the machine and then running my parallel code provided the expected speedups.
I am not sure of the precise technical reason, other than that the machine wasn't prioritizing my program while it had others running, resulting in slower times.

Simple CUDA kernel with Bizarre Result?

I am using a CUDA kernel object in MATLAB in order to fill a 2D array with all '55's. The result is very strange: the 2D array only fills up to a certain point; after row 1025, the array is all zeros. Any idea what could be going wrong?
As I mentioned in the comment above, you are mistakenly offsetting the matrix rows. The code below is a full working example proving this point.
#include <thrust/device_vector.h>
#include <cstdio>

__global__ void myKern(double* masterForces, int r_max, int iterations) {
    int threadsPerBlock = blockDim.x * blockDim.y;
    int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
    int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
    int globalIdx = (blockId * threadsPerBlock) + threadId;
    //for (int i = 0; i < iterations; i++) masterForces[globalIdx * r_max + i] = 55;   // mistaken row offset
    for (int i = 0; i < iterations; i++) masterForces[globalIdx * iterations + i] = 55; // correct row offset
}

int main() {
    int ThreadBlockSize = 32;
    int GridSize = 32;
    int reps = 1024;
    int iterations = 2000;
    thrust::device_vector<double> gpuF_M(reps * iterations, 0);
    myKern<<<GridSize, ThreadBlockSize>>>(thrust::raw_pointer_cast(gpuF_M.data()), reps, iterations);
    int numerrors = 0;
    for (int i = 0; i < reps * iterations; i++) {
        double test = gpuF_M[i];
        if (test != 55) { printf("Error %i %f\n", i, test); numerrors++; }
    }
    printf("Finished!\n");
    printf("The number of errors is = %i\n", numerrors);
    getchar();
    return 0;
}

How to Optimize CUDA Sieve of Eratosthenes [closed]

I'm new to CUDA. To get my hands dirty, I tried writing a Sieve of Eratosthenes (for finding all the primes up to some number n).
There are a number of things I had to do to get it to work that it seems shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach.
To compact the entries marked as prime in the isPrime array, I had to do two separate kernel calls. The first counts the number of primes in each threadblock and assigns to each entry i the number of primes in that block at indices less than i. Then I have to make a second call to add in the number of primes in all the previous blocks in order to get the final index.
But it's even worse than that, because to avoid heaps of concurrent reads, I had to store the number of primes in the block in a separate array at each of THREADS_PER_BLOCK indices, effectively doubling the required memory for the algorithm. It seems like there should be a way to have all the threads read the same value for each block rather than having to copy it so many times.
Despite all this, there's still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the value in. Isn't there any way to deal with this?
Could anyone look at my code and tell me if there's anything obvious I could do that would be simpler or more efficient?
Is there anything I'm doing that's particularly inefficient (besides printing out all the primes at the end of course)?
Is it necessary to call synchronize after every kernel call?
Do I need to synchronize after memcpy's as well?
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Thank you
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
#include <math.h>

#define MAX_BLOCKS 256
#define THREADS_PER_BLOCK 256 // Must be a power of 2
#define BLOCK_SPACE 2 * THREADS_PER_BLOCK

__global__ void initialize(int* isPrime, int n) {
    int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    int step = gridDim.x * THREADS_PER_BLOCK;
    int i;
    for (i = idx; i <= 1; i += step) {
        isPrime[i] = 0;
    }
    for (; i < n; i += step) {
        isPrime[i] = 1;
    }
}

__global__ void clearMultiples(int* isPrime, int* primeList, int startInd,
        int endInd, int n) {
    int yidx = blockIdx.y * blockDim.y + threadIdx.y;
    int xidx = blockIdx.x * blockDim.x + threadIdx.x;
    int ystep = gridDim.y * blockDim.y;
    int xstep = gridDim.x * blockDim.x;
    for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) {
        int p = primeList[pnum];
        int pstart = p * (p + xidx);
        int pstep = p * xstep;
        for (int i = pstart; i < n; i += pstep) {
            isPrime[i] = 0;
        }
    }
}

__device__ void makeCounts(int* isPrime, int* addend, int start, int stop) {
    __shared__ int tmpCounts[BLOCK_SPACE];
    __shared__ int dumbCounts[BLOCK_SPACE];
    int idx = threadIdx.x;
    tmpCounts[idx] = ((start + idx) < stop) ? isPrime[start + idx] : 0;
    __syncthreads();
    int numEntries = THREADS_PER_BLOCK;
    int cstart = 0;
    while (numEntries > 1) {
        int prevStart = cstart;
        cstart += numEntries;
        numEntries /= 2;
        if (idx < numEntries) {
            int i1 = idx * 2 + prevStart;
            tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1];
        }
        __syncthreads();
    }
    if (idx == 0) {
        dumbCounts[cstart] = tmpCounts[cstart];
        tmpCounts[cstart] = 0;
    }
    while (cstart > 0) {
        int prevStart = cstart;
        cstart -= numEntries * 2;
        if (idx < numEntries) {
            int v1 = tmpCounts[idx + prevStart];
            int i1 = idx * 2 + cstart;
            tmpCounts[i1 + 1] = tmpCounts[i1] + v1;
            tmpCounts[i1] = v1;
            dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart];
        }
        numEntries *= 2;
        __syncthreads();
    }
    if (start + idx < stop) {
        isPrime[start + idx] = tmpCounts[idx];
        addend[start + idx] = dumbCounts[idx];
    }
}

__global__ void createCounts(int* isPrime, int* addend, int lb, int ub) {
    int step = gridDim.x * THREADS_PER_BLOCK;
    for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) {
        int start = i;
        int stop = min(i + step, ub);
        makeCounts(isPrime, addend, start, stop);
    }
}

__global__ void sumCounts(int* isPrime, int* addend, int lb, int ub,
        int* totalsum) {
    int idx = blockIdx.x;
    int s = 0;
    for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) {
        isPrime[i] += s;
        s += addend[i];
    }
    if (idx == 0) {
        *totalsum = s;
    }
}

__global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub,
        int primeStartInd, int primeCount) {
    int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    int step = gridDim.x * THREADS_PER_BLOCK;
    for (int i = lb + idx; i < ub; i += step) {
        int term = isPrime[i];
        int nextTerm = i + 1 == ub ? primeCount : isPrime[i + 1];
        if (term < nextTerm) {
            primeList[primeStartInd + term] = i;
        }
    }
}

int main(void) {
    printf("Enter upper bound:\n");
    int n;
    scanf("%d", &n);
    int *isPrime, *addend, *numPrimes, *primeList;
    cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int));
    assert(t == cudaSuccess);
    t = cudaMalloc(&addend, n * sizeof(int));
    assert(t == cudaSuccess);
    t = cudaMalloc(&numPrimes, sizeof(int));
    assert(t == cudaSuccess);
    int primeBound = 2 * n / log(n);
    t = cudaMalloc(&primeList, primeBound * sizeof(int));
    assert(t == cudaSuccess);
    int numBlocks = min(MAX_BLOCKS,
            (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
    initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
    t = cudaDeviceSynchronize();
    assert(t == cudaSuccess);
    int bound = (int) ceil(sqrt(n));
    int lb;
    int ub = 2;
    int primeStartInd = 0;
    int primeEndInd = 0;
    while (ub < n) {
        if (primeEndInd > primeStartInd) {
            int lowprime;
            t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int),
                    cudaMemcpyDeviceToHost);
            assert(t == cudaSuccess);
            int numcols = n / lowprime;
            int numrows = primeEndInd - primeStartInd;
            int threadx = min(numcols, THREADS_PER_BLOCK);
            int thready = min(numrows, THREADS_PER_BLOCK / threadx);
            int blockx = min(numcols / threadx, MAX_BLOCKS);
            int blocky = min(numrows / thready, MAX_BLOCKS / blockx);
            dim3 gridsize(blockx, blocky);
            dim3 blocksize(threadx, thready);
            clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList,
                    primeStartInd, primeEndInd, n);
            t = cudaDeviceSynchronize();
            assert(t == cudaSuccess);
        }
        lb = ub;
        ub *= 2;
        if (lb >= bound) {
            ub = n;
        }
        numBlocks = min(MAX_BLOCKS,
                (ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
        createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
        sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
        int primeCount;
        t = cudaMemcpy(&primeCount, numPrimes, sizeof(int),
                cudaMemcpyDeviceToHost);
        assert(t == cudaSuccess);
        assert(primeCount > 0);
        primeStartInd = primeEndInd;
        primeEndInd += primeCount;
        condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb,
                ub, primeStartInd, primeCount);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
    }
    int finalprimes[primeEndInd];
    t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int),
            cudaMemcpyDeviceToHost);
    assert(t == cudaSuccess);
    t = cudaFree(isPrime);
    assert(t == cudaSuccess);
    t = cudaFree(addend);
    assert(t == cudaSuccess);
    t = cudaFree(numPrimes);
    assert(t == cudaSuccess);
    t = cudaFree(primeList);
    assert(t == cudaSuccess);
    for (int i = 0; i < primeEndInd; i++) {
        if (i % 16 == 0)
            printf("\n");
        else
            printf(" ");
        printf("%4d", finalprimes[i]);
    }
    printf("\n");
    return 0;
}
Answering some of your questions.
Fix your error checking as defined in the comments.
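A common way to do that (a sketch, not quoted from the original answer) is a macro that wraps every CUDA runtime call, paired with cudaGetLastError() after kernel launches:

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(call)                                             \
    do {                                                             \
        cudaError_t err_ = (call);                                   \
        if (err_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",          \
                    cudaGetErrorString(err_), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

/* Usage:
   CUDA_CHECK(cudaMalloc((void**)&isPrime, n * sizeof(int)));
   initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
   CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
   CUDA_CHECK(cudaDeviceSynchronize());  // errors raised during execution
*/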
Define what you mean by "concurrent reads". You're concerned about this, but I'm not sure what you mean by it.
Is it necessary to call synchronize after every kernel call?
No, it isn't. If your code is not working correctly, synchronizing after every kernel call then doing proper error checking will tell you if any kernels are not launching correctly. Synchronization is generally not needed for relatively simple single-stream programs like this one. The cuda calls that need to synchronize like cudaMemcpy will do this automatically for you.
Do I need to synchronize after memcpy's as well?
No, cudaMemcpy is synchronous in nature (it will force all cuda calls in the same stream to complete before it begins, and it will not return control to the host thread until the copy is complete.) If you don't want the blocking characteristic (not returning control to the host thread until complete) then you can use the cudaMemcpyAsync version of the call. You would use streams to get around the behavior of forcing all previous cuda calls to complete.
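To illustrate (a sketch under the stated assumptions, not code from the answer), a non-blocking copy in a non-default stream looks like this; note the host buffer must come from cudaMallocHost (pinned memory) for the transfer to actually overlap with host work:

int n = 1 << 20;
int *h_data, *d_data;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMallocHost(&h_data, n * sizeof(int)); // pinned allocation enables true async copies
cudaMalloc(&d_data, n * sizeof(int));
/* ... fill h_data ... */
cudaMemcpyAsync(d_data, h_data, n * sizeof(int), cudaMemcpyHostToDevice, stream);
/* host work here runs while the copy is in flight */
cudaStreamSynchronize(stream); // block only when d_data is actually needed
cudaStreamDestroy(stream);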
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Please define what you mean by "it doesn't work". I compiled your code with THREADS_PER_BLOCK of 512 and 256, and for an upper bound of 1000 it gave the same output in each case.
