* Returns the offset of a local array
* with regards to block decomposition
* of a global array.
* #param (int) process rank
* #param (int) total number of processes
* #param (int) size of global array
* #return (int) offset of local array in global array
#define BLOCK_LOW(id, p, n) ((id)*(n)/(p))
* Returns the index immediately after the
* end of a local array with regards to
* block decomposition of a global array.
* #param (int) process rank
* #param (int) total number of processes
* #param (int) size of global array
* #return (int) offset after end of local array
#define BLOCK_HIGH(id, p, n) (BLOCK_LOW((id)+1, (p), (n)))
* Returns the size of a local array
* with regards to block decomposition
* of a global array.
* #param (int) process rank
* #param (int) total number of processes
* #param (int) size of global array
* #return (int) size of local array
#define BLOCK_SIZE(id, p, n) ((BLOCK_HIGH((id), (p), (n))) - (BLOCK_LOW((id), (p), (n))))
* Returns the rank of the process that
* handles a certain local array with
* regards to block decomposition of a
* global array.
* #param (int) index in global array
* #param (int) total number of processes
* #param (int) size of global array
* #return (int) rank of process that handles index
#define BLOCK_OWNER(i, p, n) (((p)*((i)+1)-1)/(n))
small matrix A.bin of dimension 100 × 50
small matrix B.bin of dimension 50 × 100
large matrix A.bin of dimension 1000 × 500
large matrix B.bin of dimension 500 × 1000
An MPI program should be implemented such that it can
• accept two file names at run-time,
• let process 0 read the A and B matrices from the two data files,
• let process 0 distribute the pieces of A and B to all the other processes,
• involve all the processes to carry out the the chosen parallel algorithm
for matrix multiplication C = A * B ,
• let process 0 gather, from all the other processes, the different pieces
of C ,
• let process 0 write out the entire C matrix to a data file.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include "mpi-utils.c"
void read_matrix_binaryformat (char*, double***, int*, int*);
void write_matrix_binaryformat (char*, double**, int, int);
void create_matrix (double***,int,int);
void matrix_multiplication (double ***, double ***, double ***,int,int, int);
int main(int argc, char *argv[]) {
int id,p; // Process rank and total amount of processes
int rowsA, colsA, rowsB, colsB; // Matrix dimensions
double **A; // Matrix A
double **B; // Matrix B
double **C; // Result matrix C : AB
int local_rows; // Local row dimension of the matrix A
double **local_A; // The local A matrix
double **local_C; // The local C matrix
MPI_Init (&argc, &argv);
MPI_Comm_rank (MPI_COMM_WORLD, &id);
MPI_Comm_size (MPI_COMM_WORLD, &p);
if(argc != 3) {
if(id == 0) {
printf("Usage:\n>> %s matrix_A matrix_B\n",argv[0]);
if (id == 0) {
read_matrix_binaryformat (argv[1], &A, &rowsA, &colsA);
read_matrix_binaryformat (argv[2], &B, &rowsB, &colsB);
if (p == 1) {
matrix_multiplication (&A,&B,&C,rowsA,colsB,colsA);
char* filename = "matrix_C.bin";
write_matrix_binaryformat (filename, C, rowsA, colsB);
return 0;
// For this assignment we have chosen to bcast the whole matrix B:
MPI_Bcast (&colsA, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast (&colsB, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast (&rowsA, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast (&rowsB, 1, MPI_INT, 0, MPI_COMM_WORLD);
local_rows = BLOCK_SIZE(id, p, rowsA);
int *proc_elements = (int*)malloc(p*sizeof(int)); // amount of elements for each processor
int *displace = (int*)malloc(p*sizeof(int)); // displacement of elements for each processor
int i;
for (i = 0; i<p; i++) {
proc_elements[i] = BLOCK_SIZE(i, p, rowsA)*colsA;
displace[i] = BLOCK_LOW(i, p, rowsA)*colsA;
create_matrix (&local_C,local_rows,colsB);
matrix_multiplication (&local_A,&B,&local_C,local_rows,colsB,colsA);
MPI_Gatherv(&local_C[0], rowsA*colsB, MPI_DOUBLE,&C[0],
&proc_elements[0],&displace[0],MPI_DOUBLE,0, MPI_COMM_WORLD);
char* filename = "matrix_C.bin";
write_matrix_binaryformat (filename, C, rowsA, colsB);
free (proc_elements);
free (displace);
free (local_A);
free (local_C);
free (A);
free (B);
free (C);
MPI_Finalize ();
return 0;
void create_matrix (double ***C,int rows,int cols) {
*C = (double**)malloc(rows*sizeof(double*));
(*C)[0] = (double*)malloc(rows*cols*sizeof(double));
int i;
for (i=1; i<rows; i++)
(*C)[i] = (*C)[i-1] + cols;
void matrix_multiplication (double ***A, double ***B, double ***C, int rowsC,int colsC,int colsA) {
double sum;
int i,j,k;
for (i = 0; i < rowsC; i++) {
for (j = 0; j < colsC; j++) {
sum = 0.0;
for (k = 0; k < colsA; k++) {
sum = sum + (*A)[i][k]*(*B)[k][j];
(*C)[i][j] = sum;
/* Reads a 2D array from a binary file*/
void read_matrix_binaryformat (char* filename, double*** matrix, int* num_rows, int* num_cols) {
int i;
FILE* fp = fopen (filename,"rb");
fread (num_rows, sizeof(int), 1, fp);
fread (num_cols, sizeof(int), 1, fp);
/* storage allocation of the matrix */
*matrix = (double**)malloc((*num_rows)*sizeof(double*));
(*matrix)[0] = (double*)malloc((*num_rows)*(*num_cols)*sizeof(double));
for (i=1; i<(*num_rows); i++)
(*matrix)[i] = (*matrix)[i-1]+(*num_cols);
/* read in the entire matrix */
fread ((*matrix)[0], sizeof(double), (*num_rows)*(*num_cols), fp);
fclose (fp);
/* Writes a 2D array in a binary file */
void write_matrix_binaryformat (char* filename, double** matrix, int num_rows, int num_cols) {
FILE *fp = fopen (filename,"wb");
fwrite (&num_rows, sizeof(int), 1, fp);
fwrite (&num_cols, sizeof(int), 1, fp);
fwrite (matrix[0], sizeof(double), num_rows*num_cols, fp);
fclose (fp);
My task is to do a parallel matrix multiplication of matrix A and B and gather the results in matrix C.
I am doing this by dividing matrix A in rowwise pieces and each process is going to use its piece to multiply matrix B, and get back its piece from the multiplication. Then I am going to gather all the pieces from the processes and put them together to matrix C.
I allready posted a similiar question, but this code is improved and I have progressed but I am still getting a segmentation fault after the scatterv call.
So I see a few problems right away:
Here, you're passing not a pointer to doubles, but a pointer to a pointer to a pointer to a double (B is defined as double **B) and you're telling MPI to follow that pointer and send 1 double from there. That is not going to work.
You might think that what you're accomplishing here is sending the pointer to the matrix, from which all tasks can read the array -- that doesn't work. The processes don't share a common memory space (that's why MPI is called distributed memory programming) and the pointer doesn't go anywhere. You're actually going to have to send the contents of the matrix,
MPI_Bcast (&(B[0][0]), rowsB*colsB, MPI_DOUBLE, 0, MPI_COMM_WORLD);
and you're going to have to make sure the other processes have correctly allocated memory for the B matrix ahead of time.
There's similar pointer problems elsewhere:
MPI_Scatterv(&A[0], ..., &local_A[0]
Again, A is a pointer to a pointer to doubles (double **A) as is local_A, and you need to be pointing MPI to pointer to doubles for this to work, something like
MPI_Scatterv(&(A[0][0]), ..., &(local_A[0][0])
that error seems to be present in all the communications routines.
Remember that anything that looks like (buffer, count, TYPE) in MPI means that the MPI routines follow the pointer buffer and send the next count pieces of data of type TYPE there. MPI can't follow pointers within the buffer you sent becaue in general it doens't know they're there. It just takes the next (count * sizeof(TYPE)) bytes from pointer buffer and does whatever communications is appropriate with them. So you have to pass it a pointer to a stream of data of type TYPE.
Having said all that, it would be a lot easier to work with you on this if you had narrowed things down a bit; right now the program you've posted includes a lot of I/O stuff that's irrelevant, and it means that no one can just run your program to see what happens without first figuring out the matrix format and then generating two matrices on their own. When posting a question about source code, you really want to post a (a) small bit of source which (b) reproduces the problem and (c) is completely self-contained.
Consider this an extended comment as Jonathan Dursi has already given a fairly elaborate answer. You matrices are really represented in a weird way but at least you followed the advice given to your other question and allocate space for them as contiguous blocks and not separately for each row.
Given that, you should replace:
A[0] already points to the beginning of the matrix data and there is no need to make a pointer to it. The same goes for local_A[0] as well as for the parameters to the MPI_Gatherv() call.
It has been said many times already - MPI doesn't do pointer chasing and only works with flat buffers.
I've also noticed another mistake in your code - memory for your matrices is not freed correctly. You are only freeing the array of pointers and not the matrix data itself:
should really become
free(A[0]); free(A);
I'm going to use MPI_Pack() to make a message composed of n ints and m doubles. Their positions in the message buffer will be something like this
p1 x ints, q1 x doubles, p2 x ints, q2 x doubles, ..., pN x ints, qN x doubles
where n=p1+p2+...+pN and m=q1+q2+...+qN.
My question: Is the size of this message equal to the size of a message composed of the same number of ints and doubles but with the following order:
n x ints, m x doubles
I'm asking this question because I want to know how much memory should be allocated for the buffer. If the size of the message depends only on the number of ints and doubles and not how they are arranged, then the buffer can be allocated very easily:
MPI_Pack_size(n, MPI_INT, communicator, &k1);
MPI_Pack_size(m, MPI_DOUBLE, communicator, &k2);
buffer = malloc(k1 + k2);
Obviously the following solution is correct:
k = 0;
for (int i=0; i < N; i++)
MPI_Pack_size(p[i], MPI_INT, communicator, &k1);
MPI_Pack_size(q[i], MPI_DOUBLE, communicator, &k2);
k += k1 + k2;
buffer = malloc(k);
But for a large N, it may result in a too excessively large buffer, because as the official document of MPI states, the routine MPI_Pack_size()
returns an upper bound, rather than an exact bound, since the
exact amount of space needed to pack the message may depend on the context (e.g.,
first message packed in a packing unit may take more space).
UPDATE: a program I wrote for testing if the order of packing the ints and doubles affect the size of the message.
#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <stdlib.h>
#include <time.h>
#define BUFF_SIZE 200000 /* buffer size in bytes */
typedef double real;
int main()
int ic = 0, rc = 0; /* counters of int and real numbers */
int pos = 0; /* position in the buffer, used in MPI_Pack() calls */
/* allocate memory of the pack buffer */
void *buff = malloc(BUFF_SIZE);
/* case 1: packing a large number of pairs of arrays */
for (int i=0; i<100; i++) /* 100 array pairs */
/* make int and real arrays of random lengths */
int ik = 99 * ((double)rand() / RAND_MAX) + 1;
int rk = 99 * ((double)rand() / RAND_MAX) + 1;
int *iarr = (int *)malloc(ik * sizeof(int));
double *rarr = (real *)malloc(rk * sizeof(real));
ic += ik;
rc += rk;
/* pack the array pair */
MPI_Pack(iarr, ik, MPI_INT, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
MPI_Pack(rarr, rk, MY_MPI_REAL, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
printf("final position for case 1 = %d\n", pos);
/* case 2: packing a single pair of arrays */
pos = 0;
int *iarr = (int *)malloc(ic * sizeof(int));
double *rarr = (real *)malloc(rc * sizeof(real));
MPI_Pack(iarr, ic, MPI_INT, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
MPI_Pack(rarr, rc, MY_MPI_REAL, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
printf("final position for case 2 = %d\n", pos);
printf("sizeof(int) = %ld, sizeof(real) = %ld\n", sizeof(int), sizeof(real));
printf("num of ints = %d, num of reals = %d\n", ic, rc);
printf("num of ints x sizeof(int) + num of reals x sizeof(real) = %ld\n", ic*sizeof(int)+rc*sizeof(real));
I think your worries are misplaced. The only possible overhead I see would be from alignment: maybe a one time alignment at the start of the buffer, and then maybe per element. However, the pack buffer is counted in bytes, and I just tested it: even packing a single byte does not lead to any padding. So that leads me to suspect that every data type basically takes the exact amount of space.
I am doing a particle simulation, and need to send some part of three different arrays to other processes. How to use MPI user defined types to do this?
For example, suppose, I have three Matrixes with datatype being double, A, B, and C on Process 1. Now I want to send the first two rows of A, B, and C to Process 2. So how to use MPI user defined type to do this, assuming C type storage for these Matrices? Thank you.
Currently, I am copying the first two rows of these Matrices to a single buffer, and then perform MPI Send. These involves basically the following steps:
Copy the first two rows of A, B, and C to a send_buffer on Process 1.
Send the send_buffer from Process 1 to Process 2.
On Process 2, use recv_buffer to receive data from Process 1.
On Process 2, copy data from recv_buffer to A, B, C on Process 2.
I hope there is a better way to do this. Thanks.
In the code below, an MPI data type is defined to communicate a range of rows of a matrix. If there are three matrices then there would be three send/receive. You can compare the following code with your own code to see which one is better.
If you think transferring matrices one by one is not efficient then you might put all matrices inside a struct and make a MPI data type or consider using MPI_PACK.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
void make_layout(int row_begin, int row_end, int ncol, MPI_Datatype* mpi_dtype)
int nblock = 1;
int block_count = (row_end - row_begin + 1) * ncol;
MPI_Aint lb, extent;
MPI_Type_get_extent(MPI_DOUBLE, &lb, &extent);
MPI_Aint offset = row_begin * ncol * extent;
MPI_Datatype block_type = MPI_DOUBLE;
MPI_Type_create_struct(nblock, &block_count, &offset, &block_type, mpi_dtype);
double** allocate(int nrow, int ncol)
double *data = (double *)malloc(nrow*ncol*sizeof(double));
double **array= (double **)malloc(nrow*sizeof(double*));
for (int i=0; i<nrow; i++)
array[i] = &(data[ncol*i]);
return array;
int main()
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// make 3x3 matrix.
int nrow = 3;
int ncol = 3;
double** A = allocate(nrow, ncol);
// make mpi datatype to send rows [0, 1] excluding row 2.
// you can send any range of rows i.e. rows [row_begin, row_end].
int row_begin = 0;
int row_end = 1; // inclusive.
MPI_Datatype mpi_dtype;
make_layout(row_begin, row_end, ncol, &mpi_dtype);
if (rank == 0)
MPI_Send(&(A[0][0]), 1, mpi_dtype, 1, 0, MPI_COMM_WORLD);
MPI_Recv(&(A[0][0]), 1, mpi_dtype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
return 0;
i just learned GPU programming and now i have a task to find a minimum value from 100x100 matrix by doing parallel at CUDA. i have try this code, but it's not showing the answer, instead of showing my initiate value hmin = 9999999.can anyone give me the right code? oh, the code is in C lang.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define size (100*100)
//Kernel Functions & Variable
__global__ void FindMin(int* mat[100][100],int* kmin){
int b=blockIdx.x+threadIdx.x*blockDim.x;
int k=blockIdx.y+threadIdx.y*blockDim.y;
if(mat[b][k] < kmin){
kmin = mat[b][k];
int main(int argc, char *argv[]) {
//Declare Variabel
int i,j,hmaks=0,hmin=9999999,hsumin,hsumax; //Host Variable
int *da[100][100],*dmin,*dmaks,*dsumin,*dsumax; // Device Variable
FILE *baca; //for opening txt file
char buf[4]; //used for fscanf
int ha[100][100],b; //matrix shall be filled by "b"
//1: Read txt File
if (!baca){
printf("Hey, it's not even exist"); //Checking File, is it there?
i=0;j=0; //Matrix index initialization
if(!feof(baca)){ //if not end of file then do
for(i = 0; i < 100; i++){
for(j = 0; j < 100; j++){
fscanf(baca,"%s",buf); //read max 4 char
b=atoi(buf); //parsing from string to integer
ha[i][j]=b; //save it to my matrix
//all file has been read
//time to close the file
//Sesi 2: Allocation data di GPU
cudaMalloc((void **)&da, size*sizeof(int));
cudaMalloc((void **)&dmin, sizeof(int));
cudaMalloc((void **)&dmaks, sizeof(int));
cudaMalloc((void **)&dsumin, sizeof(int));
cudaMalloc((void **)&dsumax, sizeof(int));
//Sesi 3: Copy data to Device
cudaMemcpy(da, &ha, size*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dmin, &hmin, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dmaks, &hmaks, sizeof(int), cudaMemcpyHostToDevice);
//Sesi 4: Call Kernel
//5: Copy from Device to Host
cudaMemcpy(&hmin, dmin, sizeof(int), cudaMemcpyDeviceToHost);
//6: Print that value
printf("Minimum Value = %i \n",hmin);
system("pause"); return 0;
this is my result
Minimum Value = 9999999
Press any key to continue . . .
I saw a few issues in your code.
As mentioned in the comments from MayurK, you got the indexing wrong.
Also as MayurK said, you are comparing two pointers and not the values they point to.
You kernel invocation code asks for 100 x 100 x 1 grid, with each block containing just 1 thread. This is very bad in terms of efficiency. Also, because of this, your b and k will only range from 0 to 99, as the threadIdx.x will always be zero.
Finally, all threads will be running in parallel, resulting in a race condition in kmin = mat[b][k] (which should be *kmin by the way). When you fixed the indexing problem, all threads in the same block will write to the location in global memory at same time. You should use atomicMin() or a parallel reduction for finding the minimum value in parallel.
I want to implement a convolution function to use in mean filter and gaussian filter and I need to implement those 2 filters as well to apply to pgm files.
I have
typedef struct _PGM{
int row;
int col;
int max_value;
int **matrix;
struct and
int convolution(int ** kernel,int ksize, PGM * image, PGM * output){
int i, j, x, y;
int sum;
int data;
int scale =ksize*ksize;
int coeff;
for (x=ksize/2; x<image->row-ksize/2;++x) {
for (y=ksize/2; y<image->col-ksize/2; ++y){
sum = 0;
for (i=-ksize/2; i<=ksize/2; ++i){
for (j=-ksize/2; j<=ksize/2; ++j){
data = image->matrix[x +i][y +j];
coeff = kernel[i+ksize/2][j+ksize/2];
sum += data * coeff;
output->matrix[x][y] = sum / scale;
return sum/scale;
convolution function but I get error(actually it terminates) in convolution function so I could not proceed to filter
Can you help me with the implementation ?
Thank you.
In your convolution there are two things wrong that probably aren't causing the crash. The first is style: You're using x to iterate over the rows of an image, something I picture more as a y displacement, and vice-versa. The second is that when you're computing the sum, you're not resetting the variable sum = 0 prior to evaluating the kernel (the inner two loops) for each pixel. Instead you accumulate sum over all pixels, probably eventually causing integer overflow. While strictly speaking this is UB and could cause a crash, it's not the issue you're facing.
If you would kindly confirm that the crash occurs on the first pixel (x = ksize/2, y = ksize/2), then since the crash occurs at the first coefficient read from the kernel, I suspect you may have passed the "wrong thing" as the kernel. As presented, the kernel is an int**. For a kernel size of 3x3, this means that to call this function correctly, you must have allocated on the heap or stack an array of int*, where you stored 3 pointers to int arrays with 3 coefficients each. If you instead passed a int[3][3] array, the convolution function will attempt to interpret the first one or two int in the array as a pointer to an int when it is not, and try to dereference it to pull in the coefficient. This will most likely cause a segfault.
I also don't know why you are returning the accumulated sum. This isn't a "traditional" output of convolution, but I surmise you were interested in the average brightness of the output image, which is legitimate; In this case you should use a separate and wider integer accumulator (long or long long) and, at the end, divide it by the number of pixels in the output.
You probably found the PGM data structure from the internet, say, here. Allow me to part with this best-practice advice. In my field (computer vision), the computer vision library of choice, OpenCV, does not express a matrix as an array of row pointers to buffers of col elements. Instead, a large slab of memory is allocated, in this case of size image->row * image->col * sizeof(int) at a minimum, but often image->row * image->step * sizeof(int) where image->step is image->col rounded up to the next multiple of 4 or 16. Then, only a single pointer is kept, a pointer to the base of the entire image, although an extra field (the step) has to be kept if images aren't continuous.
I would therefore rework your code thus:
/* Includes */
#include <stdlib.h>
/* Defines */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
/* Structure */
* Mat structure.
* Stores the number of rows and columns in the matrix, the step size
* (number of elements to jump from one row to the next; must be larger than or
* equal to the number of columns), and a pointer to the first element.
typedef struct Mat{
int rows;
int cols;
int step;
int* data;
} Mat;
/* Functions */
* Allocation. Allocates a matrix big enough to hold rows * cols elements.
* If a custom step size is wanted, it can be given. Otherwise, an invalid one
* can be given (such as 0 or -1), and the step size will be chosen
* automatically.
* If a pointer to existing data is provided, don't bother allocating fresh
* memory. However, in that case, rows, cols and step must all be provided and
* must be correct.
* #param [in] rows The number of rows of the new Mat.
* #param [in] cols The number of columns of the new Mat.
* #param [in] step The step size of the new Mat. For newly-allocated
* images (existingData == NULL), can be <= 0, in
* which case a default step size is chosen; For
* pre-existing data (existingData != NULL), must be
* provided.
* #param [in] existingData A pointer to existing data. If NULL, a fresh buffer
* is allocated; Otherwise the given data is used as
* the base pointer.
* #return An allocated Mat structure.
Mat allocMat(int rows, int cols, int step, int* existingData){
Mat M;
M.rows = max(rows, 0);
M.cols = max(cols, 0);
M.step = max(step, M.cols);
if(rows <= 0 || cols <= 0){
M.data = 0;
}else if(existingData == 0){
M.data = malloc(M.rows * M.step * sizeof(*M.data));
M.data = existingData;
return M;
* Convolution. Convolves input by the given kernel (centered) and stores
* to output. Does not handle boundaries (i.e., in locations near the border,
* leaves output unchanged).
* #param [in] input The input image.
* #param [in] kern The kernel. Both width and height must be odd.
* #param [out] output The output image.
* #return Average brightness of output.
* Note: None of the image buffers may overlap with each other.
int convolution(const Mat* input, const Mat* kern, Mat* output){
int i, j, x, y;
int coeff, data;
int sum;
int avg;
long long acc = 0;
/* Short forms of the image dimensions */
const int iw = input ->cols, ih = input ->rows, is = input ->step;
const int kw = kern ->cols, kh = kern ->rows, ks = kern ->step;
const int ow = output->cols, oh = output->rows, os = output->step;
/* Kernel half-sizes and number of elements */
const int kw2 = kw/2, kh2 = kh/2;
const int kelem = kw*kh;
/* Left, right, top and bottom limits */
const int l = kw2,
r = max(min(iw-kw2, ow-kw2), l),
t = kh2,
b = max(min(ih-kh2, oh-kh2), t);
/* Total number of pixels */
const int totalPixels = (r-l)*(b-t);
/* Input, kernel and output base pointers */
const int* iPtr = input ->data;
const int* kPtr = kern ->data + kw2 + ks*kh2;
int* oPtr = output->data;
/* Iterate over pixels of image */
for(y=t; y<b; y++){
for(x=l; x<r; x++){
sum = 0;
/* Iterate over elements of kernel */
for(i=-kh2; i<=kh2; i++){
for(j=-kw2; j<=kw2; j++){
data = iPtr[j + is*i + x];
coeff = kPtr[j + ks*i ];
sum += data * coeff;
/* Compute average. Add to accumulator and store as output. */
avg = sum / kelem;
acc += avg;
oPtr[x] = avg;
/* Bump pointers by one row step. */
iPtr += is;
oPtr += os;
/* Compute average brightness over entire output */
if(totalPixels == 0){
avg = 0;
avg = acc/totalPixels;
/* Return average brightness */
return avg;
* Main
int main(int argc, char* argv[]){
* Coefficients of K. Binomial 3x3, separable. Unnormalized (weight = 16).
* Step = 3.
int Kcoeff[3][3] = {{1, 2, 1}, {2, 4, 2}, {1, 2, 1}};
Mat I = allocMat(1920, 1080, 0, 0);/* FullHD 1080p: 1920x1080 */
Mat O = allocMat(1920, 1080, 0, 0);/* FullHD 1080p: 1920x1080 */
Mat K = allocMat( 3, 3, 3, &Kcoeff[0][0]);
/* Fill Mat I with something.... */
/* Convolve with K... */
int avg = convolution(&I, &K, &O);
/* Do something with O... */
/* Return */
return 0;
Reference: Years of experience in computer vision.
I've got a problem with CUDA. I want to make small program which count letters from array of char.
I read letters from file and save to int variable called N, how many letters read. After that I malloc.
char *b_h, *b_d;
size_t size_char = N * sizeof(char);
b_h = (char *)malloc(size_char);
After malloc I read file again and assign current letter to element of char array (it works):
int j=0;
After that I create an int variable (a_h) as counter.
int *a_h, *a_d;
size_t size_count = 1*sizeof(int);
a_h = (int *)malloc(size_count);
Ok, go with CUDA:
cudaMalloc((void **) &a_d, size_count);
cudaMalloc((void **) &b_d, size_char);
Copy from host to device:
cudaMemcpy(a_d, a_h, size_count, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, size_char, cudaMemcpyHostToDevice);
Set blocks and call CUDA function:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d,b_d,c_d, N);
Receive from function:
cudaMemcpy(a_h, a_d, size_count, cudaMemcpyDeviceToHost);
cudaMemcpy(b_h, d_d, size_char, cudaMemcpyDeviceToHost);
And print count:
printf("\Count: %d\n", a_h[0]);
And it doesn't work. In array of char I have sentence: Super testSuper test ; I'm looking for 'e' letter and I got a_h[0] = 1.
Where is problem?
CUDA function:
__global__ void square_array(int *a, char *b, int *c, int N)
const char* letter = "e";
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N)
if(b[idx] == *letter)
Please, help me.
I'm guessing that N is small enough that your GPU is able to launch all your threads in parallel. So, you start a thread for each character in your array. The threads, all running simultaneously, don't see the output from each other. Instead, each thread reads the value of a[0] (which is 0), and increases it by 1 and stores the resulting value (1). If this is homework, that would have been the basic lesson that the professor wanted to impart.
When multiple threads store a value in the same location simultaneously, it is undefined which thread will get its value stored. In your case, that doesn't matter because all threads that store a value will store the value, "1".
A typical solution would be to have each thread store a value of 0 or 1 in a separate location (depending on if there is a match or not), and then add up the values in a separate step.
You can also use an atomic increase operation.