How to use mpi to send struct that contains arrays of other struct - c

I have these structs and I want to send them from the master process to a slave process (assume we have 2 processes).
/*
 * One element (a picture or a pattern, per struct Manager): an id, a size
 * value and a matrix of ints reached through a pointer-to-pointer.
 */
struct Element
{
    int id;        /* identifier of this element */
    int n;         /* NOTE(review): presumably the matrix dimension — confirm */
    int **matrix;  /* address of the row pointers; the matrix data itself is
                      stored outside the struct, so copying the struct's bytes
                      copies only this address */
};
/* Bundles a matching value with two arrays of Element: pictures and patterns. */
struct Manager
{
/* NOTE(review): meaning not shown here; presumably the value used when matching — confirm */
double matching_value;
/* presumably the number of entries behind `pictures` — confirm against the allocation code */
int num_pictures;
/* pointer to Element data stored outside this struct; only the address is a member */
struct Element *pictures;
/* presumably the number of entries behind `patterns` — confirm against the allocation code */
int num_patterns;
/* pointer to Element data stored outside this struct; only the address is a member */
struct Element *patterns;
};
Any recommendations on how to approach this?
I tried to create an MPI struct datatype, but without success:
// Define the MPI datatype for the Element struct
// NOTE(review): MPI_Element describes a single MPI_INT and is not referenced
// again in this snippet.
MPI_Datatype MPI_Element;
MPI_Type_contiguous(1, MPI_INT, &MPI_Element); // id
MPI_Type_commit(&MPI_Element);
// Two consecutive MPI_INTs, meant to stand for `n` and `matrix`.
// NOTE(review): `matrix` is declared as int **, i.e. a pointer, not an int; a
// pointer value copied to another process does not refer to that process's memory.
MPI_Datatype MPI_ElementArray;
MPI_Type_contiguous(2, MPI_INT, &MPI_ElementArray); // n and **matrix
// The same variable is passed as the old type and as the output handle, so the
// handle of the contiguous type is overwritten by the resized one
// (lower bound 0, extent sizeof(int *)).
MPI_Type_create_resized(MPI_ElementArray, 0, sizeof(int *), &MPI_ElementArray);
MPI_Type_commit(&MPI_ElementArray);
// One MPI_ElementArray; used below for the `pictures` and `patterns` members.
MPI_Datatype MPI_ElementArray2;
MPI_Type_contiguous(1, MPI_ElementArray, &MPI_ElementArray2); // *pictures or *patterns
MPI_Type_commit(&MPI_ElementArray2);
// Define the MPI datatype for the Manager struct
// NOTE(review): blocklengths has 4 initializers, but 5 fields are listed in the
// comment and MPI_Type_create_struct below is called with count 5.
int blocklengths[] = {1, 1, 1, 1}; // matching_value, num_pictures, pictures, num_patterns, patterns
// Displacements are running sums of sizeof(); this assumes the compiler inserts
// no padding between the members — TODO confirm (offsetof(struct Manager, member)
// would not depend on that assumption).
MPI_Aint displacements[] = {0, sizeof(double), sizeof(double) + sizeof(int), sizeof(double) + sizeof(int) + sizeof(struct Element *), sizeof(double) + sizeof(int) + sizeof(struct Element *) + sizeof(int)};
// NOTE(review): the two pointer members are described with MPI_ElementArray2,
// so what would be transferred for them is the bytes stored in the struct
// (the addresses), not the Element arrays they point to.
MPI_Datatype types[] = {MPI_DOUBLE, MPI_INT, MPI_ElementArray2, MPI_INT, MPI_ElementArray2};
MPI_Datatype MPI_Manager;
MPI_Type_create_struct(5, blocklengths, displacements, types, &MPI_Manager);
MPI_Type_commit(&MPI_Manager);

Related

getdents() syscall implementation for rootkit

I plan to hook my own version of getdents() for my rootkit. Code is here:
/*
 * Replacement getdents() handler.
 *
 * Looks up the hook record for __NR_getdents, forwards the call to the saved
 * original handler, logs the name of every directory entry found in the
 * buffer that call filled, and returns the original handler's result
 * unchanged.
 */
asmlinkage int new_getdents(unsigned int fd, struct linux_dirent *dirp, unsigned int count)
{
    t_syscall_hook *hook = find_syscall_hook(__NR_getdents);
    int (*real_getdents)(unsigned int fd, struct linux_dirent *dirp, unsigned int count) =
        (void*) hook->orig_func;
    int total = (*real_getdents)(fd, dirp, count);
    int offset = 0;

    /*
     * The records are variable-length and packed back to back, so the walk
     * advances by each record's own d_reclen. The cast to char * makes the
     * addition count in bytes rather than in sizeof(struct linux_dirent) units.
     */
    while (offset < total) {
        struct linux_dirent *entry = (struct linux_dirent *) ((char*)dirp + offset);

        printk(KERN_INFO "%s\n", entry->d_name);
        offset += entry->d_reclen;
    }

    return total;
}
I fail to understand the type cast in this line: d = (struct linux_dirent *) ((char*)dirp + bpos);
Both d and dirp hold memory address for a linux_dirent struct. d_reclen contains length of the entry. If we receive d_reclen as 3, 5, 7, then entries would be present at dirp, dirp+3 /size(linux_dirent), (dirp+3/size(linux_dirent)+5/size(linux_dirent))...
So the line should be something like this then: d = (struct linux_dirent *) ((dirp + bpos)/size(linux_dirent));
Why are we converting into (char *)?
typedef struct {
unsigned long d_ino;
unsigned long d_off;
unsigned short d_reclen;
char d_name[1];
} linux_dirent;
So the line should be something like this then: d = (struct linux_dirent *) ((dirp + bpos)/size(linux_dirent));
No - dirp / sizeof(linux_dirent) makes little sense, the offset of dirp from 0 has no relation to the size of the structure. Dividing memory address by the size of the structure... it's just some unrelated address.
You probably meant to divide only the offset from the base address by the structure size, and then add the result to the base pointer. Something along these lines:
(char*)dirp + ((char*)(dirp + bpos) - (char*)dirp)/sizeof(linux_dirent)
^^^^^^^^^^^ = (char*)dirp + bpos * sizeof(linux_dirent)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ = bpos * sizoef(linux_dirent)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ = bpos
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ = (char*)dirp + bpos
But instead of incrementing the dirp pointer in steps of sizeof(linux_dirent), you can simply increment it in 1-byte steps. That is what the cast to char* does, because sizeof(char) is always 1. The following equality holds for the resulting addresses:
dirp + bpos == (char*)dirp + bpos * sizeof(linux_dirent)
Why are we converting into (char *)?
We are converting to char * to change how many bytes the + operator will increment. Short example:
int *a = 20;
a + 5 == 20 + 5 * sizeof(int)
(char*)a + 5 == 20 + 5 * sizeof(char) // sizeof(char) = 1, so:
(char*)a + 5 == 25
Pointer arithmetic is a good topic to research.

CUDA C allocating GPU memory for a struct of structs [duplicate]

I have a structure with arrays of structures inside in C, and I need a copy of that in the GPU. For that I am writing a function that makes some cudaMalloc and cudaMemcpys of the variables in the struct from host to device.
A simple version (the real one has various structs and variables/arrays inside) of the struct is:
// A node of the graph; carries a pointer to its float data.
struct Node {
// pointer to this node's float values, which are stored outside the struct
float* position;
};
// A graph: a Node array and an unsigned-int boundary array, each paired with a count.
struct Graph{
// count paired with `node` (presumably its number of entries — confirm)
unsigned int nNode;
// pointer to the Node array, stored outside the struct
Node* node;
// count paired with `boundary` (presumably its number of entries — confirm)
unsigned int nBoundary;
// pointer to the boundary values, stored outside the struct
unsigned int* boundary;
};
My problem is that I must be doing something wrong in the memory allocation and copy of the struct. When I copy the variables within Graph, I can see that they are properly copied (by accessing them in a kernel as in the example below). For example, I can check that graph.nBoundary=3.
However, I can only see this if I do not allocate and copy the memory of Node *. If I do, I get -858993460 instead of 3. Interestingly, Node * is not wrongly allocated, as I can inspect the value of, say, graph.node[0].position[0] and it has the correct value.
This only happens with the graph.nBoundary. All the other variables remain with the correct numerical values, but this one gets "wronged" when running the cudaMemcpy of the Node*.
What am I doing wrong and why does this happen? How do I fix it?
Let me know if you need more information.
MCVE:
#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A point, part of some elements.
// Holds only a pointer; the float values themselves are allocated separately
// (main() below gives every node its own 3-float array).
struct Node {
// pointer to this node's float values
float* position;
};
// A graph: a Node array and an unsigned-int boundary array, each paired with a count.
struct Graph{
// number of entries in `node` (main() allocates 2 Nodes and sets this to 2)
unsigned int nNode;
// pointer to the Node array, stored outside the struct
Node* node;
// number of entries in `boundary` (main() allocates 3 values and sets this to 3)
unsigned int nBoundary;
// pointer to the boundary values, stored outside the struct
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Does nothing for cudaSuccess. For any other status it prints the error
// string, file and line to stderr and, unless `abort` is false, exits the
// process with the status as exit code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) {
        return;
    }

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) {
        exit(code);
    }
}
// Test kernel: copies the graph's nBoundary field into d_res[0] so the host
// can read it back and check the device-side copy of the struct.
__global__ void testKernel(Graph* graph,unsigned int * d_res){
    const unsigned int boundaryCount = graph->nBoundary;
    *d_res = boundaryCount;
}
// Builds a small graph on the host, copies it to the device with
// cudaGraphMalloc(), runs testKernel on it and prints nBoundary as seen by
// the host and as read back from the device.
int main()
{
    // Host-side test data: 3 boundary values (10, 11, 12) and 2 nodes.
    Graph graph;
    graph.nNode = 2;
    graph.nBoundary = 3;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int b = 0; b < 3; ++b) {
        graph.boundary[b] = b + 10;
    }
    for (int k = 0; k < 2; ++k) {
        // Every node gets its own 3-float array (sizes can differ in the original code).
        float* pos = (float*)malloc(3 * sizeof(float));
        pos[0] = 45;
        pos[1] = 1;
        pos[2] = 2;
        graph.node[k].position = pos;
    }

    // Copy the graph to the GPU.
    Graph* d_graph = cudaGraphMalloc(&graph);

    // One-element result buffers, one on the device and one on the host.
    unsigned int* d_res;
    unsigned int* h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    // Launch one block of one thread, then fetch the value it stored.
    testKernel<<<1, 1>>>(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%d", h_res[0]);
    return 0;
}
// Attempts to build a device-side copy of *inGraph and returns the device
// pointer to it. The Graph struct is allocated on the device first and its
// members are then filled one by one with separate cudaMemcpy calls.
// NOTE(review): several of those copies pass the *contents* of a device array
// as the source where the *address* of that array is what the pointer member
// needs; see the notes on the individual calls.
Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
//copy constants
// &outGraph->member is used as a device destination address for each scalar.
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));
// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
// NOTE(review): the source here is d_auxboundary itself, so the first
// sizeof(unsigned int *) bytes of the boundary *data* are written into the
// `boundary` pointer member, not the address held in d_auxboundary.
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));
//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));
// Create auxiliary pointers to grab them from host and pass them to device
// NOTE(review): both helper arrays are host allocations and are not freed in this function.
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
for (int i = 0; i < inGraph->nNode; i++){
// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
// NOTE(review): same pattern as for `boundary`: the bytes copied into the
// `position` member come from the float data at d_position[i], not from the
// pointer value d_position[i].
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// NOTE(review): this writes nNode * sizeof(Node *) bytes, taken from the
// contents of the auxnode array, starting at the `node` member. For nNode = 2
// that is two pointer-sizes, which is more than the member itself holds;
// presumably the excess lands on the following member, nBoundary, which
// would match the reported symptom — confirm.
gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));
return outGraph;
}
The problem is in the function cudaGraphMalloc where you are trying to allocate device memory to the members of outGraph which has already been allocated on the device. In process of doing so, you are de-referencing a device pointer on host which is illegal.
To allocate device memory to members of struct type variable which exists on the device, we first have to create a temporary host variable of that struct type, then allocate device memory to its members, and then copy it to the struct which exists on the device.
I have answered a similar question here. Please take a look at it.
The fixed code may look like this:
#include <algorithm>
#include <cuda_runtime.h>
#include <cuda.h>
// A point, part of some elements.
// Holds only a pointer; cudaGraphMalloc() below allocates a separate device
// array for every node and stores its address here.
struct Node {
// pointer to this node's float values
float* position;
};
// A graph: a Node array and an unsigned-int boundary array, each paired with a count.
struct Graph {
// number of entries in `node` (cudaGraphMalloc() allocates nNode * sizeof(Node) for it)
unsigned int nNode;
// pointer to the Node array, stored outside the struct
Node* node;
// number of entries in `boundary` (cudaGraphMalloc() allocates nBoundary unsigned ints for it)
unsigned int nBoundary;
// pointer to the boundary values, stored outside the struct
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site.
// The do { } while (0) wrapper makes the expansion a single statement, so
// `if (c) gpuErrchk(x); else ...` compiles as written; a bare { } block
// followed by ';' would detach the else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA status.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (the macro passes __FILE__)
//   line  - source line of the call site (the macro passes __LINE__)
//   abort - when true (the default) the process exits with `code` as status
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Test kernel: stores the nBoundary field of the device-side graph in the
// first slot of d_res, from where the host copies it back.
__global__ void testKernel(Graph* graph, unsigned int * d_res) {
    unsigned int* out = d_res;
    out[0] = (*graph).nBoundary;
}
// Builds a small graph on the host, copies it to the device with
// cudaGraphMalloc(), runs testKernel on it and prints nBoundary as seen by
// the host and as read back from the device.
// Returns 0 on success and 1 when a host allocation fails; CUDA failures
// terminate the process through gpuErrchk.
int main()
{
    // Generate some fake data on the CPU
    Graph graph;
    graph.nNode = 2;
    graph.nBoundary = 3;
    graph.node = (Node*)malloc(graph.nNode * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(graph.nBoundary * sizeof(unsigned int));
    if (graph.node == NULL || graph.boundary == NULL) {
        fprintf(stderr, "out of host memory\n");
        return 1;
    }
    for (unsigned int i = 0; i < graph.nBoundary; i++) {
        graph.boundary[i] = i + 10;
    }
    for (unsigned int i = 0; i < graph.nNode; i++) {
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        if (graph.node[i].position == NULL) {
            fprintf(stderr, "out of host memory\n");
            return 1;
        }
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);

    // some dummy variables to test on GPU.
    // The device allocation is checked like every other CUDA call in this file.
    unsigned int * d_res = NULL;
    gpuErrchk(cudaMalloc((void **)&d_res, sizeof(unsigned int)));
    unsigned int * h_res = (unsigned int*)malloc(sizeof(unsigned int));
    if (h_res == NULL) {
        fprintf(stderr, "out of host memory\n");
        return 1;
    }

    //Run kernel
    testKernel << <1, 1 >> >(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
    printf("%u\n", graph.nBoundary);
    printf("%u\n", h_res[0]);

    // Release what main() can reach. The device arrays that cudaGraphMalloc()
    // allocated for the graph's members are not released here, because their
    // addresses are not kept on the host.
    free(h_res);
    gpuErrchk(cudaFree(d_res));
    gpuErrchk(cudaFree(d_graph));
    for (unsigned int i = 0; i < graph.nNode; i++) {
        free(graph.node[i].position);
    }
    free(graph.node);
    free(graph.boundary);
    return 0;
}
// Creates a device-resident copy of *inGraph and returns the device pointer
// to the copied Graph struct.
//
// The struct is first assembled in a host-side staging variable whose pointer
// members receive device addresses; only the finished struct is then uploaded.
// Each node's position array is copied as 3 floats.
// None of the device allocations made here is released by this function.
Graph* cudaGraphMalloc(const Graph* inGraph)
{
    // Host staging copy: scalars are taken over as-is.
    Graph staged;
    staged.nNode = inGraph->nNode;
    staged.nBoundary = inGraph->nBoundary;

    // Boundary values: allocate on the device, then upload.
    const size_t boundaryBytes = inGraph->nBoundary * sizeof(unsigned int);
    gpuErrchk(cudaMalloc((void**)&staged.boundary, boundaryBytes));
    gpuErrchk(cudaMemcpy(staged.boundary, inGraph->boundary, boundaryBytes, cudaMemcpyHostToDevice));

    // Device array that will hold the Node structs.
    gpuErrchk(cudaMalloc((void**)&staged.node, staged.nNode * sizeof(Node)));

    const size_t positionBytes = 3 * sizeof(float);
    for (int k = 0; k < staged.nNode; ++k)
    {
        // Host staging Node: its pointer member is given a device address,
        // the position data is uploaded, and then the Node itself is written
        // into slot k of the device array.
        Node hostNode;
        gpuErrchk(cudaMalloc((void**)&hostNode.position, positionBytes));
        gpuErrchk(cudaMemcpy(hostNode.position, inGraph->node[k].position, positionBytes, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(staged.node + k, &hostNode, sizeof(Node), cudaMemcpyHostToDevice));
    }

    // Finally upload the fully populated struct.
    Graph* deviceGraph;
    gpuErrchk(cudaMalloc((void**)&deviceGraph, sizeof(Graph)));
    gpuErrchk(cudaMemcpy(deviceGraph, &staged, sizeof(Graph), cudaMemcpyHostToDevice));
    return deviceGraph;
}
Be advised that you will have to keep the host copies of internal device pointers (i.e. the auxiliary host variables). This is because you will have to free the device memory later and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case the variable Node auxNodeHost (created in each iteration) and Graph temp are those variables.
The above code does not do that and is just for demonstration purpose.
Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

qsort an array of structs in a struct

I'm attempting to sort an array of structs in a struct based on an int value. I've successfully sorted an array of structs but I'm guessing I'm passing a wrong value somewhere for the nested structs.
I just need the structs in the array sorted by the value of a.
The structs are set up as:
/* Array element: `a` is the sort key named in the question, `b` a second value. */
struct s2{
    int a;
    int b;
};

/* Container holding one int and a fixed array of 10 struct s2 elements. */
struct s1{
    int c;
    struct s2 arr[10];
};
I have a compare function:
/*
 * qsort comparator: returns the difference of the `a` members reached through
 * the two element pointers.
 * NOTE(review): the pointers are interpreted as struct s1, but the qsort call
 * shown below passes myStruct.arr, whose elements are struct s2.
 */
int comp(const void *a, const void *b){
struct s1 *q1 = (struct s1 *)a;
struct s1 *q2 = (struct s1 *)b;
/* q1->arr->a is the `a` member of the first element of q1's array */
return(q1->arr->a - q2->arr->a);
}
And I call qsort:
struct s1 myStruct;
size_t theLen = sizeof(myStruct.arr) / sizeof(struct s2);
qsort(myStruct.arr, 10, theLen, comp);
For the input:
10, 5, 7, 20, 17, 9, 3, 11, 15, 1
I get output:
2147451181, 589824, 327680, 65536, 131072, 4, 5, 11, 15, 8
I'm guessing it may be something to do with how I declare the length?
Thanks!
The file line is:
10 5 7 20 17 9 3 11 15 1
myStruct.arr[i].a is filled from file input using fgets and sscanf:
fgets(t, sizeof(t), fp);
sscanf(t, "%d,...,%d", &myStruct.arr[0].a,...,&myStruct.arr[9].a);
myStruct.arr[i].b is filled with a for loop:
for(int i = 0; i < 10; i++){
myStruct.arr[i].b = i+1;
}
There are two mistakes with your code
You are using q1->arr->a to compare where you should use q1->a (where q1 is of type const struct s2 *). This has also been explained by @GauravSehgal in his answer.
If you look at the third argument to qsort, it is actually the size in bytes of each element to be compared. But you have passed the number of elements. Change your call to -
qsort(myStruct.arr, 10, sizeof(struct s2), comp);
and you should get the desired result.
There are some other points you need to take care of (pointed out by @Stargateur) -
Declare q1 and q2 to be of type const struct s2*, because you do not want to discard the const qualifier.
Do not cast a and b explicitly when assigning to q1 and q2: they are of type const void*, which converts implicitly to a pointer to any const-qualified object type.
qsort(myStruct.arr, 10, theLen, comp);
You are sorting myStruct.arr here, where each element is of type struct s2. So your compare function should be
int comp(const void *a, const void *b){
struct s2 *q1 = (struct s2 *)a;
struct s2 *q2 = (struct s2 *)b;
return(q1->a - q2->a);
}
EDIT: the third parameter to qsort is the size of each element of the array to be sorted. So it should be
qsort(myStruct.arr, theLen, sizeof(struct s2), comp);

Send 2darray through network

Say I want to send a 2 dimensional array through network:
int array_to_send[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
I save this array in a generic one because I don't know the type at compile time:
/*
 * Type-erased description of an array with up to three dimensions: where the
 * data lives, how large one element is and how many elements each dimension has.
 */
struct generic_array_type {
/* Pointer to array */
void *array;
/* Size of one element, in bytes (filled with sizeof(int) in the example) */
size_t elem_size;
/* Number of dimensions, max 3 */
size_t num_dimensions;
/* Dimensions; presumably only the first num_dimensions entries are meaningful — confirm */
size_t dimensions[3];
};
/* Short alias for the descriptor type. */
typedef struct generic_array_type genarray_t;
Then I fill the struct like this:
/* Descriptor for array_to_send: 2 x 4 elements of sizeof(int) bytes each. */
genarray_t _2dgarray;
/* Fill in data */
/* The descriptor refers to array_to_send directly; no copy is made here. */
_2dgarray.array = array_to_send;
_2dgarray.elem_size = sizeof(int);
_2dgarray.num_dimensions = 2;
/* dimensions[2] is not assigned for this two-dimensional array */
_2dgarray.dimensions[0] = 2;
_2dgarray.dimensions[1] = 4;
Now my question is: since ints are affected by the endianness of the system, I want to convert each element to network byte order. I do it like this:
/* Now, compute the size of the array in bytes */
// Simplified for this example
// (only dimensions[0] and dimensions[1] are used; num_dimensions is not consulted)
size_t garray_size = _2dgarray.dimensions[0] * _2dgarray.dimensions[1] * _2dgarray.elem_size;
/* Go through each element */
{
/* Byte-wise view of the array; the swap below modifies this memory in place,
   i.e. the array that _2dgarray.array refers to. */
unsigned char *ptr = _2dgarray.array;
/* Swap routine chosen by element size; stays NULL for unsupported sizes. */
void (*swapBytes)(void *) = NULL;
switch (_2dgarray.elem_size) {
case 2:
swapBytes = swapBytes2;
break;
case 4:
swapBytes = swapBytes4;
break;
default:
/* NOTE(review): after this message swapBytes is still NULL, and the loop
   below calls it whenever garray_size is non-zero. */
fprintf(stderr, "Byte swap not implemented for %zu\n", _2dgarray.elem_size);
break;
}
/* Step through the array one element (elem_size bytes) at a time. */
for (size_t i = 0; i < garray_size; i += _2dgarray.elem_size) {
/* Pointer to element */
void *element = ptr + i;
/* Swap bytes */
/* NOTE(review): the swap is unconditional; presumably the host is assumed to
   be little-endian, since swapping on a host that already uses network byte
   order would reverse it — confirm. */
swapBytes(element);
}
}
/* Now the array should have network order endianness, convert it to char buffer */
/* buffer's length is only known at run time (variable-length array). */
char buffer[garray_size];
memcpy(buffer, (char *)_2dgarray.array, garray_size);
/* Send "buffer" over network... */
Function for byteswapping:
/*
 * Reverse the byte order of the 2-byte object at ptr, in place.
 *
 * The bytes are exchanged through an unsigned char pointer, so the function
 * needs no compiler-specific byte-swap intrinsic, places no alignment
 * requirement on ptr and does not access the object through an incompatible
 * integer type.
 *
 * ptr must point to at least 2 writable bytes.
 */
void swapBytes2(void *ptr)
{
    unsigned char *bytes = ptr;
    unsigned char first = bytes[0];

    bytes[0] = bytes[1];
    bytes[1] = first;
}
/*
 * Reverse the byte order of the 4-byte object at ptr, in place.
 *
 * The bytes are exchanged through an unsigned char pointer, so the function
 * needs no compiler-specific byte-swap intrinsic, places no alignment
 * requirement on ptr and does not access the object through an incompatible
 * integer type.
 *
 * ptr must point to at least 4 writable bytes.
 */
void swapBytes4(void *ptr)
{
    unsigned char *bytes = ptr;
    unsigned char outer = bytes[0];
    unsigned char inner = bytes[1];

    bytes[0] = bytes[3];
    bytes[1] = bytes[2];
    bytes[2] = inner;
    bytes[3] = outer;
}
It works, but I'm really unsure if this is the right way to do it. Especially the "memcpy" part which converts the array to a char array.

How to create a dynamic array vector without using stl?

I have to create and do some operations with dynamic array "vector", but without using stl and malloc. It should be in c. I have no idea how to do it, I googled it but all what I found is information about "vectors" in stl and no malloc(
If I'm understanding the question correctly, you are being asked to implement a dynamic data structure (a vector) without relying on malloc or other library routine to manage dynamic memory.
This means you have to create and manage your own memory pool; basically, declare a large-ish array and "allocate" memory from it to build your vector. You'll need a secondary data structure to track those allocations somehow. Something like the following:
#define MEMORY_SIZE ...
#define MAX_ALLOCS ...
static unsigned char memory[MEMORY_SIZE];
/* Bookkeeping entry for one chunk handed out from the `memory` pool; entries form a list. */
struct allocation_record {
/* start of the chunk; this is the pointer myalloc is meant to return */
unsigned char *start;
/* presumably the size of the chunk in bytes — confirm */
size_t length;
/* next record in the allocs list */
struct allocation_record *next;
};
struct allocation_record *allocs;
/*
 * Skeleton of the pool allocator's allocation routine: the comments in the
 * body list the steps an implementation has to perform for a request of
 * `size` bytes.
 * NOTE(review): the body contains no statements yet, so nothing is returned.
 */
void *myalloc( size_t size )
{
// create a new allocation record
// find the next available chunk of "memory" that
// can satisfy the request
// set the allocation record to point to that chunk of "memory"
// add the allocation record to the allocs list
// return the start pointer in the allocation record
}
/*
 * Skeleton of the pool allocator's release routine: the comments in the body
 * list the steps an implementation has to perform for a pointer previously
 * obtained from myalloc.
 */
void myfree( void *ptr )
{
// search the list of allocation records for one with a
// start member that matches ptr
// mark that memory as available *or* remove the allocation
// record from the list
}
This is very simplistic, almost to the point of being useless, but it should get you thinking in the right direction. The hard bit is figuring out where to grab the next chunk of memory (best fit, first fit, etc.), how to deal with fragmentation, etc.
And this doesn't even get into building the vector itself!
Here is a small proof-of-concept program that uses file I/O. It stores the length of the vector as an unsigned int at the beginning of the file, with the int values following.
It can push and pop values, and has random access (get). Removing values from the middle or beginning of the vector is up to you (it involves shifting all the values around).
Beware that it does no (error) checking whatsoever, that too is left as an exercise to the reader.
#include <stdio.h>
/*
 * Read the element count stored at the start of the file, add `difference`
 * to it, write it back and return the new count.
 *
 * - If the count cannot be read completely (for example the file has no
 *   header yet), it is treated as 0 instead of using an indeterminate value.
 * - A negative `difference` larger than the stored count clamps the result to
 *   0 instead of wrapping around to a huge unsigned value.
 *
 * The result of the write is not checked; the function has no way to report
 * it through its return value.
 */
unsigned int updateLength(FILE * vector, int difference){
    unsigned int length = 0;

    rewind(vector);
    if (fread(&length, 1, sizeof length, vector) != sizeof length) {
        length = 0; /* short read: discard any partially filled bytes */
    }

    if (difference < 0) {
        /* magnitude of the negative step, computed in unsigned arithmetic so
           that INT_MIN is handled as well */
        unsigned int shrink = 0u - (unsigned int)difference;
        length = (shrink > length) ? 0u : length - shrink;
    } else {
        length += (unsigned int)difference;
    }

    /* rewind also clears the end-of-file state left by a short read */
    rewind(vector);
    fwrite(&length, 1, sizeof length, vector);
    return length;
}
/* Append `value`: the stored length is increased by one first, then the value
   is written into the slot whose index equals the previous length. */
void push(FILE * vector, int value){
    unsigned int slot = updateLength(vector, 1) - 1;
    size_t offset = slot * sizeof(int) + sizeof(unsigned int);

    /* values start right after the unsigned int length header */
    fseek(vector, offset, SEEK_SET);
    fwrite(&value, 1, sizeof(int), vector);
}
/* Return the last element. The value stays in the file; only the stored
   length is decreased, so the slot is reused by the next push. */
int pop(FILE * vector){
    int last;
    unsigned int remaining = updateLength(vector, -1);
    size_t offset = remaining * sizeof(int) + sizeof(unsigned int);

    /* the new length is also the index of the element being popped */
    fseek(vector, offset, SEEK_SET);
    fread(&last, 1, sizeof(int), vector);
    return last;
}
/* Random access: return the value stored at index `pos`. The index is not
   validated against the stored length. */
int get(FILE * vector, unsigned int pos){
    int value;
    size_t offset = pos * sizeof(int) + sizeof(unsigned int);

    /* skip the length header, then `pos` ints */
    fseek(vector, offset, SEEK_SET);
    fread(&value, 1, sizeof(int), vector);
    return value;
}
/* Initialise the vector file by writing a length header of 0 at the current
   file position. */
void init(FILE * vector){
    const unsigned int empty = 0;

    fwrite(&empty, sizeof empty, 1, vector);
}
/*
 * Demo: creates vector.dat, pushes three values, pops them again and prints
 * them in pop order ("1234 123 12").
 * Returns 0 on success and 1 when the file cannot be opened or closed.
 */
int main(){
    FILE * vector = fopen("vector.dat", "w+b");
    int v1, v2, v3;

    /* fopen returns NULL on failure; every helper below would dereference it */
    if (vector == NULL) {
        perror("vector.dat");
        return 1;
    }

    init(vector);
    push(vector, 12);
    push(vector, 123);
    push(vector, 1234);
    v1 = pop(vector);
    v2 = pop(vector);
    v3 = pop(vector);
    printf("%i %i %i\n", v1, v2, v3);

    /* fclose flushes buffered writes, so its result matters for a written file */
    if (fclose(vector) != 0) {
        perror("vector.dat");
        return 1;
    }
    return 0;
}

Resources