MPI_Bcast error on multiple nodes - c

Background: I am writing MPI versions of I/O system calls, which are based on the collfs project.
The code runs without error on multiple processors on a single node.
However, running on multiple nodes causes a segmentation fault... The error message with 2 processes, 1 process per node is the following:
$ qsub test.sub
$ cat test.e291810
0: pasc_open(./libSDL.so, 0, 0)
1: pasc_open(./libSDL.so, 0, 0)
1: mptr[0]=0 mptr[len-1]=0
1: MPI_Bcast(mptr=eed11000, len=435104, MPI_BYTE, 0, MPI_COMM_WORLD)
0: mptr[0]=127 mptr[len-1]=0
0: MPI_Bcast(mptr=eeb11000, len=435104, MPI_BYTE, 0, MPI_COMM_WORLD)
_pmiu_daemon(SIGCHLD): [NID 00632] [c3-0c0s14n0] [Sun May 18 13:10:30 2014] PE RANK 0 exit signal Segmentation fault
[NID 00632] 2014-05-18 13:10:30 Apid 8283706: initiated application termination
The function where the error occurs is the following:
static int nextfd = BASE_FD;
#define next_fd() (nextfd++)
int pasc_open(const char *pathname, int flags, mode_t mode)
{
int rank;
int err;
if(!init)
return ((pasc_open_fp) def.open)(pathname, flags, mode);
if(MPI_Comm_rank(MPI_COMM_WORLD, &rank) != MPI_SUCCESS)
return -1;
dprintf("%d: %s(%s, %x, %x)\n", rank, __FUNCTION__, pathname, flags, mode);
/* Handle just read-only access for now. */
if(flags == O_RDONLY || flags == (O_RDONLY | O_CLOEXEC)) {
int fd, len, xlen, mptr_is_null;
void *mptr;
struct mpi_buf { int len, en; } buf;
struct file_entry *file;
if(rank == 0) {
len = -1;
fd = ((pasc_open_fp) def.open)(pathname, flags, mode);
/* Call stat to get file size and check for errors */
if(fd >= 0) {
struct stat st;
if(fstat(fd, &st) >= 0)
len = st.st_size;
else
((pasc_close_fp) def.close)(fd);
}
/* Record them */
buf.len = len;
buf.en = errno;
}
/* Propagate file size and errno */
if(MPI_Bcast(&buf, 2, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
return -1;
len = buf.len;
if(len < 0) {
dprintf("error opening file, len < 0");
return -1;
}
/* Get the page-aligned size */
xlen = page_extend(len);
/* `mmap` the file into memory */
if(rank == 0) {
mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ, MAP_PRIVATE,
fd, 0);
} else {
fd = next_fd();
mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
}
((pasc_lseek_fp) def.lseek)(fd, 0, SEEK_SET);
/* Ensure success on all aux. processes */
if(rank != 0)
mptr_is_null = !mptr;
MPI_Allreduce(MPI_IN_PLACE, &mptr_is_null, 1, MPI_INT, MPI_LAND,
MPI_COMM_WORLD);
if(mptr_is_null) {
if(mptr)
((pasc_munmap_fp) def.munmap)(mptr, xlen);
dprintf("%d: error: mmap/malloc error\n", rank);
return -1;
}
dprintf("%d: mptr[0]=%d mptr[len-1]=%d\n", rank, ((char*)mptr)[0], ((char*)mptr)[len-1]);
/* Propagate file contents */
dprintf("%d: MPI_Bcast(mptr=%x, len=%d, MPI_BYTE, 0, MPI_COMM_WORLD)\n",
rank, mptr, len);
if(MPI_Bcast(mptr, len, MPI_BYTE, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
return -1;
if(rank != 0)
fd = next_fd();
/* Register the file in the linked list */
file = malloc(sizeof(struct file_entry));
file->fd = fd;
file->refcnt = 1;
strncpy(file->fn, pathname, PASC_FNMAX);
file->mptr = mptr;
file->len = len;
file->xlen = xlen;
file->offset = 0;
/* Reverse stack */
file->next = open_files;
open_files = file;
return fd;
}
/* Fall back to independent access */
return ((pasc_open_fp) def.open)(pathname, flags, mode);
}
The error occurs at the final MPI_Bcast call. I am at a loss as to why it is happening: the memory it copies from and to I can dereference just fine.
I am using MPICH on a custom Cray XC30 machine running SUSE Linux x86_64.
Thanks!
EDIT: I have tried replacing the MPI_Bcast call with a MPI_Send/MPI_Recv pair, and the result is the same.

The Cray MPI implementation probably does some magic for performance reasons. Without knowing the internals much of the answer is a guess.
The intra-node communication likely does not utilize the network stack, relying on some sort of shared memory communication. When you try to send an mmap-ed buffer over the network stack (the inter-node case), something somewhere breaks - the DMA engine (I'm wildly guessing here) cannot handle this case.
You can try to page lock the mmaped buffer - perhaps mlock will work just fine.
If that fails, then go with copying the data into malloced buffer.

Related

Reading memory of another process in C without ptrace in linux

I am trying to read memory of another process and print whatever is in the memory (Heap and/or stack). I have got the range of memory addresses using /proc
I have extracted address range like this. Now I want to read the memory range of the other process like as defined.
5569032d2000-5569032f3000 rw-p 00000000 00:00 0 [heap]
I am stuck on how to access those memory addresses. I tried something like shown below , but doesn't help much.
/*
 * Prints LENGTH bytes, in hex, from the /dev/mem device starting at OFFSET.
 *
 * Usage: prog OFFSET LENGTH   (both parsed by strtoul with base 0)
 * Returns 0 on success, -1 on a usage, open or mmap error.
 */
int main(int argc, char *argv[]) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s OFFSET LENGTH\n", argc > 0 ? argv[0] : "prog");
        return -1;
    }
    off_t offset = strtoul(argv[1], NULL, 0);
    size_t len = strtoul(argv[2], NULL, 0);
    // Truncate offset to a multiple of the page size, or mmap will fail.
    size_t pagesize = sysconf(_SC_PAGE_SIZE);
    off_t page_base = (offset / pagesize) * pagesize;
    off_t page_offset = offset - page_base;
    // open() needs an explicit access mode; O_SYNC alone is not one.
    int fd = open("/dev/mem", O_RDONLY | O_SYNC);
    if (fd < 0) {
        perror("Can't open /dev/mem");
        return -1;
    }
    unsigned char *mem = mmap(NULL, page_offset + len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, page_base);
    if (mem == MAP_FAILED) {
        perror("Can't map memory");
        close(fd);
        return -1;
    }
    size_t i;
    for (i = 0; i < len; ++i)
        printf("%x ", (int)mem[page_offset + i]);
    munmap(mem, page_offset + len);
    close(fd);
    return 0;
}
Thanks.
I am making like a debug tool for my embedded system. I can't use ptrace() as it halts the running process while trying to peek into the device memory.
I figured out that, to read the memory of another process, I can use the process_vm_readv() function as follows:
pid_t pid; // Put value of pid in this
void *remotePtr; // Put starting address
size_t bufferLength; // Put size of buffer in this, aka size to read
// Build iovec structs
struct iovec local[1];
local[0].iov_base = calloc(bufferLength, sizeof(char));
local[0].iov_len = bufferLength;
struct iovec remote[1];
remote[0].iov_base = remotePtr;
remote[0].iov_len = bufferLength;
/*Nread will contain amount of bytes of data read*/
nread = process_vm_readv(pid, local, 2, remote, 1, 0);
if (nread < 0) {
switch (errno) {
case EINVAL:
printf("ERROR: INVALID ARGUMENTS.\n");
break;
case EFAULT:
printf
("ERROR: UNABLE TO ACCESS TARGET MEMORY ADDRESS.\n");
break;
case ENOMEM:
printf("ERROR: UNABLE TO ALLOCATE MEMORY.\n");
break;
case EPERM:
printf
("ERROR: INSUFFICIENT PRIVILEGES TO TARGET PROCESS.\n");
break;
case ESRCH:
printf("ERROR: PROCESS DOES NOT EXIST.\n");
break;
default:
printf("ERROR: AN UNKNOWN ERROR HAS OCCURRED.\n");
}
return -1;
}
/* To print the read data */
printf("The read text is \n %s\n", local[0].iov_base);

mmap file not syncing

Hello I am trying to back up a vector by mmap.
However, I have tried msync then munmap but it doesn't work. After I write to the (char *) then munmap the file, the file has no content. The mmap file is also created with flag MAP_SHARED. Would really appreciate it if anyone can help.
// Writes each string of `frontier` into a fixed-width slot (URL_MAX_SIZE bytes)
// of a MAP_SHARED mapping of `filename`, then calls msync and munmap.
//update file descriptor
// NOTE(review): S_IRWXU is OR-ed into the flags argument of open() here.
if ((fd = open(filename.c_str(), O_RDWR | S_IRWXU)) < 0) { //| O_CREAT
printf("ERROR opening file %s for writing", filename.c_str());
exit(1);
}
//lseek create a file large enough
off_t i = lseek(fd, frontier_size * URL_MAX_SIZE, SEEK_SET);
if (i != frontier_size * URL_MAX_SIZE) {
cout << "failed to seek";
}
//reposition and write 3 bytes to the file else will failed to read
char buff[3] = "ta";
// The offset is moved back to 0 before the write below, so the two bytes land
// at the start of the file rather than at the position reached by the seek above.
ssize_t kk = lseek(fd, 0, SEEK_SET);
if (kk < 0) {
cout << "failed to reposition";
}
ssize_t temp_write = write(fd, (void *)& buff, 2);
if (temp_write < 0) {
cout << "failed to write";
cout << temp_write;
}
//reposition to begining
ssize_t k = lseek(fd, 0, SEEK_SET);
if (k < 0) {
cout << "failed to reposition";
}
char * map = (char *)mmap(0, frontier_size * URL_MAX_SIZE, PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
printf("failed mmap");
exit(1);
}
mmap_frontier = map;
//write to frontier
for (int i = 0; i < frontier.size(); ++i) {
strcpy(mmap_frontier, frontier[i].c_str());
mmap_frontier += URL_MAX_SIZE;
}
// Rewind the cursor to the first slot (same address as `map`).
mmap_frontier -= frontier.size() * URL_MAX_SIZE;
// NOTE(review): `k` is declared a second time here in the same fragment.
ssize_t k = lseek(fd, 0, SEEK_SET);
if (k < 0) {
cout << "failed to reposition";
}
// NOTE(review): msync and munmap below receive address 0, not `map`.
int sync = msync((void *)0, frontier.size() * URL_MAX_SIZE, MS_ASYNC);
if (sync < 0 ) {
cout << "failed to sync";
}
int unmap = munmap((void *)0, frontier.size() * URL_MAX_SIZE);
if (unmap < 0) {
cout << "failed to unmap";
}
There are quite a few problems with your code, and with the question:
S_IRWXU is the 3rd argument to open(), not a flag for the 2nd parameter.
mmap() won't work correctly if the file is too small. You can use ftruncate() to set the file size correctly. You tried to seek past the total size of the mapping and write a couple of bytes ("ta"), but before doing that you issued the seek lseek(fd, 0, SEEK_SET) which means the file size was set to 3 rather than mapping_size+3.
You're not backing the vector with an mmapped file, the vector has nothing to do with it, the vector uses its own memory that isn't related in any way to this mapping (please edit your question...).
You called msync() with the address (void *)0, so the actual address which needs to be synced, map, is not being synced.
Likewise, you called munmap() with the address (void *)0, so the actual address which needs to be unmapped is not being unmapped.
You called msync() with MS_ASYNC, which means there's no guarantee that the sync happens before you read the file's contents.
Here's what's working for me (error handling omitted for brevity):
/* Minimal sequence: size the file, map it shared, write, flush, unmap, close. */
unsigned frontier_size = 2;
const unsigned URL_MAX_SIZE = 100;
int fd = open("data", O_RDWR);
loff_t size = frontier_size * URL_MAX_SIZE;
/* Set the file length to the mapping length before mapping it. */
ftruncate(fd, size);
char *map = (char *)mmap(0, size, PROT_WRITE, MAP_SHARED, fd, 0);
strcpy(map, "hello there");
/* Synchronous flush of the mapped range, issued on the mapping address. */
msync(map, size, MS_SYNC);
munmap(map, size);
close(fd);

file mapping vs file system synchronization

I have a file with some data, which is also memory-mapped. So that I have both file descriptor and the pointer to the mapped pages. Mostly the data is only read from the mapping, but eventually it's also modified.
The modification consists of modifying some data within the file (sort of headers update), plus appending some new data (i.e. writing post the current end of the file).
This data structure is accessed from different threads, and to prevent collisions I synchronize access to it (mutex and friends).
During the modification I use both the file mapping and the file descriptor. Headers are updated implicitly by modifying the mapped memory, whereas the new data is written to the file by the appropriate API (WriteFile on windows, write on posix). Worth to note that the new data and the headers belong to different pages.
Since the modification changes the file size, the memory mapping is re-initialized after every such a modification. That is, it's unmapped, and then mapped again (with the new size).
I realize that writes to the mapped memory are "asynchronous" wrt file system, and order is not guaranteed, but I thought there was no problem because I explicitly close the file mapping, which should (IMHO) act as a sort of a flushing point.
Now this works without problem on windows, but on linux (android to be exact) eventually the mapped data turns-out to be inconsistent temporarily (i.e. data is ok when retrying). Seems like it doesn't reflect the newly-appended data.
Do I have to call some synchronization API to ensure the data is flushed properly? If so, which one should I use: sync, msync, syncfs or something different?
Thanks in advance.
EDIT:
This is a pseudo-code that illustrates the scenario I'm dealing with.
(The real code is more complex of course)
struct CompressedGrid
{
mutex m_Lock;
int m_FileHandle;
void* m_pMappedMemory;
Hdr* get_Hdr() { return /* the mapped memory with some offset*/; }
void SaveGridCell(int idx, const Cell& cCompressed)
{
AutoLock scope(m_Lock);
// Write to mapped memory
get_Hdr()->m_pCellOffset[Idx] = /* current end of file */;
// Append the data
lseek64(m_FileHandle, 0, FILE_END);
write(m_FileHandle, cCompressed.pPtr, cCompressed.nSize);
// re-map
munmap(...);
m_pMappedMemory = mmap(...); // specify the new file size of course
}
bool DecodeGridCell(int idx, Cell& cRaw)
{
AutoLock scope(m_Lock);
uint64_t nOffs = get_Hdr()->m_pCellOffset[Idx] = /* ;
if (!nOffs)
return false; // unavail
const uint8_t* p = m_pMappedMemory + nOffs;
cRaw.DecodeFrom(p); // This is where the problem appears!
return true;
}
Use addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, offset) to map the file.
If the size of the file changes, use newaddr = mremap(addr, len, newlen, MREMAP_MAYMOVE) to update the mapping to reflect it. To extend the file, use ftruncate(fd, newlen) before remapping the file.
You can use mprotect(addr, len, protflags) to change the protection (read/write) on any pages in the mapping (both must be aligned on a page boundary). You can also tell the kernel about your future accesses via madvise(), if the mapping is too large to fit in memory at once, but the kernel seems pretty darned good at managing readahead etc. even without those.
When you make changes to the mapping, use msync(partaddr, partlen, MS_SYNC | MS_INVALIDATE) or msync(partaddr, partlen, MS_ASYNC | MS_INVALIDATE) to ensure the changes in the partlen chars from partaddr forward are visible to other mappings and file readers. If you use MS_SYNC, the call returns only when the update is complete. The MS_ASYNC call tells the kernel to do the update, but won't wait until it is done. If there are no other memory maps of the file, the MS_INVALIDATE does nothing; but if there are, that tells the kernel to ensure the changes are reflected in those too.
In Linux kernels since 2.6.19, MS_ASYNC does nothing, as the kernel tracks the changes properly anyway (no msync() is needed, except possibly before munmap()). I don't know if Android kernels have patches that change that behaviour; I suspect not. It is still a good idea to keep them in the code, for portability across POSIXy systems.
mapped data turns-out to be inconsistent temporarily
Well, unless you do use msync(partaddr, partlen, MS_SYNC | MS_INVALIDATE), the kernel will do the update when it sees best.
So, if you need some changes to be visible to file readers before proceeding, use msync(areaptr, arealen, MS_SYNC | MS_INVALIDATE) in the process doing those updates.
If you don't care about the exact moment, use msync(areaptr, arealen, MS_ASYNC | MS_INVALIDATE). It'll be a no-op on current Linux kernels, but it's a good idea to keep them for portability (perhaps commented out, if necessary for performance) and to remind developers about the (lack of) synchronization expectations.
As I commented to OP, I cannot observe the synchronization issues on Linux at all. (That does not mean it does not happen on Android, because Android kernels are derivatives of Linux kernels, not exactly the same.)
I do believe the msync() call is not needed on Linux kernels since 2.6.19 at all, as long as the mapping uses flags MAP_SHARED | MAP_NORESERVE, and the underlying file is not opened using the O_DIRECT flag. The reason for this belief is that in this case, both mapping and file accesses should use the exact same page cache pages.
Here are two test programs, that can be used to explore this on Linux. First, a single-process test, test-single.c:
#define _POSIX_C_SOURCE 200809L
#define _GNU_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/*
 * Reads exactly len bytes from fd, starting at file offset `offset`, into `to`.
 * Interrupted reads (EINTR) are retried. Returns 0 on success; otherwise
 * returns a nonzero errno value: EIO (also stored in errno) when the seek
 * fails or read() returns anything other than a positive count or -1 (for
 * example end of file), or the errno left by a failing read().
 */
static inline int read_from(const int fd, void *const to, const size_t len, const off_t offset)
{
    char *cursor = to;
    size_t remaining = len;

    if (lseek(fd, offset, SEEK_SET) != offset) {
        errno = EIO;
        return EIO;
    }

    while (remaining > 0) {
        const ssize_t got = read(fd, cursor, remaining);

        if (got > 0) {
            cursor += got;
            remaining -= (size_t)got;
            continue;
        }
        if (got == -1) {
            if (errno == EINTR)
                continue;       /* interrupted: try again */
            return errno;
        }
        /* Zero (end of file) or an unexpected negative value. */
        errno = EIO;
        return EIO;
    }
    return 0;
}
/*
 * Writes exactly len bytes from `from` to fd, starting at file offset
 * `offset`. Interrupted writes (EINTR) are retried. Returns 0 on success;
 * otherwise returns a nonzero errno value: EIO (also stored in errno) when
 * the seek fails or write() returns anything other than a positive count or
 * -1, or the errno left by a failing write().
 */
static inline int write_to(const int fd, const void *const from, const size_t len, const off_t offset)
{
    const char *cursor = from;
    size_t remaining = len;

    if (lseek(fd, offset, SEEK_SET) != offset) {
        errno = EIO;
        return EIO;
    }

    while (remaining > 0) {
        const ssize_t put = write(fd, cursor, remaining);

        if (put > 0) {
            cursor += put;
            remaining -= (size_t)put;
            continue;
        }
        if (put == -1) {
            if (errno == EINTR)
                continue;       /* interrupted: try again */
            return errno;
        }
        /* Zero or an unexpected negative value. */
        errno = EIO;
        return EIO;
    }
    return 0;
}
/*
 * Single-process test of coherence between a shared file mapping and
 * read()/write() on the same file.
 *
 * Usage: prog FILENAME COUNT
 * Creates FILENAME (it must not exist), maps its first page, and COUNT times
 * (a) stores two values through the mapping and reads them back from the
 * file, then (b) writes two values to the file and reads them back through
 * the mapping. Mismatches are counted and reported. The file is removed
 * before exit. Returns EXIT_SUCCESS when the test ran, EXIT_FAILURE on a
 * usage or I/O error.
 */
int main(int argc, char *argv[])
{
    unsigned long tests, n, merrs = 0, werrs = 0;
    size_t page;
    long *map, data[2];
    int fd;
    char dummy;

    if (argc != 3) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s FILENAME COUNT\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program will test synchronization between a memory map\n");
        fprintf(stderr, "and reading/writing the underlying file, COUNT times.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    /* The trailing " %c" catches garbage after the number. */
    if (sscanf(argv[2], " %lu %c", &tests, &dummy) != 1 || tests < 1) {
        fprintf(stderr, "%s: Invalid number of tests to run.\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Create the file. */
    page = sysconf(_SC_PAGESIZE);
    fd = open(argv[1], O_RDWR | O_CREAT | O_EXCL, 0644);
    if (fd == -1) {
        fprintf(stderr, "%s: Cannot create file: %s.\n", argv[1], strerror(errno));
        return EXIT_FAILURE;
    }
    if (ftruncate(fd, page) == -1) {
        fprintf(stderr, "%s: Cannot resize file: %s.\n", argv[1], strerror(errno));
        unlink(argv[1]);
        close(fd);
        return EXIT_FAILURE;
    }

    /* Map it. */
    map = mmap(NULL, page, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_NORESERVE, fd, 0);
    if (map == MAP_FAILED) {
        fprintf(stderr, "%s: Cannot map file: %s.\n", argv[1], strerror(errno));
        unlink(argv[1]);
        close(fd);
        return EXIT_FAILURE;
    }

    /* Test loop. */
    for (n = 0; n < tests; n++) {
        /* Update map. */
        map[0] = (long)(n + 1);
        map[1] = (long)(~n);

        /* msync(map, 2 * sizeof map[0], MS_SYNC | MS_INVALIDATE); */

        /* Check the file contents. */
        if (read_from(fd, data, sizeof data, 0)) {
            fprintf(stderr, "read_from() failed: %s.\n", strerror(errno));
            munmap(map, page);
            unlink(argv[1]);
            close(fd);
            return EXIT_FAILURE;
        }
        werrs += (data[0] != (long)(n + 1) || data[1] != (long)(~n));

        /* Update data. */
        data[0] = (long)(n * 386131);
        data[1] = (long)(n * -257);
        if (write_to(fd, data, sizeof data, 0)) {
            fprintf(stderr, "write_to() failed: %s.\n", strerror(errno));
            munmap(map, page);
            unlink(argv[1]);
            close(fd);
            return EXIT_FAILURE;
        }
        merrs += (map[0] != (long)(n * 386131) || map[1] != (long)(n * -257));
    }

    munmap(map, page);
    unlink(argv[1]);
    close(fd);

    if (!werrs && !merrs)
        printf("No errors detected.\n");
    else {
        /* Report each kind of mismatch only when it actually occurred. */
        if (werrs)
            printf("Detected %lu times (%.3f%%) when file contents were incorrect.\n",
                   werrs, 100.0 * (double)werrs / (double)tests);
        if (merrs)
            printf("Detected %lu times (%.3f%%) when mapping was incorrect.\n",
                   merrs, 100.0 * (double)merrs / (double)tests);
    }
    return EXIT_SUCCESS;
}
Compile and run using e.g.
gcc -Wall -O2 test-single.c -o single
./single temp 1000000
to test a million times, whether the mapping and the file contents stay in sync, when both accesses are done in the same process. Note that the msync() call is commented out, because on my machine it is not needed: I never see any errors/desynchronization during testing even without it.
The test rate on my machine is about 550,000 tests per second. Note that each test does it both ways, so includes a read and a write. I just cannot get this to detect any errors. It is written to be quite sensitive to errors, too.
The second test program uses two child processes and a POSIX realtime signal to tell the other process to check the contents. test-multi.c:
#define _POSIX_C_SOURCE 200809L
#define _GNU_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#define NOTIFY_SIGNAL (SIGRTMIN+0)
/*
 * Mapper side of the two-process test.
 *
 * Maps the first len bytes of fd as a shared mapping, then loops: wait for a
 * signal, compare data[0] in the mapping with the expected counter value,
 * store the next two values through the mapping, and signal the sender back.
 * Any signal other than NOTIFY_SIGNAL ends the loop, after which the
 * statistics are printed.
 *
 * Returns EXIT_SUCCESS after a normal stop, EXIT_FAILURE if fd is -1, the
 * mapping cannot be created, or sigwaitinfo() fails.
 */
int mapper_process(const int fd, const size_t len)
{
/* count[0] tallies mismatches, count[1] matches (indexed by the comparison result). */
long value = 1, count[2] = { 0, 0 };
long *data;
siginfo_t info;
sigset_t sigs;
int signum;
if (fd == -1) {
fprintf(stderr, "mapper_process(): Invalid file descriptor.\n");
return EXIT_FAILURE;
}
data = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0);
if (data == MAP_FAILED) {
fprintf(stderr, "mapper_process(): Cannot map file.\n");
return EXIT_FAILURE;
}
/* Signals this process waits for synchronously. */
sigemptyset(&sigs);
sigaddset(&sigs, NOTIFY_SIGNAL);
sigaddset(&sigs, SIGINT);
sigaddset(&sigs, SIGHUP);
sigaddset(&sigs, SIGTERM);
while (1) {
/* Wait for the notification. */
signum = sigwaitinfo(&sigs, &info);
if (signum == -1) {
if (errno == EINTR)
continue;
fprintf(stderr, "mapper_process(): sigwaitinfo() failed: %s.\n", strerror(errno));
munmap(data, len);
return EXIT_FAILURE;
}
if (signum != NOTIFY_SIGNAL)
break;
/* A notify signal was received. Check the write counter. */
count[ (data[0] == value) ]++;
/* Update. */
data[0] = value++;
data[1] = -(value++);
/* Synchronize */
/* msync(data, 2 * sizeof (data[0]), MS_SYNC | MS_INVALIDATE); */
/* And let the writer know. */
/* The reply goes to whichever process sent the notification. */
kill(info.si_pid, NOTIFY_SIGNAL);
}
/* Print statistics. */
printf("mapper_process(): %lu errors out of %lu cycles (%.3f%%)\n",
count[0], count[0] + count[1], 100.0 * (double)count[0] / (double)(count[0] + count[1]));
fflush(stdout);
munmap(data, len);
return EXIT_SUCCESS;
}
/*
 * Reads exactly len bytes from fd, starting at file offset `offset`, into `to`.
 * Interrupted reads (EINTR) are retried. Returns 0 on success; otherwise
 * returns a nonzero errno value: EIO (also stored in errno) when the seek
 * fails or read() returns anything other than a positive count or -1 (for
 * example end of file), or the errno left by a failing read().
 */
static inline int read_from(const int fd, void *const to, const size_t len, const off_t offset)
{
    char *cursor = to;
    size_t remaining = len;

    if (lseek(fd, offset, SEEK_SET) != offset) {
        errno = EIO;
        return EIO;
    }

    while (remaining > 0) {
        const ssize_t got = read(fd, cursor, remaining);

        if (got > 0) {
            cursor += got;
            remaining -= (size_t)got;
            continue;
        }
        if (got == -1) {
            if (errno == EINTR)
                continue;       /* interrupted: try again */
            return errno;
        }
        /* Zero (end of file) or an unexpected negative value. */
        errno = EIO;
        return EIO;
    }
    return 0;
}
/*
 * Writes exactly len bytes from `from` to fd, starting at file offset
 * `offset`. Interrupted writes (EINTR) are retried. Returns 0 on success;
 * otherwise returns a nonzero errno value: EIO (also stored in errno) when
 * the seek fails or write() returns anything other than a positive count or
 * -1, or the errno left by a failing write().
 */
static inline int write_to(const int fd, const void *const from, const size_t len, const off_t offset)
{
    const char *cursor = from;
    size_t remaining = len;

    if (lseek(fd, offset, SEEK_SET) != offset) {
        errno = EIO;
        return EIO;
    }

    while (remaining > 0) {
        const ssize_t put = write(fd, cursor, remaining);

        if (put > 0) {
            cursor += put;
            remaining -= (size_t)put;
            continue;
        }
        if (put == -1) {
            if (errno == EINTR)
                continue;       /* interrupted: try again */
            return errno;
        }
        /* Zero or an unexpected negative value. */
        errno = EIO;
        return EIO;
    }
    return 0;
}
/*
 * Writer side of the two-process test.
 *
 * Loops: write two counter values at file offset 0 with write_to(), signal
 * process `other`, wait for a signal back, then re-read the file and check
 * data[1] against the expected value. The loop ends on any signal other than
 * NOTIFY_SIGNAL, or on a NOTIFY_SIGNAL not sent by `other`; the statistics
 * are then printed. The len parameter is not used in the body.
 *
 * Returns EXIT_SUCCESS after a normal stop, EXIT_FAILURE if write_to(),
 * read_from() or sigwaitinfo() fails.
 */
int writer_process(const int fd, const size_t len, const pid_t other)
{
/* count[0] tallies mismatches, count[1] matches (indexed by the comparison result). */
long data[2] = { 0, 0 }, count[2] = { 0, 0 };
long value = 0;
siginfo_t info;
sigset_t sigs;
int signum;
/* Signals this process waits for synchronously. */
sigemptyset(&sigs);
sigaddset(&sigs, NOTIFY_SIGNAL);
sigaddset(&sigs, SIGINT);
sigaddset(&sigs, SIGHUP);
sigaddset(&sigs, SIGTERM);
while (1) {
/* Update. */
data[0] = ++value;
data[1] = -(value++);
/* then write the data. */
if (write_to(fd, data, sizeof data, 0)) {
fprintf(stderr, "writer_process(): write_to() failed: %s.\n", strerror(errno));
return EXIT_FAILURE;
}
/* Let the mapper know. */
kill(other, NOTIFY_SIGNAL);
/* Wait for the notification. */
signum = sigwaitinfo(&sigs, &info);
if (signum == -1) {
if (errno == EINTR)
continue;
fprintf(stderr, "writer_process(): sigwaitinfo() failed: %s.\n", strerror(errno));
return EXIT_FAILURE;
}
if (signum != NOTIFY_SIGNAL || info.si_pid != other)
break;
/* Reread the file. */
if (read_from(fd, data, sizeof data, 0)) {
fprintf(stderr, "writer_process(): read_from() failed: %s.\n", strerror(errno));
return EXIT_FAILURE;
}
/* Check the read counter. */
count[ (data[1] == -value) ]++;
}
/* Print statistics. */
printf("writer_process(): %lu errors out of %lu cycles (%.3f%%)\n",
count[0], count[0] + count[1], 100.0 * (double)count[0] / (double)(count[0] + count[1]));
fflush(stdout);
return EXIT_SUCCESS;
}
/*
 * Driver for the two-process test.
 *
 * Usage: prog FILENAME SECONDS
 * Creates FILENAME (it must not exist) with a length of one page, forks a
 * mapper child and a writer child that each reopen the file, then waits for
 * a signal - for at most SECONDS when that is positive, otherwise without a
 * timeout. It then sends SIGHUP to both children, reaps them, and removes
 * the file.
 *
 * Returns EXIT_SUCCESS after the children were reaped, EXIT_FAILURE on a
 * usage, signal-mask, file or fork error.
 */
int main(int argc, char *argv[])
{
struct timespec duration;
double seconds;
pid_t mapper, writer, p;
size_t page;
siginfo_t info;
sigset_t sigs;
int fd, status;
char dummy;
if (argc != 3) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s FILENAME SECONDS\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program will test synchronization between a memory map\n");
fprintf(stderr, "and reading/writing the underlying file.\n");
fprintf(stderr, "The test will run for the specified time, or indefinitely\n");
fprintf(stderr, "if SECONDS is zero, but you can also interrupt it with\n");
fprintf(stderr, "Ctrl+C (INT signal).\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
/* The trailing " %c" catches garbage after the number. */
if (sscanf(argv[2], " %lf %c", &seconds, &dummy) != 1) {
fprintf(stderr, "%s: Invalid number of seconds to run.\n", argv[2]);
return EXIT_FAILURE;
}
/* Split SECONDS into whole seconds and nanoseconds; zero means no timeout. */
if (seconds > 0) {
duration.tv_sec = (time_t)seconds;
duration.tv_nsec = (long)(1000000000 * (seconds - (double)(duration.tv_sec)));
} else {
duration.tv_sec = 0;
duration.tv_nsec = 0;
}
/* Block INT, HUP, CHLD, and the notification signal. */
sigemptyset(&sigs);
sigaddset(&sigs, SIGINT);
sigaddset(&sigs, SIGHUP);
sigaddset(&sigs, SIGCHLD);
sigaddset(&sigs, NOTIFY_SIGNAL);
if (sigprocmask(SIG_BLOCK, &sigs, NULL) == -1) {
fprintf(stderr, "Cannot block the necessary signals: %s.\n", strerror(errno));
return EXIT_FAILURE;
}
/* Create the file. */
page = sysconf(_SC_PAGESIZE);
fd = open(argv[1], O_RDWR | O_CREAT | O_EXCL, 0644);
if (fd == -1) {
fprintf(stderr, "%s: Cannot create file: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
if (ftruncate(fd, page) == -1) {
fprintf(stderr, "%s: Cannot resize file: %s.\n", argv[1], strerror(errno));
unlink(argv[1]);
return EXIT_FAILURE;
}
/* The parent does not keep the file open; each child opens it itself. */
close(fd);
fd = -1;
/* Ensure streams are flushed before forking. They should be, we're just paranoid here. */
fflush(stdout);
fflush(stderr);
/* Fork the mapper child process. */
mapper = fork();
if (mapper == -1) {
fprintf(stderr, "Cannot fork mapper child process: %s.\n", strerror(errno));
unlink(argv[1]);
return EXIT_FAILURE;
}
if (!mapper) {
/* Child: run the mapper and return its status as the exit status. */
fd = open(argv[1], O_RDWR);
if (fd == -1) {
fprintf(stderr, "mapper_process(): %s: Cannot open file: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
status = mapper_process(fd, page);
close(fd);
return status;
}
/* Fork the writer child process. (mapper contains the PID of the mapper process.) */
writer = fork();
if (writer == -1) {
fprintf(stderr, "Cannot fork writer child process: %s.\n", strerror(errno));
unlink(argv[1]);
kill(mapper, SIGKILL);
return EXIT_FAILURE;
}
if (!writer) {
/* Child: run the writer and return its status as the exit status. */
fd = open(argv[1], O_RDWR);
if (fd == -1) {
fprintf(stderr, "writer_process(): %s: Cannot open file: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
status = writer_process(fd, page, mapper);
close(fd);
return status;
}
/* Wait for a signal. */
if (duration.tv_sec || duration.tv_nsec)
status = sigtimedwait(&sigs, &info, &duration);
else
status = sigwaitinfo(&sigs, &info);
/* Whatever it was, we kill the child processes. */
kill(mapper, SIGHUP);
kill(writer, SIGHUP);
/* Reap children until waitpid() fails with something other than EINTR. */
do {
p = waitpid(-1, NULL, 0);
} while (p != -1 || errno == EINTR);
/* Cleanup. */
unlink(argv[1]);
printf("Done.\n");
return EXIT_SUCCESS;
}
Note that the child processes open the temporary file separately. To compile and run, use e.g.
gcc -Wall -O2 test-multi.c -o multi
./multi temp 10
The second parameter is the duration of the test, in seconds. (You can interrupt the testing safely using SIGINT (Ctrl+C) or SIGHUP.)
On my machine, the test rate is roughly 120,000 tests per second; the msync() call is commented out here also, because I don't ever see any errors/desynchronization even without it. (Plus, msync(ptr, len, MS_SYNC) and msync(ptr, len, MS_SYNC | MS_INVALIDATE) are horribly slow; with either, I can get less than 1000 tests per second, with absolutely no difference in the results. That's a 100x slowdown.)
The MAP_NORESERVE flag to mmap tells it to use the file itself as backing storage when under memory pressure, rather than swap. If you compile the code on a system that does not recognize that flag, you can omit it. As long as the mapping is not evicted from RAM, the flag does not affect the operation at all.

Issues mmaping the same file twice

I'm using a Raspberry Pi B+, and I'm trying to mmap two different sections of /dev/mem - the first to be able to set two pins' functions from location 0x2020 0004 (0x04 bytes long), the other to manipulate the BSC Slave functions on the BCM2835 chip on the Pi from location 0x2021 4000 (0x1C bytes long).
static uint32_t * initMapMem(int fd, uint32_t addr, uint32_t len)
{
return (uint32_t *) mmap((void*)0x0, len,
PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_SHARED|MAP_LOCKED,
fd, addr);
}
/*
 * Opens /dev/mem and maps two windows of it into pinReg and bscReg via
 * initMapMem(); the descriptor is closed once both mappings were attempted.
 * Returns 0 when both mappings succeeded, 1 if /dev/mem cannot be opened or
 * either mapping failed (a message is printed to stderr in both cases).
 */
int initialise(void) {
    const int mem_fd = open("/dev/mem", O_RDWR | O_SYNC);

    if (mem_fd < 0) {
        fprintf(stderr, "This program needs root privileges. Try using sudo.\n");
        return 1;
    }

    pinReg = initMapMem(mem_fd, 0x20200004, 0x4);
    bscReg = initMapMem(mem_fd, 0x20214000, 0x1C);
    close(mem_fd);

    /* Either failure produces the same single message and return code. */
    if (bscReg == MAP_FAILED || pinReg == MAP_FAILED) {
        fprintf(stderr, "Bad, mmap failed.\n");
        return 1;
    }
    return 0;
}
initialise() is called out of main(). Stepping through the program with gdb I find that bscReg gets positioned right, but pinReg returns as MAP_FAILED (aka 0xFFFFFFFF) with errno set to EINVAL. Doesn't matter which way it's done, either - pinReg always finds itself as MAP_FAILED when mmaped first or second.
How do I get pinReg to a valid value?
The first mmap() is failing because the offset you're trying to map (0x20200004) isn't page-aligned. Create a mapping at 0x20200000 with a size of at least 8, then write to it at an offset of 0x4.

Memory leak in MPI_File_read_at

i'm writing a C/MPI program that making many processes read from a data file.
When using the standard functions from stdio (fopen, fread, fseek) everything goes well. The problem is that I can't go beyond 4 GB offsets. So I used MPI-IO functions to read a big file, and now the memory is not being released properly.
In fact i read a buffer, i process it then i free the allocated memory. The memory usage per process is perfect but the global memory usage doesn't stop increasing. I don't have this problem by just replacing mpi_file_read at by fread.
there is my code :
/* Returns the processor time reported by clock(), converted to seconds. */
double CPUtime(){ return ((double) clock())/CLOCKS_PER_SEC; }

/*
 * Reads the input file in per-rank chunks with MPI-IO.
 *
 * Arguments: [Dictionary file] [Dictionary] [Input file] [Buffer size in MiB]
 * The file is split evenly between the ranks. Rank 0 reads the first line
 * (the header) and broadcasts it. The ranks then pass a token around a ring;
 * the holder of the token reads its next buffer with MPI_File_read_at and
 * forwards the token before processing (processing is elided here).
 */
int main(int argc, char* argv [])
{
    if(argc != 5) {
        printf("\t[Dictionary file] [Dictionary] [Input file] [Buffer size]\n");
        exit(0);
    }
    /* Scratch buffers and settings for the processing step, which is elided below. */
    char* sInput = malloc (sizeof(char)*maxLength);
    char* sOutput = malloc (sizeof(char)*maxLength);
    char* compl = malloc (sizeof(char)*maxLength);
    char* sDictionaryFileName = argv[1];
    char* sDictionaryName = argv[2];
    char* filename = argv[3];
    int Mbuffer = atoi(argv[4]);
    if(Mbuffer <= 0) {
        /* A zero buffer size would divide by zero when computing nbIter. */
        printf("\t[Dictionary file] [Dictionary] [Input file] [Buffer size]\n");
        exit(0);
    }
    int maxBuffer = Mbuffer*1024*1024;
    int over = 10000;
    int rank,numprocess;
    long int offset;
    char* buffer;
    char* opbuffer;
    char* header = NULL;
    int headerLen = 0;
    double tstart=CPUtime();
    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &numprocess );
    /* mpi version */
    /* open the file*/
    MPI_File fh;
    int err;
    err = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
    if (err != MPI_SUCCESS) {
        char errstr[MPI_MAX_ERROR_STRING];
        int errlen;
        MPI_Error_string(err, errstr, &errlen);
        printf("Error at opening file %s (%s)\n",filename,errstr);
        MPI_Finalize();
        exit(1);
    }
    // get offsets and buffer size
    MPI_Offset sfile;
    MPI_File_get_size(fh,&sfile);
    MPI_Status status;
    /* C version */
    /*FILE* fh;
    long int sfile;
    fh =fopen( filename,"rb");
    if (fh==NULL) {
        printf("Error at opening file %s\n",filename);
        exit(1);
    }
    // get offsets and buffer size
    fseek(fh, 0L, SEEK_END);
    sfile = ftell(fh);
    fseek(fh, 0L, SEEK_SET);*/
    /* number of iterations */
    long int data_size = (long int)(sfile/(numprocess));
    int nbIter = data_size/maxBuffer;
    if(nbIter<=1){
        nbIter = 1;
        maxBuffer = data_size;
    }
    /* offsets */
    offset = data_size*(rank);
    long int cursor = offset;

    /* Rank 0 reads the header line; it is then broadcast to every rank. */
    if(rank==0){
        FILE* hf = fopen( filename,"rb");
        if (hf==NULL) {
            printf("Error at opening file %s\n",filename);
            /* Abort the whole job so the other ranks do not wait forever. */
            MPI_Abort(MPI_COMM_WORLD, 1);
            exit(1);
        }
        header = malloc(sizeof(char)*1000);
        if (header==NULL || fgets(header,1000,hf)==NULL) {
            printf("Error reading header of file %s\n",filename);
            MPI_Abort(MPI_COMM_WORLD, 1);
            exit(1);
        }
        fclose(hf);
        headerLen = (int)strlen(header);
        //cursor+=headerLen;
    }
    MPI_Bcast(&headerLen, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if(rank!=0){
        header = malloc (sizeof(char)*headerLen+1);
        if (header==NULL) {
            MPI_Abort(MPI_COMM_WORLD, 1);
            exit(1);
        }
    }
    MPI_Bcast(header, headerLen, MPI_CHAR, 0, MPI_COMM_WORLD);
    /* The broadcast carries headerLen characters without the terminator. */
    header[headerLen] = '\0';

    /* Synchronization barrier */
    MPI_Barrier(MPI_COMM_WORLD);
    int count;
    opbuffer = malloc(sizeof(char)*maxBuffer);
    if (opbuffer==NULL && maxBuffer>0) {
        printf("Error allocating read buffer\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
        exit(1);
    }
    /* C version */
    //fseek(fh,cursor,SEEK_SET);
    for(count=0;count<nbIter;count++){
        if(count==0 && rank==numprocess-1){ //init ring
            //send the token to p0
            int token=1;
            /* The count is a number of MPI_INT elements, not a byte count. */
            MPI_Send(&token,1,MPI_INT,0,55,MPI_COMM_WORLD);
        }
        //recv
        int token;
        int sender;
        if(rank==0)
            sender = numprocess-1;
        else
            sender=rank-1;
        MPI_Status s;
        MPI_Recv(&token,1,MPI_INT,sender,55,MPI_COMM_WORLD,&s);
        printf("P%d got the token at %G\n",rank,CPUtime());
        fflush(stdout);
        //read
        double start=CPUtime();
        /*double readtime;
        double sread=CPUtime();//read time*/
        //read
        if(token==1){
            /* MPI version */
            int rerr=MPI_File_read_at(fh, cursor,opbuffer, sizeof(char)*maxBuffer, MPI_CHAR, &status);
            if(rerr!=MPI_SUCCESS){
                /*char errstr[MPI_MAX_ERROR_STRING];
                int errlen;
                MPI_Error_string(rerr, errstr, &errlen);
                printf("Error reading file %s (%s)\n",filename,errstr);*/
                /* Release the file handle before shutting MPI down. */
                MPI_File_close(&fh);
                MPI_Finalize();
                exit(0);
            }
            /* C version of read */
            /*int k=fread(opbuffer,sizeof(char),maxBuffer,fh);
            if(k==0)
                perror("fread");*/
            cursor+=maxBuffer;
            buffer=opbuffer;
        }
        else{
            printf("Error token!\n");
            token=1;
        }
        //printf("P%d readtime=%G\n",rank,CPUtime()-sread);
        //Isend
        int next = (rank+1)%numprocess;
        MPI_Send(&token,1,MPI_INT,next,55,MPI_COMM_WORLD);
        /* start processing*/
        /* end processing */
    }
    free(opbuffer);
    free(header);
    free(sInput);
    free(sOutput);
    free(compl);
    int er=MPI_File_close(&fh);
    if(er!=MPI_SUCCESS){
        printf("Error closing file\n");
        MPI_Finalize();
        exit(1);
    }
    MPI_Finalize();
    printf("Global time : %G\n",CPUtime()-tstart);
    return 0;
}
If any one have any idea of what is it i would apprciate that.
Thank you.
It's probably that you're never calling MPI_File_close. That will cause intermediate operations on the file to leak. Note that you should also close it under the error condition if(err!=MPI_SUCCESS) if you really want to write clean code.

Resources