Linux :Identifying pages in memory - c

I want to know what part of a huge file are cached in memory. I'm using some code from fincore for that, which works this way: the file is mmaped, then fincore loops over the address space and check pages with mincore, but it's very long (several minutes) because of the file size (several TB).
Is there a way to loop on used RAM pages instead? It would be much faster, but that means I should get the list of used pages from somewhere... However I can't find a convenient system call that would allow that.
Here comes the code:
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
/* } */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
void
fincore(char *filename) {
int fd;
struct stat st;
struct sysinfo info;
if (sysinfo(& info)) {
perror("sysinfo");
return;
}
void *pa = (char *)0;
char *vec = (char *)0;
size_t pageSize = getpagesize();
register size_t pageIndex;
fd = open(filename, 0);
if (0 > fd) {
perror("open");
return;
}
if (0 != fstat(fd, &st)) {
perror("fstat");
close(fd);
return;
}
pa = mmap((void *)0, st.st_size, PROT_NONE, MAP_SHARED, fd, 0);
if (MAP_FAILED == pa) {
perror("mmap");
close(fd);
return;
}
/* vec = calloc(1, 1+st.st_size/pageSize); */
/* 2.2 sec for 8 TB */
vec = calloc(1, (st.st_size+pageSize-1)/pageSize);
if ((void *)0 == vec) {
perror("calloc");
close(fd);
return;
}
/* 48 sec for 8 TB */
if (0 != mincore(pa, st.st_size, vec)) {
fprintf(stderr, "mincore(%p, %lu, %p): %s\n",
pa, (unsigned long)st.st_size, vec, strerror(errno));
free(vec);
close(fd);
return;
}
/* handle the results */
/* 2m45s for 8 TB */
for (pageIndex = 0; pageIndex <= st.st_size/pageSize; pageIndex++) {
if (vec[pageIndex]&1) {
printf("%zd\n", pageIndex);
}
}
free(vec);
vec = (char *)0;
munmap(pa, st.st_size);
close(fd);
return;
}
int main(int argc, char *argv[]) {
fincore(argv[1]);
return 0;
}

The amount of information needed to represent a list is, for the pessimistic case when all or almost all pages are indeed in RAM, much higher than the bitmap - at least 64 vs 1 bits per entry. If there was such an API, when querying it about your 2 billion pages, you would have to be prepared to get 16 GB of data in the reply. Additionally, handling variable-length structures such as lists is more complex than handling a fixed-length array, so library functions, especially low-level system ones, tend to avoid the hassle.
I am also not quite sure about the implementation (how the OS interacts with the TLB and Co in this case), but it may well be that (even size difference aside) filling out the bitmap can be performed faster than creating a list due to the OS- and hardware-level structures the information is extracted from.
If you are not concerned about very fine granularity, you could have a look at /proc/<PID>/smaps. For each mapped region it shows some stats, including how much is loaded into memory (Rss field). If for the purpose of debugging you map some regions of a file with a separate mmap() call (in addition to the main mapping used for performing the actual task), you will probably get separate entries in smaps and thus see separate statistics for these regions. You almost certainly can't make billions of mappings without killing your system, but if the file is structured well, maybe having separate statistics for just a few dozen well-chosen regions can help you find the answers you are looking for.

Cached by whom?
Consider after a boot the file sits on disk. No part of it is in memory.
Now the file is opened and random reads are performed.
The file system (e.g. the kernel) will be caching.
The C standard library will be caching.
The kernel will be caching in kernel-mode memory, the C standard library in user-mode memory.
If you could issue a query, it could also be that instantly after the query - before it returns to you - the cached data in question is removed from the cached.

Related

map a big file and scan through data

Trying to search a pattern in a big file using mmap. The file is huge (way more than the physical memory). My worry is that if I used the file size as the second parameter for mmap(), there won't be enough physical memory to satisfy the system call. So I used 0x1000 as the length in the hope that OS will automatically map the right part of file as my pointer moves. But the following code snippet gave segmentation fault.
Any ideas?
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
long fileSize(char *fname) {
struct stat stat_buf;
int rc = stat(fname, &stat_buf);
return rc == 0 ? stat_buf.st_size : -1;
}
int main(int argc, char *argv[]) {
long size = fileSize(argv[1]);
printf("size=%ld\n", size);
int fd = open(argv[1], O_RDONLY);
printf("fd=%d\n", fd);
char *p = mmap(0, 0x1000, PROT_READ, MAP_SHARED, fd, 0);
if (p == MAP_FAILED) {
perror ("mmap");
return 1;
}
long i;
int pktLen;
int *pInt;
for (i=0; i < size; i+=4) {
pInt = (int*)(p+i);
if (pInt[i] == 0x12345678) {
printf("found it at %ld\n", i); break;
}
}
if (i == size) {
printf("didn't find it\n");
}
close(fd);
return 0;
}
Update
Turned out I had a silly bug
The line
if (pInt[i] == 0x12345678) should have been if (pInt[0] == 0x12345678)
Use
struct stat info;
long page;
const char *map;
size_t size, mapping;
int fd, result;
page = sysconf(_SC_PAGESIZE);
if (page < 1L) {
fprintf(stderr, "Invalid page size.\n");
exit(EXIT_FAILURE);
}
fd = open(filename, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "%s: Cannot open file: %s.\n", filename, strerror(errno));
exit(EXIT_FAILURE);
}
result = fstat(fd, &info);
if (result == -1) {
fprintf(stderr, "%s: Cannot get file information: %s.\n", filename, strerror(errno));
close(fd);
exit(EXIT_FAILURE);
}
if (info.st_size <= 0) {
fprintf(stderr, "%s: No data.\n", filename);
close(fd);
exit(EXIT_FAILURE);
}
size = info.st_size;
if ((off_t)size != info.st_size) {
fprintf(stderr, "%s: File is too large to map.\n", filename);
close(fd);
exit(EXIT_FAILURE);
}
/* mapping is size rounded up to a multiple of page. */
if (size % (size_t)page)
mapping = size + page - (size % (size_t)page);
else
mapping = size;
map = mmap(NULL, mapping, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "%s: Cannot map file: %s.\n", filename, strerror(errno));
close(fd);
exit(EXIT_FAILURE);
}
if (close(fd)) {
fprintf(stderr, "%s: Unexpected error closing file descriptor.\n", filename);
exit(EXIT_FAILURE);
}
/*
* Use map[0] to map[size-1], but remember that it is not a string,
* and that there is no trailing '\0' at map[size].
*
* Accessing map[size] to map[mapping-1] is not allowed, and may
* generate a SIGBUS signal (and kill the process).
*/
/* The mapping is automatically torn down when the process exits,
* but you can also unmap it with */
munmap(map, mapping);
The important points in the code above:
You'll need to start your code with e.g.
#define _POSIX_C_SOURCE 200809L
#define _BSD_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
The _BSD_SOURCE is required for MAP_NORESERVE to be defined, even though it is a GNU/Linux-specific feature.
mapping (length in man 2 mmap) must be a multiple of page size (sysconf(_SC_PAGESIZE)).
MAP_NORESERVE flag tells the kernel that the mapping is backed by the file only, and as such, is allowed to be larger than available RAM + SWAP.
You can (but do not need to) close the file descriptor referring to the mapped file with no issues, because the mapping itself contains a reference in-kernel.
Years ago, on a different forum, I showed a simple program to manipulate a terabyte of data (1 TiB = 1,099,511,627,776 bytes) using this very approach (although it uses a sparse backing file; i.e. mostly implicit zeroes, with less than 250 MB of actual data written to the backing file -- mostly to reduce the amount of disk space needed). Of course, it requires a 64-bit machine running Linux, as the virtual memory on 32-bit machines is limited to 232 = 4 GiB (Linux does not support segmented memory models).
The Linux kernel is surprisingly efficient in choosing which pages to keep in RAM, and which pages to evict. Of course, you can make that even more efficient, by telling the kernel which parts of the mapping you are unlikely to access (and therefore can be evicted), by using posix_madvise(address, length, advice) with advice being POSIX_MADV_DONTNEED or POSIX_MADV_WILLNEED. This has the benefit that unlike unmapping the "dontneed" parts, you can, if you need to, re-access that part of the mapping. (If the pages are already evicted, the access to the mapping will just block until the pages are re-loaded to memory. In other words, you can use posix_madvise() to "optimize" eviction logic, without limiting what part of the mapping can be accessed.)
In your case, if you do a linear or semi-linear search over the data using e.g. memmem(), you can use posix_madvise(map, mapping, POSIX_MADV_SEQUENTIAL).
Personally, I'd run the search first without using any posix_madvise() calls, and then see if it makes a significant enough positive difference, using the same data set (and several runs, of course). (You can safely -- with no risk of losing any data -- clear the page cache between test runs using sudo sh -c 'sync ; echo 3 > /proc/sys/vm/drop_caches ; sync', if you wish to exclude the effects of having the large file (mostly) already cached, between timing runs.)
The SIGSEGV is because you're accessing beyond 0x1000 bytes (in the for loop). You have to mmap() the complete size bytes of the fd.
The concept of demand paging in virtual memory subsystem helps exact same scenarios like yours - applications/application data bigger than the physical memory size. After the mmap(), as and when you access the (virtual) address, if there is no physical page mapped to it (page fault), kernel will find out a physical page that can be used (page replacement).
fd = open(argv[1], O_RDONLY);
ptr = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
/* Consume the entire file's data as needed */
munmap(ptr, file_size);
Alternately you can put a loop around the mmap()/munmap() to scan the file in PAGE_SIZE or in multiples of PAGE_SIZE. The last arg of mmap() - offset will come handy for that.
From man-page :
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
int munmap(void *addr, size_t length);
Pseudo-code :
fd = open(argv[1], O_RDONLY);
last_block_size = file_size % PAGE_SIZE;
num_pages = file_size / PAGE_SIZE + (last_block_size ? 1 : 0)
for (int i = 0; i < num_pages; i++) {
block_size = last_block_size && (i == num_pages - 1) ? last_block_size : PAGE_SIZE;
ptr = mmap(NULL, block_size, PROT_READ, MAP_PRIVATE, fd, i * PAGE_SIZE);
/* Consume the file's data range (ptr, ptr+block_size-1) as needed */
munmap(ptr, block_size);
}
Please use MAP_PRIVATE as the mapping might be just needed for your process alone. It just avoids few extra steps by the kernel for the MAP_SHARED.
Edit : It should have been MAP_PRIVATE in place of MAP_ANON. Changed.

How do I write directly to a USB drive with over 4GB data?

I have a research project where I need to be able to fill a USB stick with a known pattern of data directly, no files or file system. My goal is to fill a drive from top to bottom with my pattern; I have written a program for writing to /dev/sd* but it is slow and does not work if the drive is over 4GB in size. The writing will stop at offset oxFFFFFFF or 2^32.
My code
#include <stdio.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <ctype.h>
int main(int argc, char **argv)
{
int fd;
uint64_t numbytes=0;
char response[4];
long int nblocks = 0; // number of blocks up to 4GB
int error;
size_t fill_length = 8;
char buf[512];
char fill[] = "ABCDEFGH";
printf("\n\n");
fd = open(argv[1], O_RDWR);
if(error = ioctl(fd, BLKGETSIZE64, &numbytes) != 0){
printf("Failed to read block device, ioctl returned: %i\n", error);
exit(0);
}
if (numbytes > 8589934592){ // Exit if more than 8 GB drive
printf("Drive is too large.l\n");
exit(0);
}
printf("Number of bytes: %lu, i.e. %.3f GiB\n", numbytes,
(double)numbytes / (1024 * 1024 * 1024));
nblocks = numbytes / 512;
printf("Number of blocks on device: %lu\n", nblocks);
strcpy(buf, fill); // fills with pattern, one time
for(int i =0; i < (512 - fill_length); i += fill_length){ // Fills up the rest of the buffer
strcat(buf, fill); // with the pattern to be repeated.
} // 512 is the default & smallest block size
printf("buf is:\n%s\n", buf);
printf("\n*** The device at %s will be completely overwritten ***\n", argv[1]);
printf("\nAre you sure you want to proceed? (Type:<Ctrl>-C to exit)\n");
// printf("\n nblocks: %lu", nblocks);
fgets(response, 3, stdin);
printf("writting to: %s\n", argv[1]);
for (int i = 0; i <= nblocks; i++)
{
write(fd, buf, 512);
}
printf("Closing...\n");
close(fd);
printf("Closed.\n");
return 0;
}
I realize my program isn't great and is dangerous as I could wipe out a HDD, but all I am looking for at this point is tips to make this work on drives over 4GB and hopefully make the process faster. It will have limited use by myself and possibly another.
A push in the right direction would be appreciated.
Use size_t for bytesize in memory, and off_t for file offsets on disk. For general integers, use intptr_t in your program. So your for loops should start with for (intptr_t i=0;
And don't write in small blocks of 512 bytes, but in something bigger (but a power of two), e.g. 16384 bytes.
If your program still does not work, use strace to find out the failing syscalls. And use perror in your program on failure of any syscall.
your code is 'effectively' writing to a hard disk that is larger than a 32bit int.
Therefore, I suggest paying attention to the built-in USB interface on the USB stick, where you can specify which logical block to write and so forth.
Note; USB sticks with built-in load levelling will NOT write sequentially on the media, irregardless of what you pass to it in the way of commands. However, most USB sticks can be told to format and so forth so you might be able to use that feature. Or you could use some utility for setting disk sectors to all (for instance) 0's Just be carefully to not overwrite the sector/block formatting information.
You can make off_t into a 64 bit value by placing this before any of the includes.
#define _FILE_OFFSET_BITS 64 // so off_t is 64 bit, see man fseeko
#include <stdio.h>
#include <unistd.h>
etc. ...
This also apparently also causes the read(), write(), fread(), fwrite(), etc. calls manage a 64-bit file offset internally. I have used it for a similar purpose when porting to Linux.
Ditto the advice not to use such a small buffer. Make it 64K (65,536 bytes) to greatly improve performance.

How to map two virtual adresses on the same physical memory on linux?

I'm facing a quite tricky problem. I'm trying to get 2 virtual memory areas pointing to the same physical memory. The point is to have different page protection parameters on different memory areas.
On this forum, the user seems to have a solution, but it seems kinda hacky and it's pretty clear that something better can be done performance-wise :
http://www.linuxforums.org/forum/programming-scripting/19491-map-two-virtual-memory-addres-same-physical-page.html
As I'm facing the same problem, I want to give a shot here to know if somebody has a better idea. Don't be afraid to mention the dirty details behind the hood, this is what this question is about.
Thank by advance.
Since Linux kernel 3.17 (released in October 2014) you can use memfd_create system call to create a file descriptor backed by anonymous memory. Then mmap the same region several times, as mentioned in the above answers.
Note that glibc wrapper for the memfd_create system call was added in glibc 2.27 (released in February 2018). The glibc manual also describes how the descriptor returned can be used to create multiple mappings to the same underlying memory.
I'm trying to get 2 virtual memory area pointing on the same physical memory.
mmap the same region in the same file, twice, or use System V shared memory (which does not require mapping a file in memory).
I suppose if you dislike Sys V shared memrory you could use POSIX shared memory objects. They're not very popular but available on Linux and BSDs at least.
Once you get an fd with shm_open you could immediately call shm_unlink. Then no other process can attach to the same shared memory, and you can mmap it multiple times. Still a small race period available though.
As suggested by #PerJohansson, I wrote & tested following code, it works well on linux, using mmap with MAP_SHARED|MAP_FIXED flag, we can map the same physical page allocated by POSIX shm object multiple times and continuously into very large virtual memory.
#include "stdio.h"
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h> /* For mode constants */
#include <fcntl.h> /* For O_* constants */
void * alloc_1page_mem(int size) {
int fd;
char * ptr_base;
char * rptr;
/* Create shared memory object and set its size */
fd = shm_open("/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1) {
perror("error in shm_open");
return NULL;
}
if (ftruncate(fd, 4096) == -1) {
perror("error in ftruncate");
return NULL;
}
// following trick reserves big enough holes in VM space
ptr_base = rptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
munmap(rptr, size);
for(int i=0; i<size; i+=4096) {
rptr = mmap(rptr, 4096, PROT_READ | PROT_WRITE, MAP_SHARED|MAP_FIXED, fd, 0);
if (rptr == MAP_FAILED) {
perror("error in mmap");
return NULL;
}
rptr += 4096;
}
close(fd);
shm_unlink("/myregion");
return ptr_base;
}
void check(int * p, int total_cnt){
for (int i=0;i<4096/sizeof(int);i++) {
p[i] = i;
}
int fail_cnt = 0;
for (int k=0; k<total_cnt; k+= 4096/sizeof(int)) {
for (int i=0;i<4096/sizeof(int);i++) {
if (p[k+i] != i)
fail_cnt ++;
}
}
printf("fail_cnt=%d\n", fail_cnt);
}
int main(int argc, const char * argv[]) {
const char * cmd = argv[1];
int sum;
int total_cnt = 32*1024*1024;
int * p = NULL;
if (*cmd++ == '1')
p = alloc_1page_mem(total_cnt*sizeof(int));
else
p = malloc(total_cnt*sizeof(int));
sum = 0;
while(*cmd) {
switch(*cmd++) {
case 'c':
check(p, total_cnt);
break;
case 'w':
// save only 4bytes per cache line
for (int k=0;k<total_cnt;k+=64/sizeof(int)){
p[k] = sum;
}
break;
case 'r':
// read only 4bytes per cache line
for (int k=0;k<total_cnt;k+=64/sizeof(int)) {
sum += p[k];
}
break;
case 'p':
// prevent sum from being optimized
printf("sum=%d\n", sum);
}
}
return 0;
}
You can observe very low cache miss rate on memory allocated in such method:
$ sudo perf stat -e mem_load_retired.l3_miss -- ./a.out 0wrrrrr
# this produces L3 miss linearly increase with number of 'r' charaters
$ sudo perf stat -e mem_load_retired.l3_miss -- ./a.out 1wrrrrr
# this produces almost constant L3 miss.
If you are root, you can mmap("/dev/mem", ...) but there are caveats in the newer kernels, see accessing mmaped /dev/mem?

mmap, msync and linux process termination

I want to use mmap to implement persistence of certain portions of program state in a C program running under Linux by associating a fixed-size struct with a well known file name using mmap() with the MAP_SHARED flag set. For performance reasons, I would prefer not to call msync() at all, and no other programs will be accessing this file. When my program terminates and is restarted, it will map the same file again and do some processing on it to recover the state that it was in before the termination. My question is this: if I never call msync() on the file descriptor, will the kernel guarantee that all updates to the memory will get written to disk and be subsequently recoverable even if my process is terminated with SIGKILL? Also, will there be general system overhead from the kernel periodically writing the pages to disk even if my program never calls msync()?
EDIT: I've settled the problem of whether the data is written, but I'm still not sure about whether this will cause some unexpected system loading over trying to handle this problem with open()/write()/fsync() and taking the risk that some data might be lost if the process gets hit by KILL/SEGV/ABRT/etc. Added a 'linux-kernel' tag in hopes that some knowledgeable person might chime in.
I found a comment from Linus Torvalds that answers this question
http://www.realworldtech.com/forum/?threadid=113923&curpostid=114068
The mapped pages are part of the filesystem cache, which means that even if the user process that made a change to that page dies, the page is still managed by the kernel and as all concurrent accesses to that file will go through the kernel, other processes will get served from that cache.
In some old Linux kernels it was different, that's the reason why some kernel documents still tell to force msync.
EDIT: Thanks RobH corrected the link.
EDIT:
A new flag, MAP_SYNC, is introduced since Linux 4.15, which can guarantee the coherence.
Shared file mappings with this flag provide the guarantee that
while some memory is writably mapped in the address space of
the process, it will be visible in the same file at the same
offset even after the system crashes or is rebooted.
references:
http://man7.org/linux/man-pages/man2/mmap.2.html search MAP_SYNC in the page
https://lwn.net/Articles/731706/
I decided to be less lazy and answer the question of whether the data is written to disk definitively by writing some code. The answer is that it will be written.
Here is a program that kills itself abruptly after writing some data to an mmap'd file:
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
typedef struct {
char data[100];
uint16_t count;
} state_data;
const char *test_data = "test";
int main(int argc, const char *argv[]) {
int fd = open("test.mm", O_RDWR|O_CREAT|O_TRUNC, (mode_t)0700);
if (fd < 0) {
perror("Unable to open file 'test.mm'");
exit(1);
}
size_t data_length = sizeof(state_data);
if (ftruncate(fd, data_length) < 0) {
perror("Unable to truncate file 'test.mm'");
exit(1);
}
state_data *data = (state_data *)mmap(NULL, data_length, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, fd, 0);
if (MAP_FAILED == data) {
perror("Unable to mmap file 'test.mm'");
close(fd);
exit(1);
}
memset(data, 0, data_length);
for (data->count = 0; data->count < 5; ++data->count) {
data->data[data->count] = test_data[data->count];
}
kill(getpid(), 9);
}
Here is a program that validates the resulting file after the previous program is dead:
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
typedef struct {
char data[100];
uint16_t count;
} state_data;
const char *test_data = "test";
int main(int argc, const char *argv[]) {
int fd = open("test.mm", O_RDONLY);
if (fd < 0) {
perror("Unable to open file 'test.mm'");
exit(1);
}
size_t data_length = sizeof(state_data);
state_data *data = (state_data *)mmap(NULL, data_length, PROT_READ, MAP_SHARED|MAP_POPULATE, fd, 0);
if (MAP_FAILED == data) {
perror("Unable to mmap file 'test.mm'");
close(fd);
exit(1);
}
assert(5 == data->count);
unsigned index;
for (index = 0; index < 4; ++index) {
assert(test_data[index] == data->data[index]);
}
printf("Validated\n");
}
I found something adding to my confusion:
munmap does not affect the object that was mappedthat is, the call to munmap
does not cause the contents of the mapped region to be written
to the disk file. The updating of the disk file for a MAP_SHARED
region happens automatically by the kernel's virtual memory algorithm
as we store into the memory-mapped region.
this is excerpted from Advanced Programming in the UNIX® Environment.
from the linux manpage:
MAP_SHARED Share this mapping with all other processes that map this
object. Storing to the region is equiva-lent to writing to the
file. The file may not actually be updated until msync(2) or
munmap(2) are called.
the two seem contradictory. is APUE wrong?
I didnot find a very precise answer to your question so decided add one more:
Firstly about losing data, using write or mmap/memcpy mechanisms both writes to page cache and are synced to underlying storage in background by OS based on its page replacement settings/algo. For example linux has vm.dirty_writeback_centisecs which determines which pages are considered "old" to be flushed to disk. Now even if your process dies after the write call has succeeded, the data would not be lost as the data is already present in kernel pages which will eventually be written to storage. The only case you would lose data is if OS itself crashes (kernel panic, power off etc). The way to absolutely make sure your data has reached storage would be call fsync or msync (for mmapped regions) as the case might be.
About the system load concern, yes calling msync/fsync for each request is going to slow your throughput drastically, so do that only if you have to. Remember you are really protecting against losing data on OS crashes which I would assume is rare and probably something most could live with. One general optimization done is to issue sync at regular intervals say 1 sec to get a good balance.
Either the Linux manpage information is incorrect or Linux is horribly non-conformant. msync is not supposed to have anything to do with whether the changes are committed to the logical state of the file, or whether other processes using mmap or read to access the file see the changes; it's purely an analogue of fsync and should be treated as a no-op except for the purposes of ensuring data integrity in the event of power failure or other hardware-level failure.
According to the manpage,
The file may not actually be
updated until msync(2) or munmap() is called.
So you will need to make sure you call munmap() prior to exiting at the very least.

Why does mmap() fail with ENOMEM on a 1TB sparse file?

I've been working with large sparse files on openSUSE 11.2 x86_64. When I try to mmap() a 1TB sparse file, it fails with ENOMEM. I would have thought that the 64 bit address space would be adequate to map in a terabyte, but it seems not. Experimenting further, a 1GB file works fine, but a 2GB file (and anything bigger) fails. I'm guessing there might be a setting somewhere to tweak, but an extensive search turns up nothing.
Here's some sample code that shows the problem - any clues?
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
int main(int argc, char *argv[]) {
char * filename = argv[1];
int fd;
off_t size = 1UL << 40; // 30 == 1GB, 40 == 1TB
fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666);
ftruncate(fd, size);
printf("Created %ld byte sparse file\n", size);
char * buffer = (char *)mmap(NULL, (size_t)size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( buffer == MAP_FAILED ) {
perror("mmap");
exit(1);
}
printf("Done mmap - returned 0x0%lx\n", (unsigned long)buffer);
strcpy( buffer, "cafebabe" );
printf("Wrote to start\n");
strcpy( buffer + (size - 9), "deadbeef" );
printf("Wrote to end\n");
if ( munmap(buffer, (size_t)size) < 0 ) {
perror("munmap");
exit(1);
}
close(fd);
return 0;
}
The problem was that the per-process virtual memory limit was set to only 1.7GB. ulimit -v 1610612736 set it to 1.5TB and my mmap() call succeeded. Thanks, bmargulies, for the hint to try ulimit -a!
Is there some sort of per-user quota, limiting the amount of memory available to a user process?
My guess is the the kernel is having difficulty allocating the memory that it needs to keep up with this memory mapping. I don't know how swapped out pages are kept up with in the Linux kernel (and I assume that most of the file would be in the swapped out state most of the time), but it may end up needing an entry for each page of memory that the file takes up in a table. Since this file might be mmapped by more than one process the kernel has to keep up with the mapping from the process's point of view, which would map to another point of view, which would map to secondary storage (and include fields for device and location).
This would fit into your addressable space, but might not fit (at least contiguously) within physical memory.
If anyone knows more about how Linux does this I'd be interested to hear about it.

Resources