I am trying to write a program that reads a file using 'mmap' for school. I am having some difficulty creating the map. Specifically, I am getting a segmentation fault. I am not really sure what I am doing wrong here so some concrete help would be appreciated. Thank you.
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
/*
 * Question code (reported by its author to end in a segmentation fault):
 * open data.txt, determine its length with fseek()/ftell(), map it with
 * mmap(), and echo every mapped byte to stdout with putchar().
 *
 * The code is kept exactly as posted; the NOTE(review) comments only point
 * at the lines that matter for the crash.
 */
int main(int argc, char* argv[])
{
printf("Hello world!\n");
FILE* fp;// File pointer
int fd;// File descriptor
size_t size;// Length of the file
char* map;// File mmap
/* Open the file */
fp = fopen("data.txt", "r+");
/* Get the file descriptor */
/* NOTE(review): fp is passed on without first being checked for NULL */
fd = fileno(fp);
printf("FD: %d\n", fd);
/* Get the size of the file */
fseek(fp, 0, SEEK_END);
size = ftell(fp);
fseek(fp, 0, SEEK_SET);
/* NOTE(review): size is a size_t but is printed with %d */
printf("SIZE: %d\n", size);
/* Map the file with mmap */
/* NOTE(review): the flags argument (4th) is 0, i.e. neither MAP_SHARED
 * nor MAP_PRIVATE is passed */
map = mmap(NULL, size, PROT_READ, 0, fd, 0);
if (map == MAP_FAILED)
{
printf("MMAP FAILED\n");
} else {
printf("MMAP SUCEEDED\n");
}
/* Do something with the map */
/* NOTE(review): this loop is reached even when the MAP_FAILED branch above
 * was taken, in which case map[i] indexes from MAP_FAILED */
int i;
for (i = 0; i < size; i++)
{
char c;
c = map[i];
putchar(c);
}
fclose(fp);
return(0);
}
You are not specifying anything as the flag argument, you must either specify MAP_PRIVATE or MAP_SHARED as specified here:
The flags argument determines whether updates to the mapping are
visible to other processes mapping the same region, and whether
updates are carried through to the underlying file. This behavior is determined by including exactly one of the following values in flags:
MAP_SHARED Share this mapping. Updates to the mapping are visible to
other processes that map this file, and are carried through to
the underlying file. (To precisely control when updates are
carried through to the underlying file requires the use of
msync(2).)
MAP_PRIVATE
Create a private copy-on-write mapping. Updates to the
mapping are not visible to other processes mapping the same
file, and are not carried through to the underlying file. It
is unspecified whether changes made to the file after the
mmap() call are visible in the mapped region.
In your case, since you are just reading the file, MAP_PRIVATE should be enough.
Try with:
map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
Related
Trying to search a pattern in a big file using mmap. The file is huge (way more than the physical memory). My worry is that if I used the file size as the second parameter for mmap(), there won't be enough physical memory to satisfy the system call. So I used 0x1000 as the length in the hope that OS will automatically map the right part of file as my pointer moves. But the following code snippet gave segmentation fault.
Any ideas?
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
/*
 * Look up the size of the file at path `fname` with stat().
 *
 * Returns the st_size reported by stat(), or -1 when stat() fails.
 */
long fileSize(char *fname) {
    struct stat info;

    if (stat(fname, &info) != 0) {
        return -1;
    }
    return info.st_size;
}
/*
 * Question code (reported by its author to segfault): search the file named
 * by argv[1] for the 32-bit value 0x12345678, advancing 4 bytes per step,
 * and print the offset of the first match.
 *
 * The code is kept exactly as posted; the NOTE(review) comments mark the
 * lines the discussion below refers to.
 */
int main(int argc, char *argv[]) {
long size = fileSize(argv[1]);
printf("size=%ld\n", size);
int fd = open(argv[1], O_RDONLY);
printf("fd=%d\n", fd);
/* NOTE(review): only 0x1000 bytes are mapped here, while the loop below
 * walks offsets up to size */
char *p = mmap(0, 0x1000, PROT_READ, MAP_SHARED, fd, 0);
if (p == MAP_FAILED) {
perror ("mmap");
return 1;
}
long i;
int pktLen; /* never referenced in this function */
int *pInt;
for (i=0; i < size; i+=4) {
pInt = (int*)(p+i);
/* NOTE(review): pInt already points at byte offset i, so pInt[i] reads a
 * further i * sizeof(int) bytes beyond that position */
if (pInt[i] == 0x12345678) {
printf("found it at %ld\n", i); break;
}
}
/* NOTE(review): i advances in steps of 4, so it only equals size here when
 * size is a multiple of 4 */
if (i == size) {
printf("didn't find it\n");
}
close(fd);
return 0;
}
Update
Turned out I had a silly bug
The line
if (pInt[i] == 0x12345678) should have been if (pInt[0] == 0x12345678)
Use
struct stat info;
long page;
const char *map;
size_t size, mapping;
int fd, result;
page = sysconf(_SC_PAGESIZE);
if (page < 1L) {
fprintf(stderr, "Invalid page size.\n");
exit(EXIT_FAILURE);
}
fd = open(filename, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "%s: Cannot open file: %s.\n", filename, strerror(errno));
exit(EXIT_FAILURE);
}
result = fstat(fd, &info);
if (result == -1) {
fprintf(stderr, "%s: Cannot get file information: %s.\n", filename, strerror(errno));
close(fd);
exit(EXIT_FAILURE);
}
if (info.st_size <= 0) {
fprintf(stderr, "%s: No data.\n", filename);
close(fd);
exit(EXIT_FAILURE);
}
size = info.st_size;
if ((off_t)size != info.st_size) {
fprintf(stderr, "%s: File is too large to map.\n", filename);
close(fd);
exit(EXIT_FAILURE);
}
/* mapping is size rounded up to a multiple of page. */
if (size % (size_t)page)
mapping = size + page - (size % (size_t)page);
else
mapping = size;
map = mmap(NULL, mapping, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "%s: Cannot map file: %s.\n", filename, strerror(errno));
close(fd);
exit(EXIT_FAILURE);
}
if (close(fd)) {
fprintf(stderr, "%s: Unexpected error closing file descriptor.\n", filename);
exit(EXIT_FAILURE);
}
/*
* Use map[0] to map[size-1], but remember that it is not a string,
* and that there is no trailing '\0' at map[size].
*
* Accessing map[size] to map[mapping-1] is not allowed, and may
* generate a SIGBUS signal (and kill the process).
*/
/* The mapping is automatically torn down when the process exits,
* but you can also unmap it with */
munmap(map, mapping);
The important points in the code above:
You'll need to start your code with e.g.
#define _POSIX_C_SOURCE 200809L
#define _BSD_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
The _BSD_SOURCE is required for MAP_NORESERVE to be defined, even though it is a GNU/Linux-specific feature.
mapping (length in man 2 mmap) must be a multiple of page size (sysconf(_SC_PAGESIZE)).
MAP_NORESERVE flag tells the kernel that the mapping is backed by the file only, and as such, is allowed to be larger than available RAM + SWAP.
You can (but do not need to) close the file descriptor referring to the mapped file with no issues, because the mapping itself contains a reference in-kernel.
Years ago, on a different forum, I showed a simple program to manipulate a terabyte of data (1 TiB = 1,099,511,627,776 bytes) using this very approach (although it uses a sparse backing file; i.e. mostly implicit zeroes, with less than 250 MB of actual data written to the backing file -- mostly to reduce the amount of disk space needed). Of course, it requires a 64-bit machine running Linux, as the virtual memory on 32-bit machines is limited to 2^32 bytes = 4 GiB (Linux does not support segmented memory models).
The Linux kernel is surprisingly efficient in choosing which pages to keep in RAM, and which pages to evict. Of course, you can make that even more efficient, by telling the kernel which parts of the mapping you are unlikely to access (and therefore can be evicted), by using posix_madvise(address, length, advice) with advice being POSIX_MADV_DONTNEED or POSIX_MADV_WILLNEED. This has the benefit that unlike unmapping the "dontneed" parts, you can, if you need to, re-access that part of the mapping. (If the pages are already evicted, the access to the mapping will just block until the pages are re-loaded to memory. In other words, you can use posix_madvise() to "optimize" eviction logic, without limiting what part of the mapping can be accessed.)
In your case, if you do a linear or semi-linear search over the data using e.g. memmem(), you can use posix_madvise(map, mapping, POSIX_MADV_SEQUENTIAL).
Personally, I'd run the search first without using any posix_madvise() calls, and then see if it makes a significant enough positive difference, using the same data set (and several runs, of course). (You can safely -- with no risk of losing any data -- clear the page cache between test runs using sudo sh -c 'sync ; echo 3 > /proc/sys/vm/drop_caches ; sync', if you wish to exclude the effects of having the large file (mostly) already cached, between timing runs.)
The SIGSEGV occurs because the for loop accesses memory beyond the 0x1000 bytes that were mapped. You have to mmap() the complete file, i.e. all size bytes of the fd.
Demand paging in the virtual memory subsystem handles exactly this kind of scenario - an application, or application data, that is bigger than the physical memory. After the mmap(), whenever you access a (virtual) address that has no physical page mapped to it (a page fault), the kernel finds a physical page that can be used (page replacement) and backs the address with it.
fd = open(argv[1], O_RDONLY);
ptr = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
/* Consume the entire file's data as needed */
munmap(ptr, file_size);
Alternately you can put a loop around the mmap()/munmap() to scan the file in PAGE_SIZE or in multiples of PAGE_SIZE. The last arg of mmap() - offset will come handy for that.
From man-page :
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
int munmap(void *addr, size_t length);
Pseudo-code :
fd = open(argv[1], O_RDONLY);
last_block_size = file_size % PAGE_SIZE;
num_pages = file_size / PAGE_SIZE + (last_block_size ? 1 : 0)
for (int i = 0; i < num_pages; i++) {
block_size = last_block_size && (i == num_pages - 1) ? last_block_size : PAGE_SIZE;
ptr = mmap(NULL, block_size, PROT_READ, MAP_PRIVATE, fd, i * PAGE_SIZE);
/* Consume the file's data range (ptr, ptr+block_size-1) as needed */
munmap(ptr, block_size);
}
Please use MAP_PRIVATE, as the mapping is probably needed by your process alone. It also saves the kernel a few extra steps that MAP_SHARED would require.
Edit : It should have been MAP_PRIVATE in place of MAP_ANON. Changed.
/*
 * Question code (reported by its author to segfault): map the first page of
 * log.txt and print it on an ncurses screen.
 *
 * The code is kept exactly as posted; the NOTE(review) comments mark the
 * lines the answers below refer to.
 */
int fp, page; /* fp holds the int descriptor returned by open(), not a FILE* */
char *data;
if(argc > 1){
printf("Read the docs");
exit(1);
}
fp = open("log.txt", O_RDONLY); //Opening file to read
page = getpagesize();
/* NOTE(review): the length is one page rather than the file size, the flags
 * argument is 0, and the result is not compared against MAP_FAILED */
data = mmap(0, page, PROT_READ, 0,fp, 0);
initscr(); // Creating the ncurse screen
clear();
move(0, 0);
/* NOTE(review): %s keeps reading until it meets a 0 byte in data */
printw("%s", data);
endwin(); //Ends window
/* NOTE(review): fclose() is being handed the int descriptor from open() */
fclose(fp); //Closing file
return 0;
Here is my code. I keep getting a segmentation fault for some reason.
All my header files have been included, so that's not the problem (clearly, because it's something to do with memory). Thanks in advance.
Edit: Got it - the data wasn't being formatted as a string, and I also had to use stat() to get the file info rather than getpagesize().
You can't fclose() a file descriptor you got from open(). You must use close(fp) instead. What you do is passing a small int that gets treated as a pointer. This causes a segmentation fault.
Note that your choice of identifier naming is unfortunate. Usually fp would be a pointer-to-FILE (FILE*, as used by the standard IO library), while fd would be a file descriptor (a small integer), used by the kernel's IO system calls.
Your compiler should have told you that you pass an int where a pointer-to-FILE was expected, or that you use fclose() without a prototype in scope. Did you enable the maximum warning level of your compiler?
Another segfault is possible if the data pointer does not point to a NUL (0) terminated string. Does your log.txt contain NUL-terminated strings?
You should also check if mmap() fails returning MAP_FAILED.
Okay so here is the code that got it working
#include <sys/stat.h>
int status;
struct stat s;
status = stat(file, &s);
if(status < 0){
perror("Stat:");
exit(1);
data = mmap(NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
Before i was using 'getpagesize();' thanks beej !!!
mmap's man page gives you information on the parameters:
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
As you can see, your second argument may be wrong (except you really want to exactly map a part of the file fitting into a single page).
Also: Probably 0 is not a valid flag value? Let's have a look again at the man page:
The flags argument determines whether updates to the mapping are
visible to other processes mapping the same region, and whether
updates are carried through to the underlying file. This behavior is
determined by including exactly one of the following values in flags: MAP_SHARED or MAP_PRIVATE
So you could try something like
data = mmap(0, size, PROT_READ, MAP_SHARED, fp, 0);
Always use the provided flags, as the underlying value may differ from machine to machine.
Also, the mapped area should not be larger than the underlying file. Check the size of log.txt beforehand.
The second argument to mmap should not be page size, it should be the size of your file. Here is a nice example.
Let's say I have the standard "Hello, World! \n" saved to a text file called hello.txt. If I want to change the 'H' to a 'R' or something, can I achieve this with mmap()?
mmap does not exist in the standard C99 (or C11) specification. It is defined in POSIX.
So assuming you have a POSIX system (e.g. Linux), you could first open(2) the file for read & write:
int myfd = open("hello.txt", O_RDWR);
if (myfd<0) { perror("hello.txt open"); exit(EXIT_FAILURE); };
Then you get the size (and other meta-data) of the file with fstat(2):
struct stat mystat = {};
if (fstat(myfd,&mystat)) { perror("fstat"); exit(EXIT_FAILURE); };
Now the size of the file is in mystat.st_size.
off_t myfsz = mystat.st_size;
Now we can call mmap(2) and we need to share the mapping (to be able to write inside the file thru the virtual address space)
/* MAP_SHARED so that stores through ad reach the file; failure is signalled by MAP_FAILED. */
void *ad = mmap(NULL, myfsz, PROT_READ|PROT_WRITE, MAP_SHARED,
                myfd, 0);
if (ad == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); }
Then we can overwrite the first byte (and we check that indeed the first byte in that file is H since you promised so):
/* ad is a void *, so it is cast to char * before being dereferenced. */
assert (*(char *)ad == 'H');
*(char *)ad = 'R';
We might call msync(2) to ensure the file is updated right now on the disk. If we don't, it could be updated later.
Notably for very large mappings (notably those much larger than available RAM), we can assist the kernel (and its page cache) with hints given thru madvise(2) or posix_madvise(3)...
Notice that a mapping remains in effect even after a close(2). Use munmap & mprotect or mmap with MAP_FIXED on the same address range to change them.
On Linux, you could use proc(5) to query the address space. So your program could read (e.g. after fopen, using fgets in a loop) the pseudo /proc/self/maps file (or /proc/1234/maps for process of pid 1234).
BTW, mmap is used by dlopen(3); it can be called a lot of times, my manydl.c program demonstrates that on Linux you could have many hundreds of thousands of dlopen-ed shared files (so many hundreds of thousands of memory mappings).
Here's a working example.
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
/*
 * Open hello.txt read/write, map it with MAP_SHARED, and replace its first
 * byte 'H' with 'J'. Because the mapping is shared, the store is carried
 * through to the file itself.
 *
 * Returns 0 on success and 1 on any failure (open/fstat/mmap error, an empty
 * file, or a first byte other than 'H'). On failure the file is not modified.
 */
int main(){
    int rc = 1;
    struct stat myStat = {0};
    char *addr = MAP_FAILED;
    size_t size = 0;

    int myFile = open("hello.txt", O_RDWR);
    if (myFile < 0){
        fprintf(stderr, "open error\n");
        return rc;
    }

    if (fstat(myFile, &myStat)){
        fprintf(stderr, "fstat error\n");
        goto out_close;
    }

    /* mmap() rejects a zero-length mapping, and there would be no byte to patch. */
    if (myStat.st_size <= 0){
        fprintf(stderr, "Error: file is empty\n");
        goto out_close;
    }
    size = (size_t)myStat.st_size;

    addr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, myFile, 0);
    if (addr == MAP_FAILED){
        fprintf(stderr, "mmap error\n");
        goto out_close;
    }

    /* Only patch the byte when the file really starts with 'H'. */
    if (addr[0] != 'H'){
        fprintf(stderr, "Error: first char in file not H");
        goto out_unmap;
    }
    addr[0] = 'J';
    rc = 0;

out_unmap:
    munmap(addr, size);
out_close:
    close(myFile);
    return rc;
}
I want to copy the whole of a file into memory using mmap in C. I wrote this code:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <errno.h>
/*
 * Question code (reported by its author to die with "Bus error"): map one
 * page of 2.txt and print the mapping as a string.
 *
 * The code is kept exactly as posted; the NOTE(review) comments mark the
 * lines the answers below refer to.
 */
int main(int arg, char *argv[])
{
/* c, numOfWs, numOfPr, numberOfCharacters, i, k, wsP and prP are declared
 * but not referenced anywhere in this function */
char c ;
int numOfWs = 0 ;
int numOfPr = 0 ;
int numberOfCharacters ;
int i=0;
int k;
int pageSize = getpagesize();
char *data;
float wsP = 0;
float prP = 0;
/* fp holds the int descriptor returned by open(); it is not checked for -1 */
int fp = open("2.txt", O_RDWR);
/* NOTE(review): the last argument (file offset) is pageSize, not 0, so the
 * mapping starts one page into the file; the result is not compared
 * against MAP_FAILED */
data = mmap((caddr_t)0, pageSize, PROT_READ, MAP_SHARED, fp,pageSize);
/* NOTE(review): %s keeps reading until it meets a 0 byte in data */
printf("%s\n", data);
exit(0);
}
When I execute the code I get a "Bus error" message.
Next, I want to iterate over the copied file and do something with it.
How can I copy the file correctly?
2 things.
The second parameter of mmap() is the size of the portion of file you want to make visible in your address space. The last one is the offset in the file from which you want the map. This means that as you have called mmap() you will see only 1 page (on x86 and ARM it's 4096 bytes) starting at offset 4096 in your file. If your file is smaller than 4096 bytes, then there will be no mapping and mmap() will return MAP_FAILED (i.e. (caddr_t)-1). You didn't check the return value of the function so the following printf() dereferences an illegal pointer => BUS ERROR.
Using a memory map with string functions can be difficult. If the file doesn't contain a binary 0 (a NUL byte), these functions can try to read past the mapped size of the file and touch unmapped memory => SEGFAULT.
To create a memory map for a file, you have to know the size of the file.
/*
 * Map the whole file: ask fstat() for its size, then pass that size to
 * mmap(). fp is the int descriptor returned by open() in the question; the
 * same descriptor must be used for both calls.
 */
struct stat filestat;
if(fstat(fp, &filestat) !=0) {
    perror("stat failed");
    exit(1);
}
data = mmap(NULL, filestat.st_size, PROT_READ, MAP_SHARED, fp, 0);
if(data == MAP_FAILED) {
    perror("mmap failed");
    exit(2);
}
EDIT: The memory map will always be opened with a size that is a multiple of the pagesize. This means that the last page will be filled with 0 up to the next multiple of the pagesize. Often programs using memory mapped files with string functions (like your printf()) will work most of the time, but will suddenly crash when mapping a file with a size that is exactly a multiple of the page size (4096, 8192, 12288 etc.). The often seen advice to pass to mmap() a size bigger than the real file size works on Linux but is not portable and is even in violation of POSIX, which explicitly states that mapping beyond the file size is undefined behaviour. The only portable way is to not use string functions on memory maps.
The last parameter of mmap is the offset within the file, where the part of file mapped to memory starts. It shall be 0 in your case
data = mmap(NULL, pageSize, PROT_READ, MAP_SHARED, fp,0);
If your file is shorter than pageSize, you will not be able to use addresses beyond the end of file. To use the full size, you shall expand the size to pageSize before calling mmap. Use something like:
ftruncate(fp, pageSize);
If you want to write to the memory (file) you shall use flag PROT_WRITE as well. I.e.
data = mmap(NULL, pageSize, PROT_READ|PROT_WRITE, MAP_SHARED, fp,0);
If your file does not contain 0 character (as end of string) and you want to print it as a string, you shall use printf with explicitly specified maximum size:
printf("%.*s\n", pageSize, data);
Also, of course, as pointed out by @Jongware, you should test the result of open for -1 and the result of mmap for MAP_FAILED.
I have a ASCII file where every line contains a record of variable length. For example
Record-1:15 characters
Record-2:200 characters
Record-3:500 characters
...
...
Record-n: X characters
As the file sizes is about 10GB, i would like to read the record in chunks. Once read, i need to transform them, write them into another file in binary format.
So, for reading, my first reaction was to create a char array such as
FILE *stream;
char buffer[104857600]; //100 MB char array
fread(buffer, sizeof(buffer), 104857600, stream);
Is it correct to assume, that linux will issue one system call and fetch the entire 100MB?
As the records are separated by new line, i search for character by character for a new line character in the buffer and reconstruct each record.
My question is that is this how i should read in chunks or is there a better alternative to read data in chunks and reconstitute each record? Is there an alternative way to read x number of variable sized lines from an ASCII file in one call ?
Next during write, i do the same. I have a write char buffer, which i pass to fwrite to write a whole set of records in one call.
fwrite(buffer, sizeof(buffer), 104857600, stream);
UPDATE: If i setbuf(stream, buffer), where buffer is my 100MB char buffer, would fgets return from buffer or cause a disk IO?
Yes, fread will fetch the entire thing at once. (Assuming it's a regular file.) But it won't read 105 MB unless the file itself is 105 MB, and if you don't check the return value you have no way of knowing how much data was actually read, or if there was an error.
Use fgets (see man fgets) instead of fread. This will search for the line breaks for you.
/* Read the stream one line at a time; fgets() returns NULL at end of file or on error. */
char linebuf[1000];
FILE *file = ...;
while (fgets(linebuf, sizeof(linebuf), file)) {
    // decode one line
}
There is a problem with your code.
char buffer[104857600]; // too big
If you try to allocate a large buffer (105 MB is certainly large) on the stack, then it will fail and your program will crash. If you need a buffer that big, you will have to allocate it on the heap with malloc or similar. I'd certainly keep stack usage for a single function in the tens of KB at most, although you could probably get away with a few MB on most stock Linux systems.
As an alternative, you could just mmap the entire file into memory. This will not improve or degrade performance in most cases, but it easier to work with.
/*
 * Map the whole of `filename` read-only. Any failure (open, fstat, a size
 * that does not fit in size_t, or mmap) ends the program through abort().
 */
int r, fdes;
struct stat st;
void *ptr;
size_t sz;
fdes = open(filename, O_RDONLY);
if (fdes < 0) abort();
r = fstat(fdes, &st);
if (r) abort();
if (st.st_size > (size_t) -1) abort(); // too big to map
sz = st.st_size;
ptr = mmap(NULL, sz, PROT_READ, MAP_SHARED, fdes, 0);
if (ptr == MAP_FAILED) abort();
close(fdes); // file no longer needed
// now, ptr has the data, sz has the data length
// the data is NOT NUL-terminated: use length-bounded functions
// (memchr, memcmp, ...) with sz rather than the str* family
The advantage of using mmap is that your program won't run out of memory. On a 64-bit system, you can put the entire file into your address space at the same time (even a 10 GB file), and the system will automatically read new chunks as your program accesses the memory. The old chunks will be automatically discarded, and re-read if your program needs them again.
It's a very nice way to plow through large files.
If you can, you might find that mmaping the file will be easiest. mmap maps a (portion of a) file into memory so the whole file can be accessed essentially as an array of bytes. In your case, you might not be able to map the whole file at once it would look something like:
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/mman.h>
/* ... */
struct stat stat_buf;
long pagesz = sysconf(_SC_PAGESIZE);
int fd = fileno(stream);
off_t line_start = 0;
char *file_chunk = NULL;
char *input_line;
off_t cur_off = 0;
off_t map_offset = 0;
/* map 16M plus pagesize to ensure any record <= 16M will always fit in the mapped area */
size_t map_size = 16*1024*1024+pagesz;
if (map_offset + map_size > stat_buf.st_size) {
map_size = stat_buf.st_size - map_offset;
}
fstat(fd, &stat_buf);
/* map the first chunk of the file */
file_chunk = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, map_offset);
// until we reach the end of the file
while (cur_off < stat_buf.st_size) {
/* check if we're about to read outside the current chunk */
if (!(cur_off-map_offset < map_size)) {
// destroy the previous mapping
munmap(file_chunk, map_size);
// round down to the page before line_start
map_offset = (line_start/pagesz)*pagesz;
// limit mapped region to size of file
if (map_offset + map_size > stat_buf.st_size) {
map_size = stat_buf.st_size - map_offset;
}
// map the next chunk
file_chunk = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, map_offset);
// adjust the line start for the new mapping
input_line = &file_chunk[line_start-map_offset];
}
if (file_chunk[cur_off-map_offset] == '\n') {
// found a new line, process the current line
process_line(input_line, cur_off-line_start);
// set up for the next one
line_start = cur_off+1;
input_line = &file_chunk[line_start-map_offset];
}
cur_off++;
}
Most of the complication is to avoid making too huge a mapping. You might be able to map the whole file using
char *file_data = mmap(NULL, stat_buf.st_size, PROT_READ, MAP_SHARED, fd, 0);
My suggestion is to use fgets(buff), which detects the newline for you,
and then use strlen(buff) to keep a running total of the buffered bytes:
if( (total+strlen(buff)) > 104857600 )
then start writing into a new chunk.
Note, however, that a chunk's size will then rarely be exactly 104857600 bytes.
CMIIW (correct me if I'm wrong).