I am trying to measure direct I/O performance. As I understand it, O_DIRECT bypasses the page cache and fetches the data from the underlying device. Therefore, if we read the same file over and over again, direct I/O should be slower than reads that go through the page cache, since the cached copy would be served from memory.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>   /* malloc, free */
#include <string.h>   /* strerror */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>   /* read, lseek, ftruncate, close */
#include <time.h>
char *DIRECT_FILE_PATH = "direct.dat";
char *NON_DIRECT_FILE_PATH = "no_direct.dat";
int FILE_SIZE_MB = 100;
int NUM_ITER = 100;
void lay_file(int direct_flag) {
    int flag = O_RDWR | O_CREAT | O_APPEND | O_DIRECT;
    mode_t mode = 0644;
    int fd;
    if (direct_flag) {
        fd = open(DIRECT_FILE_PATH, flag, mode);
    } else {
        fd = open(NON_DIRECT_FILE_PATH, flag, mode);
    }
    if (fd == -1) {
        printf("Failed to open file. Error: \t%s\n", strerror(errno));
        return;
    }
    ftruncate(fd, FILE_SIZE_MB*1024*1024);
    close(fd);
}
void read_file(int direct_flag) {
    mode_t mode = 0644;
    /* Note: O_DIRECT normally requires an aligned buffer; a 100 MB malloc()
       is typically page-aligned via mmap(), but posix_memalign() is safer. */
    void *buf = malloc(FILE_SIZE_MB*1024*1024);
    int fd, flag;
    if (direct_flag) {
        flag = O_RDONLY | O_DIRECT;
        fd = open(DIRECT_FILE_PATH, flag, mode);
    } else {
        flag = O_RDONLY;
        fd = open(NON_DIRECT_FILE_PATH, flag, mode);
    }
    for (int i=0; i<NUM_ITER; i++) {
        read(fd, buf, FILE_SIZE_MB*1024*1024);
        lseek(fd, 0, SEEK_SET);
    }
    close(fd);
    free(buf);
}
int main() {
    lay_file(0);
    lay_file(1);
    clock_t t;
    t = clock();
    read_file(1);
    t = clock() - t;
    double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds
    printf("DIRECT I/O read took %f seconds to execute \n", time_taken);
    t = clock();
    read_file(0);
    t = clock() - t;
    time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds
    printf("NON DIRECT I/O read took %f seconds to execute \n", time_taken);
    return 0;
}
Using the above code to measure direct I/O performance tells me that direct I/O is faster than regular I/O through the page cache. This is the output:
DIRECT I/O read took 0.824861 seconds to execute
NON DIRECT I/O read took 1.643310 seconds to execute
Please let me know if I am missing something. I have an NVMe SSD as the storage device. I wonder if it is too fast to really show the performance difference between using and not using the page cache.
UPDATE:
Changing the buffer size to 4 KB shows that direct I/O is slower. The large buffer size was probably producing large sequential reads from the underlying device, which favours direct I/O, but I would still like some insight.
DIRECT I/O read took 0.000209 seconds to execute
NON DIRECT I/O read took 0.000151 seconds to execute
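A note that may matter here: O_DIRECT generally requires the buffer address, the file offset, and the transfer size to be aligned to the logical block size of the device, and read() fails with EINVAL otherwise. A large malloc() is typically page-aligned via mmap(), but that is not guaranteed, so posix_memalign() plus a checked read is safer. A minimal standalone sketch (the 4096-byte alignment and the 1 MiB chunk size are assumptions, not values from the program above):
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define ALIGNMENT 4096            /* assumed logical block size */
#define CHUNK (1024 * 1024)       /* 1 MiB per read, also a multiple of ALIGNMENT */

int main(void) {
    void *buf;
    if (posix_memalign(&buf, ALIGNMENT, CHUNK) != 0) {
        perror("posix_memalign");
        return 1;
    }
    int fd = open("direct.dat", O_RDONLY | O_DIRECT);
    if (fd == -1) {
        perror("open");
        return 1;
    }
    ssize_t n;
    while ((n = read(fd, buf, CHUNK)) > 0)
        ;                         /* data discarded; only the transfer is of interest */
    if (n == -1)
        fprintf(stderr, "read failed: %s\n", strerror(errno)); /* EINVAL usually means an alignment problem */
    close(fd);
    free(buf);
    return 0;
}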
Related
I have the following function used to test the file system (yaffs2). It is called by a FreeRTOS task with a stack size of 65535. In debug mode, everything appears to work fine, but when I enable -O1, I get a hard fault immediately after printing "Testing filesystem...".
I tried changing the test size to a much lower value, thinking there might be a memory issue (note that the FreeRTOS heap is 7 MB), but nothing changed. Any ideas what might be causing this?
#include <stdio.h>
#include <string.h>   /* memset, memcmp */
#include <stdint.h>   /* uint8_t, uint32_t */
#include "FreeRTOS.h"
#include "task.h"
#include "yaffs_trace.h"
#include "yaffsfs.h"
int fs_test(void)
{
    int fd;
    int test_size = 1000000;
    const char path[] = "nand/testfile.dat";
    int pattern = 0x55;
    uint8_t *test;
    uint8_t *test_read;
    uint32_t start_time, stop_time;
    DEBUG_PRINT("Testing filesystem...\n");
    fd = yaffs_open(path, O_CREAT | O_EXCL | O_WRONLY, S_IREAD | S_IWRITE);
    if (fd < 0) {
        DEBUG_PRINT("cannot create test file\n");
        return -1;
    }
    test = pvPortMalloc(test_size);
    test_read = pvPortMalloc(test_size);
    if (test == NULL || test_read == NULL) {
        DEBUG_PRINT("not enough memory\n");
        vPortFree(test);
        vPortFree(test_read);
        yaffs_close(fd);
        yaffs_rm(path);
        return -2;
    }
    memset(test, pattern, test_size);
    start_time = xTaskGetTickCount();
    yaffs_write(fd, test, test_size);
    yaffs_close(fd);
    stop_time = xTaskGetTickCount();
    DEBUG_PRINTF("Average write speed = %d kB/s\n", test_size / (stop_time - start_time));
    fd = yaffs_open(path, O_RDWR, S_IREAD | S_IWRITE);
    if (fd < 0) {
        DEBUG_PRINT("cannot open test file\n");
        vPortFree(test);
        vPortFree(test_read);
        yaffs_close(fd);
        yaffs_rm(path);
        return -1;
    }
    start_time = xTaskGetTickCount();
    yaffs_read(fd, test_read, test_size);
    yaffs_close(fd);
    stop_time = xTaskGetTickCount();
    DEBUG_PRINTF("Average read speed = %d kB/s\n", test_size / (stop_time - start_time));
    if (!memcmp(test, test_read, test_size))
        DEBUG_PRINT("file integrity test successful\n");
    else
        DEBUG_PRINT("file integrity test failed!\n");
    yaffs_rm(path);
    vPortFree(test);
    vPortFree(test_read);
    return 0;
}
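If a stack or heap problem is suspected, one quick check is to log how much of the task's stack is ever used. A sketch (it assumes INCLUDE_uxTaskGetStackHighWaterMark is set to 1 in FreeRTOSConfig.h, and it reuses the DEBUG_PRINTF macro from the code above; the helper name is mine):
#include "FreeRTOS.h"
#include "task.h"

/* Sketch: log how close the calling task gets to exhausting its stack.
 * Call it before and after the heavy filesystem calls. */
void log_stack_headroom(const char *where)
{
    UBaseType_t words_left = uxTaskGetStackHighWaterMark(NULL); /* NULL = calling task */
    DEBUG_PRINTF("%s: %lu stack words never used\n",
                 where, (unsigned long)words_left);
}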
I am trying to understand the inner workings of Unix-based OSs. I was reading about buffered I/O and how the buffer size affects the number of system calls made, which in turn affects the total time taken by, say, a copy program. To begin with, here is my program:
#include <stdio.h>
#include <stdlib.h>     /* atoi */
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>   /* S_IRUSR, S_IWUSR */
#include <sys/time.h>   /* gettimeofday */
#include <time.h>
long currentTimeMillis();
int main(int argc, char *argv[]) {
    int bufsize = atoi(argv[3]);
    printf("copying with buffer size %d\n", bufsize);
    char buf[bufsize];
    //open the source file
    int fd_from = open(argv[1], O_RDWR);
    if(-1 == fd_from) {
        printf("Error opening source file\n");
        return -1;
    }
    //file to be copied to
    int fd_to = open(argv[2], O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR);
    if(-1 == fd_to) {
        printf("Error opening destination file\n");
        return -1;
    }
    //copy
    long startTime = currentTimeMillis();
    long totalTimeForRead = 0;
    long totalTimeForWrite = 0;
    while(1) {
        long readStartTime = currentTimeMillis();
        int bytes_read = read(fd_from, buf, bufsize);
        long readEndTime = currentTimeMillis();
        if(0 == bytes_read) {
            break;
        }
        if(-1 == bytes_read) {
            printf("Error occurred while reading source file\n");
            return -1;
        }
        totalTimeForRead += readEndTime - readStartTime;
        long writeStartTime = currentTimeMillis();
        int bytes_written = write(fd_to, buf, bytes_read); /* write only what was read; the last chunk may be short */
        long writeEndTime = currentTimeMillis();
        totalTimeForWrite += (writeEndTime - writeStartTime);
        if(-1 == bytes_written) {
            printf("Some error occurred while writing file\n");
            return -1;
        }
    }
    long endTime = currentTimeMillis();
    printf("Total time to copy%ld\n", endTime - startTime);
    printf("Total time to write%ld\n", totalTimeForWrite);
    printf("Total time to read%ld\n", totalTimeForRead);
}

long currentTimeMillis() {
    struct timeval time;
    gettimeofday(&time, NULL);
    return time.tv_sec * 1000 + time.tv_usec / 1000;
}
I am using a 16 GB MacBook Pro with a 2.9 GHz Intel i7 (in case this information is useful). The size of the source file is 2.8 GB. I was a bit surprised to see that the total time taken by read() is much smaller than by write(). Here are my findings with a buffer size of 16K:
./a.out largefile dest 16382
copying with buffer size 16382
Total time to copy5987
Total time to write5330
Total time to read638
From what I have read, write() returns immediately after transferring the data from the user buffer to the kernel buffer, so the time it takes is that copy plus the cost of the system call itself. read() likewise copies from the kernel buffer to the user buffer, so the total time taken should be about the same (in both cases, there is no disk I/O).
Why is it then that there is such a drastic difference in the results? Or am I benchmarking it wrong? Final question: is it all right to do such benchmarking on an SSD, which has limited write cycles?
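One way to also account for the deferred writeback would be to time an explicit fsync() on the destination after the copy loop, since the write() timings above only cover the copy into the page cache. A rough sketch, reusing fd_to, currentTimeMillis() and totalTimeForWrite from the program above, placed just before the final printf calls:
long flushStartTime = currentTimeMillis();
if (fsync(fd_to) == -1) {          /* force the buffered data out to the device */
    printf("Error flushing destination file\n");
    return -1;
}
long flushEndTime = currentTimeMillis();
totalTimeForWrite += flushEndTime - flushStartTime;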
I am doing some benchmarking (on OS X) to see how using the file system influences bandwidth and so on. I am using concurrency in the hope of creating fragmentation in the FS.
However, it looks like using the FS is more efficient than raw disk accesses. Why?
Here is my code:
#include <pthread.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>

#define NO_THREADS (2)
#define PACKET_SIZE (1024 * 4)
#define SIZE_TO_WRITE (1024 * 1024 * 1024)

void *write_buffer(void *arg) {
    int *p_start = arg;
    int start = *p_start;
    char buffer[PACKET_SIZE];
    char path[50];
    sprintf(path, "file%d", start);
    int fd = open(path, O_CREAT | O_WRONLY | O_APPEND, 0644);
    //int fd = open("/dev/rdisk0s4", O_WRONLY);
    if (fd < 0) {
        fprintf(stderr, "Could not open.\n");
        goto end;
    }
    //lseek(fd, start * SIZE_TO_WRITE, SEEK_SET);
    int current;
    for (current = start; current < start + SIZE_TO_WRITE; current += PACKET_SIZE) {
        int i;
        for (i = 0; i < PACKET_SIZE; ++i) {
            buffer[i] = i + current;
        }
        if (PACKET_SIZE != write(fd, buffer, PACKET_SIZE)) {
            fprintf(stderr, "Could not write packet %d properly.\n", current);
            goto close;
        }
    }
    fsync(fd);
close:
    close(fd);
end:
    pthread_exit(0);
}

void flush(void) {
    fflush(stdout);
    fflush(stderr);
}

int main(void) {
    pthread_t threads[NO_THREADS];
    int starts[NO_THREADS];
    int i;
    atexit(flush);
    for (i = 0; i < NO_THREADS; ++i) {
        starts[i] = i;
        if (pthread_create(threads + i, NULL, write_buffer, (void *)(starts + i))) {
            fprintf(stderr, "Error creating thread no %d\n", i);
            return EXIT_FAILURE;
        }
    }
    for (i = 0; i < NO_THREADS; ++i) {
        if (pthread_join(threads[i], NULL)) {
            fprintf(stderr, "Error joining thread\n");
            return EXIT_FAILURE;
        }
    }
    puts("Done");
    return EXIT_SUCCESS;
}
With the help of the FS, the two threads finish writing in 31.33 seconds. Writing to the raw device instead, it takes minutes...
When you use /dev/rdisk0s4 instead of /path/to/normal/file%d, for every write you perform the OS will issue a disk I/O. Even if that disk is an SSD, that means that the round-trip time is probably at least a few hundred microseconds on average. When you write to the file instead, the filesystem isn't actually issuing your writes to disk until later. The Linux man page describes this well:
A successful return from write() does not make any guarantee that data has been committed to disk. In fact, on some buggy implementations, it does not even guarantee that space has successfully been reserved for the data. The only way to be sure is to call fsync(2) after you are done writing all your data.
So, the data you wrote is being buffered by the filesystem, which only requires that a copy be made in memory -- this probably takes on the order of a few microseconds at most. If you want to do an apples-to-apples comparison, you should make sure you're doing synchronous I/O for both test cases. Even running fsync after the whole test is done will probably allow the filesystem to be much faster, since it will batch up the I/O into one continuous streaming write, which could be faster than what your test directly on the disk can achieve.
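For example, a minimal change to the file-backed test in the question would be to request synchronous writes when opening the file. This is only a sketch: O_SYNC is POSIX, F_NOCACHE is a macOS-specific fcntl() that additionally bypasses the buffer cache, and the 0644 mode is an assumption:
/* Sketch: make the file-backed writes synchronous so they are
   comparable to writes that go straight to the raw device. */
int fd = open(path, O_CREAT | O_WRONLY | O_APPEND | O_SYNC, 0644);
#ifdef F_NOCACHE
fcntl(fd, F_NOCACHE, 1);   /* macOS: also bypass the unified buffer cache */
#endif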
In general, writing good systems benchmarks is incredibly difficult, especially when you don't know a lot about the system you're trying to test. I'd recommend using an off-the-shelf Unix filesystem benchmarking toolkit if you want high quality results -- otherwise, you could spend literally a lifetime learning about performance pathologies of the OS and FS you're testing... not that that's a bad thing, if you're interested in it like I am :-)
I have a C program that writes 32768 blocks, each block is 16K size (total size of 512MB), to an ext4 filesystem on a system running 3.18.1 kernel. The regular write system call version of this program takes 5.35 seconds to finish the writes (as measured by gettimeofday before and after the for loop). The async io version of this program however takes the following times:
to queue all the aio_writes (32768 aio_writes): 7.43 seconds
poll to finish each IO request: additional 4.93 seconds
The output files are opened with these flags: O_WRONLY, O_CREAT, O_NONBLOCK.
Why does async I/O take more than double the write() time? Even the ratio of time-to-queue-the-async-requests to time-for-the-synchronous-writes is 1.4.
Since some people marked it off-topic, I looked at the definition and decided to paste the code; that seems to be the only reason it could be marked off-topic. I am not asking why the code is not working, only why aio is so much slower than regular writes, especially since all the parallel writes are to different blocks. Here is the aio code, followed by the non-aio code:
AIO program
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define MAX_AIO (16384*2)
#define BUFSIZE 16384

struct mys {
    int status;
    struct aiocb aio;
};

/* set_buf() fills each buffer with a per-request value; definition not shown */

void set_aiocb(struct mys *aio, int num, int fd)
{
    int i;
    for (i = 0; i < num; i++) {
        aio[i].aio.aio_fildes = fd;
        aio[i].aio.aio_offset = BUFSIZE * i;
        aio[i].aio.aio_buf = malloc(BUFSIZE);
        set_buf(aio[i].aio.aio_buf, BUFSIZE, i);
        aio[i].aio.aio_nbytes = BUFSIZE;
        aio[i].aio.aio_reqprio = fd;
        aio[i].aio.aio_sigevent.sigev_notify = SIGEV_NONE;
        aio[i].aio.aio_sigevent.sigev_signo = SIGUSR1;
        aio[i].aio.aio_sigevent.sigev_value.sival_ptr = &aio[i];
        aio[i].aio.aio_lio_opcode = 0;
        aio[i].status = EINPROGRESS;
    }
}

void main(void)
{
    int fd = open("/tmp/AIO", O_WRONLY | O_CREAT, 0666);
    int i, open_reqs = MAX_AIO;
    struct mys aio[MAX_AIO];
    struct timeval start, end, diff;
    set_aiocb(aio, MAX_AIO, fd);
    gettimeofday(&start, NULL);
    for (i = 0; i < MAX_AIO; i++)
        aio_write(&aio[i].aio);
    while (open_reqs > 0) {
        for (i = 0; i < MAX_AIO; i++) {
            if (aio[i].status == EINPROGRESS) {
                aio[i].status = aio_error(&(aio[i].aio));
                if (aio[i].status != EINPROGRESS)
                    open_reqs--;
            }
        }
    }
    gettimeofday(&end, NULL);
    timersub(&end, &start, &diff);
    printf("%d.%d\n", (int)diff.tv_sec, (int)diff.tv_usec);
}
Regular IO program
#include <fcntl.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

#define MAX_AIO (16384*2)
#define BUFSIZE 16384

char buf[MAX_AIO][BUFSIZE];

void main(void)
{
    int i, fd = open("/tmp/NON_AIO", O_WRONLY | O_CREAT, 0666);
    struct timeval start, end, diff;
    gettimeofday(&start, NULL);
    for (i = 0; i < MAX_AIO; i++)
        write(fd, buf[i], BUFSIZE);
    gettimeofday(&end, NULL);
    timersub(&end, &start, &diff);
    printf("%d.%d\n", (int)diff.tv_sec, (int)diff.tv_usec);
}
You aren't really comparing apples with apples.
In the AIO code, you have a separately allocated buffer for each of the write operations, so the program has 512 MiB of memory allocated (16384 × 2 buffers of 16 KiB each), plus the 32 K copies of the AIO control structure. That memory has to be allocated and initialized (each buffer gets a different value if the set_buf() function, which is not shown, sets each byte of the buffer to the value of its third parameter), then copied by the kernel to the driver, possibly via the kernel buffer pool.
In the regular IO code, you have one big contiguous buffer, initialized to all zeroes, which you write to the disk.
To make the comparison equitable, you should use the same infrastructure in both programs: create the AIO structures in both, but have the regular IO code simply step through the structures, writing the data portion of each (while the AIO code behaves more or less as shown). I expect you will find that the performance is much more similar when you do that.
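For instance, the synchronous test could step through the very same structures that set_aiocb() fills in, so both programs pay the same allocation and initialization cost. A rough sketch of that loop, borrowing the aio array, set_aiocb() and the timing variables from the AIO program (pwrite() is used so the per-request offsets are honoured):
/* Sketch: synchronous writes over the same per-request buffers as the AIO test. */
set_aiocb(aio, MAX_AIO, fd);                      /* identical setup cost */
gettimeofday(&start, NULL);
for (i = 0; i < MAX_AIO; i++) {
    if (pwrite(fd, (const void *)aio[i].aio.aio_buf,
               aio[i].aio.aio_nbytes, aio[i].aio.aio_offset) != BUFSIZE)
        perror("pwrite");
}
gettimeofday(&end, NULL);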
I am reading bytes from a serial port in C++ using a file descriptor and the POSIX read() function. In this example, I am reading 1 byte from the serial port (baud rate settings and similar are omitted for clarity):
#include <termios.h>
#include <fcntl.h>
#include <unistd.h>
int main(void)
{
    int fd = open("/dev/ttyS0", O_RDWR | O_NOCTTY);
    char buf[1];
    int bytesRead = read(fd, buf, 1);
    close(fd);
    return 0;
}
If the device connected to /dev/ttyS0 does not send any information, the program will hang. How can I set a timeout?
I have tried setting a time out like this:
struct termios options;
tcgetattr(fd, &options);
options.c_cc[VMIN] = 0;
options.c_cc[VTIME] = 10;
tcsetattr(fd, TCSANOW, &options);
I thought it was supposed to give a 1-second timeout, but it makes no difference. I think I have misunderstood VMIN and VTIME. What are VMIN and VTIME used for?
Then I searched the web and found somebody talking about the select() function. Is that the solution, and if so, how would one apply it to the program above to get a 1-second timeout?
Any help is appreciated. Thanks in advance :-)
Yes, use select(2). Pass in a file descriptor set containing just your fd in the read set and empty write/exception sets, and pass in an appropriate timeout. For example:
int fd = open(...);
// Initialize file descriptor sets
fd_set read_fds, write_fds, except_fds;
FD_ZERO(&read_fds);
FD_ZERO(&write_fds);
FD_ZERO(&except_fds);
FD_SET(fd, &read_fds);
// Set timeout to 1.0 seconds
struct timeval timeout;
timeout.tv_sec = 1;
timeout.tv_usec = 0;
// Wait for input to become ready or until the time out; the first parameter is
// 1 more than the largest file descriptor in any of the sets
if (select(fd + 1, &read_fds, &write_fds, &except_fds, &timeout) == 1)
{
    // fd is ready for reading
}
else
{
    // timeout or error
}
What are VMIN and VTIME used for?
If MIN > 0 and TIME = 0, MIN sets the number of characters to receive
before the read is satisfied. As TIME is zero, the timer is not used.
If MIN = 0 and TIME > 0, TIME serves as a timeout value. The read will
be satisfied if a single character is read, or TIME is exceeded (t =
TIME *0.1 s). If TIME is exceeded, no character will be returned.
If MIN > 0 and TIME > 0, TIME serves as an inter-character timer. The
read will be satisfied if MIN characters are received, or the time
between two characters exceeds TIME. The timer is restarted every time
a character is received and only becomes active after the first
character has been received.
If MIN = 0 and TIME = 0, read will be satisfied immediately. The
number of characters currently available, or the number of characters
requested will be returned. According to Antonino (see contributions),
you could issue a fcntl(fd, F_SETFL, FNDELAY); before reading to get
the same result.
Source: http://tldp.org/HOWTO/Serial-Programming-HOWTO/x115.html
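One detail that is easy to miss: VMIN and VTIME are only honoured in non-canonical mode, so if ICANON is still set (the default on a terminal device) the settings shown in the question have no effect. A minimal sketch of a 1-second read timeout (the helper name is mine and error handling is kept minimal):
#include <termios.h>

/* Sketch: VMIN/VTIME only apply when canonical mode is off. */
static int set_read_timeout(int fd)
{
    struct termios options;
    if (tcgetattr(fd, &options) == -1)
        return -1;
    options.c_lflag &= ~(ICANON | ECHO);  /* non-canonical input, no echo */
    options.c_cc[VMIN] = 0;               /* return as soon as any data arrives */
    options.c_cc[VTIME] = 10;             /* ...or after 10 tenths of a second */
    return tcsetattr(fd, TCSANOW, &options);
}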
You can try catching a signal to stop the read operation: call alarm(1) before read(), and if read() has not returned when the alarm fires, SIGALRM is delivered and a signal handler can catch it, like this:
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <setjmp.h>
#include <fcntl.h>
#include <unistd.h>

static jmp_buf env_alarm;

static void sig_alarm(int signo)
{
    longjmp(env_alarm, 1);
}

int main(void)
{
    int fd = open("/dev/ttyS0", O_RDWR | O_NOCTTY);
    char buf[1];
    if (signal(SIGALRM, sig_alarm) == SIG_ERR)
    {
        exit(0);
    }
    if (setjmp(env_alarm) != 0)
    {
        close(fd);
        printf("Timeout Or Error\n");
        exit(0);
    }
    alarm(1);
    int bytesRead = read(fd, buf, 1);
    alarm(0);
    close(fd);
    return 0;
}
But using select, poll, or epoll will be better if your program is large.
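For comparison, a poll(2) version of the same 1-second timeout is very short; this is only a sketch and assumes fd is already open and configured:
#include <poll.h>
#include <unistd.h>

/* Sketch: wait up to 1000 ms for the serial port to become readable. */
struct pollfd pfd = { .fd = fd, .events = POLLIN };
int ret = poll(&pfd, 1, 1000);
if (ret == 1 && (pfd.revents & POLLIN)) {
    char buf[1];
    read(fd, buf, 1);   /* data is available; this read will not block */
} else if (ret == 0) {
    /* timed out */
} else {
    /* error */
}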
select() is the way I would solve this problem.
There are several pages on the internet that will give info on how to use select(), such as http://www.unixguide.net/unix/programming/2.1.1.shtml
There are several possible approaches. If the program will eventually be timing more than one I/O operation, select() is the clear choice.
However, if the only input is from this I/O, then using non-blocking I/O and timing it yourself is a straightforward method. I have expanded it from single-character to multi-character I/O to make it a more generally complete example:
#include <fcntl.h>
#include <termios.h>
#include <unistd.h>
#include <sys/time.h>

int main(void)
{
    int fd = open("/dev/ttyS0", O_RDWR | O_NOCTTY | O_NDELAY); // sometimes "O_NONBLOCK"
    char buf[10];
    int done = 0, inbuf = 0;
    struct timeval start, now;

    gettimeofday(&start, NULL);
    while (!done)
    {
        int bytesRead = read(fd, &buf[inbuf], sizeof buf - inbuf);
        if (bytesRead < 0)
        {
            error_processing_here();
            continue;
        }
        if (bytesRead == 0) // no data ready to read
        {
            gettimeofday(&now, NULL);
            if ((now.tv_sec - start.tv_sec) * 1000000 +
                now.tv_usec - start.tv_usec > timeout_value_in_microsecs)
            {
                done = 2; // timeout
                continue;
            }
            sleep(1); // not timed out yet, sleep a second
            continue;
        }
        inbuf += bytesRead;
        if (we have read all we want)
            done = 1;
    }
    if (done == 2)
        timeout_condition_handling();
    close(fd);
    return 0;
}