Poor `mmap` performance under substantial system load

We're currently facing a somewhat complex problem with mmap performance on our Linux server.
We use a server with a 64-core AMD Opteron 6374 and 128 GiB of RAM. On it, we created a QEMU virtual machine with the same core count and 64 GiB of RAM, which we use for unit-testing a program I wrote. There are around 60 unit tests that run in parallel, each of which allocates a little over 1 GiB of RAM. Because the process memory compresses really well, we decided to enable zram. During our tests, the memory usage dropped to around 300 MiB per process, which is a significant gain at a relatively small performance cost (the swap area stays in physical memory).
Currently our tests don't swap yet, but we've observed very poor mmap performance. A single call to mmap can, from our testing, take up to 7 minutes (without swapping, of course; the process allocates maybe somewhere between 2 and 20 MB/s). Sometimes, though, the mmaps in all 60 processes are nearly instant and each process allocates its required gigabyte right away; the rest of the time we watch them allocate tiny amounts of memory per second in real time.
The program I wrote follows:
// CC0, inspired by dzaima's code, which was inspired by my code.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <signal.h>
#include <unistd.h>
#define u8 uint8_t
#define i32 int32_t
#define u32 uint32_t
#define i64 int64_t
#define u64 uint64_t
#define C const
#define P static
#define _(a...) {return({a;});}
#define F_(n,a...) for(int i=0;i<n;i++){a;}
#define F1(n,x,a...) for(i32 i=0;i<n;i+=x){a;}
#define INLINE P inline __attribute__((always_inline))
#define assert(X) if(!(X))__builtin_unreachable();
#define LKL(c) __builtin_expect((c),1)
typedef u32 W;
#define SZ 19
#define END 1162261467ULL
P C u8 crz[]={1,0,0,9,1,0,2,9,2,2,1},crz2[]={4,3,3,1,0,0,1,0,0,9,9,9,9,9,9,9,4,3,5,1,0,2,1,0,2,9,9,9,9,9,9,9,5,5,4,2,2,1,2,2,1,9,9,9,9,9,9,9,4,3,3,1,0,0,7,6,6,9,9,9,9,9,9,9,4,3,5,1,0,2,7,6,8,9,9,9,9,9,9,9,5,5,4,2,2,1,8,8,7,9,9,9,9,9,9,9,7,6,6,7,6,6,4,3,3,9,9,9,9,9,9,9,7,6,8,7,6,8,4,3,5,9,9,9,9,9,9,9,8,8,7,8,8,7,5,5,4,9,9,9,9,9,9,9};
#define UNR_CRZ(trans,sf1,sf2)W am=a%sf1,ad=a/sf1,dm=d%sf1,dd=d/sf1;r+=k*trans[am+sf2*dm];a=ad;d=dd;k*=sf1;
INLINE W mcrz(W a, W d){W r=0,k=1;
#pragma GCC unroll 16
F_(SZ/2,UNR_CRZ(crz2,9,16))if(SZ&1){UNR_CRZ(crz,3,4)}return r;}
INLINE W mrot(W x)_(W t=END/3,b=x%t,m=b%3,d=b/3;d+m*(t/3)+(x-b))
P u64 pgsiz;
P W*mem,pat[6];
P void mpstb(void*b,u64 l){mmap(b,l,PROT_READ|PROT_WRITE,MAP_POPULATE|MAP_PRIVATE|MAP_ANON|MAP_FIXED,-1,0);}
// SIGSEGV handler: commit the faulting page, then fill it with the repeating pattern
P void sigsegvh(int n,siginfo_t*si,void*_) {
void*a=si->si_addr,*ab=(void*)((u64)a&~(pgsiz-1));mpstb(ab, pgsiz);
W* curr=ab;i64 off=(curr-mem)%(END/3);F1(pgsiz,sizeof(W),*curr++=pat[off++%6]);}
P u64 rup(u64 v)_(((v-1)&~(pgsiz-1))+pgsiz)
#define RDS 65536
__attribute__((hot,flatten))int main(int argc, char* argv[]){
pgsiz=sysconf(_SC_PAGESIZE);mem=mmap(NULL,END*sizeof(W),PROT_NONE,MAP_NORESERVE|MAP_PRIVATE|MAP_ANON,-1,0);
struct sigaction act;memset(&act,0,sizeof(struct sigaction));act.sa_flags=SA_SIGINFO;act.sa_sigaction=sigsegvh;sigaction(SIGSEGV,&act,NULL);
FILE*f=fopen(argv[1],"rb");fseek(f,0,SEEK_END);u64 S=ftell(f);rewind(f);u64 szR=rup(S),off=0;mpstb(mem, szR*sizeof(W));char data[RDS];
C W a1_off=94-((END-1)/6-29524)%94,a2_off=94-((END-1)/3-59048)%94;while(S){int am=LKL(S>RDS)?RDS:S;fread(&data,1,am,f);
#pragma GCC unroll 32
F_(am,W w=data[i];mem[off++]=w)S-=am;}for(;off<szR;off++)mem[off]=mcrz(mem[off-1],mem[off-2]);
W n2=mem[off-2],n1=mem[off-1];u64 off2=off;F_(6,W n0=mcrz(n1,n2);pat[off2%6]=n0;n2=n1;n1=n0;off2++)W c=0,a=0,*d=mem;
P C int offs[]={0,((i64)a1_off-(i64)(END/3))%94+94,((i64)a2_off-(i64)(2*(END/3))%94+94)};P C void*j[94];F_(94,j[i]=&&INS_DEF)
#define M(n) j[n]=&&INS_##n;
M(4)M(5)M(23)M(39)M(40)M(62)M(68)M(81)
#define BRA {goto*j[(c+mem[c]+offs[c/(END/3)])%94];}
BRA;
#define NXT mem[c] = \
"SOMEBODY MAKE ME FEEL ALIVE" \
"[hj9>,5z]&gqtyfr$(we4{WP)H-Zn,[%\\3dL+Q;>U!pJS72FhOA1CB6v^=I_0/8|jsb9m<.TVac`uY*MK'X~xDl}REokN:#?G\"i#" \
"AND SHATTER ME"[mem[c]];c++;d++;BRA
INS_4:c=*d;NXT;INS_5:putchar(a);fflush(stdout);NXT;
INS_23:;int CR=getchar();a=CR==EOF?END-1:CR;NXT;INS_39:a=*d=mrot(*d);NXT;INS_40:d=mem+*d;NXT;
INS_62:a=*d=mcrz(a, *d);INS_68:NXT;INS_81:return 0;INS_DEF:NXT;
}
It's an interpreter for the rotwidth=19 variant of Malbolge Unshackled (compiled with clang fast20.c -w -O3 -march=native -mtune=native -o fast20 -flto -mllvm -polly -fvisibility=hidden; clang -v yields Debian clang version 11.0.1-2). We feed it the source code of my project, passed as an argument to the program, temporarily available here (provided in the hope that our issue can be reproduced; use 7za to unpack it).
Each time I want to run the unit tests, I execute the following shell script:
#!/bin/bash
# XXX: `rsync` is slower
echo "[+] sending test data."
cd kiera-tests && \
tar -czf - * | \
ssh kamila@remote \
"cd ~/malbolgelisp && rm -rf tests && mkdir tests && cd tests && tar -xzf -" && \
cd ..
echo "[+] building essential tools."
ssh kamila@remote "cd ~/malbolgelisp/tests && chmod a+x setup.sh && ./setup.sh"
echo "[+] sending malbolgelisp source code..."
tool/mb_nlib d < lisp.mb | \
pv | gzip -6 | \
ssh kamila@remote \
"gunzip | ~/malbolgelisp/tests/mb_nlib e > ~/malbolgelisp/lisp.mb && vmtouch -vt ~/malbolgelisp/lisp.mb"
echo "[+] running the tests..."
ssh kamila#remote "cd ~/malbolgelisp/tests/ && ./test.sh"
I vmtouch the ~300 MB file, so it should stay in the page cache across runs:
/home/kamila/malbolgelisp/lisp.mb
[OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO] 76711/76711
Files: 1
Directories: 0
Touched Pages: 76711 (299M)
Elapsed: 0.071541 seconds
As we've observed, the cached memory shown by bpytop grows to around 500 MiB, which means the file must have been cached. We also re-upload the file before each run, since it changes significantly between runs.
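To double-check residency from C, here's a minimal sketch using mincore(2); this is a diagnostic we could run on the guest, not part of the tests:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
int main(int argc, char **argv) {
    int fd = open(argv[1], O_RDONLY);
    struct stat st;
    fstat(fd, &st);
    long pg = sysconf(_SC_PAGESIZE);
    size_t pages = (st.st_size + pg - 1) / pg;
    // map the file without touching it, then ask the kernel
    // which of its pages are currently in the page cache
    void *m = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    unsigned char *vec = malloc(pages);
    mincore(m, st.st_size, vec);
    size_t res = 0;
    for (size_t i = 0; i < pages; i++)
        res += vec[i] & 1;
    printf("%zu/%zu pages resident\n", res, pages);
    return 0;
}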
We tried using Valgrind on the interpreter, but it seems to misbehave under it for a yet-unknown reason. It's easy to deduce what is happening in the code, though:
pgsiz=sysconf(_SC_PAGESIZE);
mem=mmap(NULL,END*sizeof(W),PROT_NONE,MAP_NORESERVE|MAP_PRIVATE|MAP_ANON,-1,0);
First, the entire memory area is mapped (as PROT_NONE, so no pages are committed yet).
FILE*f=fopen(argv[1],"rb");fseek(f,0,SEEK_END);u64 S=ftell(f);rewind(f);
u64 szR=rup(S),off=0;mpstb(mem, szR*sizeof(W));
Then I query the file size (~300 MiB; times sizeof(W), that's ~1.2 GiB) and eagerly map that much memory using mpstb:
P void mpstb(void*b,u64 l){
mmap(b,l,PROT_READ|PROT_WRITE,MAP_POPULATE|MAP_PRIVATE|MAP_ANON|MAP_FIXED,-1,0);}
I considered using mprotect instead, but the following parts of the code execute mpstb fairly often, and mprotect would cause IPIs for TLB shootdowns.
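For reference, a minimal sketch of that mprotect variant (hypothetical, using the file's P/u64 shorthands; it is not what we run), which would flip the protection on the already-reserved PROT_NONE region and let pages fault in lazily instead of being populated up front:
P void mpstb_mprot(void *b, u64 l) {
    // no new mapping is created: the kernel only updates the
    // protection bits, and pages are faulted in on first touch
    if (mprotect(b, l, PROT_READ | PROT_WRITE))
        perror("mprotect");
}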
The following bit of code can't be the bottleneck: aside from the I/O it performs (on a cached file, with a relatively big buffer, RDS = 65536, so it should be fast), it's just a bunch of arithmetic, which can't take 7 minutes on one run and a few seconds on another run with the same data:
C W a1_off=94-((END-1)/6-29524)%94,a2_off=94-((END-1)/3-59048)%94;
while(S){int am=LKL(S>RDS)?RDS:S;fread(&data,1,am,f);
#pragma GCC unroll 32
F_(am,W w=data[i];mem[off++]=w)S-=am;}
for(;off<szR;off++)mem[off]=mcrz(mem[off-1],mem[off-2]);
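If it would help, the stall could be attributed to a specific call by wrapping the suspects with a monotonic clock; a throwaway sketch (now_s is a hypothetical helper, not part of the code above):
#include <stdio.h>
#include <time.h>

static double now_s(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

/* usage, e.g. around the eager mapping:
   double t0 = now_s();
   mpstb(mem, szR * sizeof(W));
   fprintf(stderr, "mpstb: %.3f s\n", now_s() - t0); */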
We've also noticed something about the following test runner, which is executed on the server:
#!/bin/bash
for d in b*; do
for f in $d/*.in; do
echo "[+] $f"
(./fast20 ../lisp.mb $f < $f > $f.aout; diff ${f%%.*}.out $f.aout) &
# sleep 3s
done
for job in `jobs -p`; do
wait $job
done
done
Uncommenting the # sleep 3s line makes the allocations much faster, which suggests the Linux kernel simply can't handle dozens of processes each mapping a gigabyte of memory concurrently. We've also seen messages like watchdog: BUG: soft lockup - CPU#34 stuck for 24s! pop up during testing, which messed up our bpytop view. Some googling reveals that this is printed when a CPU is stuck in kernel mode for too long, which is yet another argument that mmap is ridiculously slow in this scenario.
We also suspected that it might be caused by QEMU memory ballooning, but disabling it made very little difference.
Interestingly enough, all the processes seem to allocate memory slowly and concurrently.
The documentation for the Lisp interpreter is available here, and it can be used to construct test cases; the simplest one is (+ 2 2).
My question follows: can we do something about this bug? Are we missing something? I know that running fewer processes at a time makes it bearable (the runtime drops from 30 minutes to 5 minutes), but if it weren't for the allocation performance, the tests could easily finish within 40 seconds, which would be a huge improvement. Is mmap inherently slow on Linux when called by multiple processes concurrently?
Finally, please let me know if we should provide any further details.

Related

Using Address Sanitizer and _CrtDumpMemoryLeaks() with MSVC

I'm having an issue with compiling with both /fsanitize=address and /MDd compiler options.
#ifdef _DEBUG
#define _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>
#else
#include <stdlib.h>
#endif
#include <stdio.h>
int main(int argc, char **argv) {
_CrtSetReportMode(_CRT_WARN, _CRTDBG_MODE_FILE);
_CrtSetReportFile(_CRT_WARN, _CRTDBG_FILE_STDOUT);
_CrtSetReportMode(_CRT_ERROR, _CRTDBG_MODE_FILE);
_CrtSetReportFile(_CRT_ERROR, _CRTDBG_FILE_STDOUT);
_CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_FILE);
_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDOUT);
int *foo = malloc(sizeof(*foo) * 1024);
printf("%d\n", _CrtDumpMemoryLeaks());
return EXIT_SUCCESS;
}
With cl test.c /MDd /Zi it works as expected and reports the leak:
Detected memory leaks!
Dumping objects ->
test.c(20) : {104} normal block at 0x000001ABE8BFA130, 4096 bytes long.
Data: < > CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD
Object dump complete.
1
However, with Address Sanitizer added, cl test.c /fsanitize=address /MDd /Zi reports no leaks:
0
I had assumed, based on MSDN, that this might work.
Thank you to the commenter for pointing out that even if they did work together, you'd expect _CrtDumpMemoryLeaks() to always report a leak, because:
The AddressSanitizer runtime doesn't release memory back to the OS during execution. From the OS's point of view, it may look like there's a memory leak. This design decision is intentional, so as not to allocate all the required memory up front.
I'm still not quite sure why I'm not seeing that behavior, though: _CrtDumpMemoryLeaks() returns 0 when /fsanitize=address is enabled.
However, this clears up the issue for me; I should just create separate builds to test with either Address Sanitizer or _CrtDumpMemoryLeaks().

How to experience cache miss and hits in Linux system?

Hello, I've been trying to observe cache misses and hits on Linux.
To do so, I've written a C program where I measure the time, in CPU cycles, taken by the printf() instruction. The first part measures the time needed for a miss and the second one for a hit. Here is the program:
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>
#include <signal.h>
uint64_t rdtsc() {
uint64_t a, d;
asm volatile ("mfence");
asm volatile ("rdtsc" : "=a" (a), "=d" (d));
a = (d<<32) | a;
asm volatile ("mfence");
return a;
}
int main(int argc, char** argv)
{
size_t time = rdtsc();
printf("Hey ");
size_t delta1 = rdtsc() - time;
printf("delta: %zu\n", delta1);
size_t time2 = rdtsc();
printf("Hey ");
size_t delta2 = rdtsc() - time2;
printf("delta: %zu\n", delta2);
sleep(100);
}
Now I would like to show that two processes (two terminals) have cache in common. So I thought that running this program in two terminals would result in:
Terminal 1:
miss
hit
Terminal 2:
hit
hit
But instead I get something like:
Terminal 1:
miss
hit
Terminal 2:
miss
hit
Is my understanding incorrect? Or my program wrong?
Your assumption is somewhat correct.
printf is part of the libc library. If you use dynamic linking, the operating system may optimize memory usage by only loading the library once for all processes using it.
However, there are multiple reasons why I don't expect you to measure any sizable difference:
- Compared to the difference between a cache hit and a cache miss, printf takes an enormous amount of time to complete, and there is a lot going on that introduces noise. With just a single measurement, it is very unlikely that you can capture that tiny difference.
- The actual reason the first measurement takes longer is likely the lazy binding of the library function printf being resolved by the loader (https://maskray.me/blog/2021-09-19-all-about-procedure-linkage-table), or some other work happening on the first output (buffers being set up, etc.).
- Many libc functions are used by many different processes. If the library is shared, printf may well already be cached even though you did not use it.
I would suggest mounting a Flush+Reload attack (https://eprint.iacr.org/2013/448.pdf) on printf in one of the terminals while using it in the other terminal. Then you may see a timing difference.
Note: to find the actual address of printf for the attack, you need to be familiar with dynamic linking and the PLT. Just using something like void* addr = printf will probably not work!
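For illustration, a minimal sketch of the Flush+Reload timing primitive on x86, assuming _mm_clflush is available via x86intrin.h; a real attack would target the resolved address of printf inside the shared libc rather than a local variable:
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h> // _mm_clflush, _mm_mfence, __rdtsc

static uint64_t timed_read(volatile char *p) {
    _mm_mfence();
    uint64_t t0 = __rdtsc();
    (void)*p; // the memory access being timed
    _mm_mfence();
    return __rdtsc() - t0;
}

int main(void) {
    static char target;
    _mm_clflush((const void *)&target); // evict: the next access misses
    printf("miss: %llu cycles\n", (unsigned long long)timed_read(&target));
    printf("hit:  %llu cycles\n", (unsigned long long)timed_read(&target));
    return 0;
}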

How to get rid of the "unknown" section in perf

What I did is:
1. sudo rm -rf /root/.debug/
2. compile program with -g -O2 -fno-omit-frame-pointer
3. run the program and get the pid
4. sudo perf record -F 2000 -a -s -g -p $pid sleep 15
5. sudo perf report
Then I get a small "unknown" section, like:
- 2.50% 0.00% postgres [unknown] [k] 0000000000000000
- 0
1.12% _int_malloc ▒
0.79% _IO_vsnprintf
It looks like this is due to the libc malloc call, so I wrote a program on the same machine to test it:
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
int main(int argc, char *argv[])
{
while(1) {
printf("perf record -g -p %d -- sleep 5; perf report\n", getpid());
sleep(1);
void *p = malloc(10);
memset(p, 0, 10);
free(p);
}
return 0;
}
Then I did the same thing as above, and there is no "unknown" section.
How can I explain/fix this?
The [unknown] block in the perf report output refers to the name of the dynamic shared object (DSO). perf report could not resolve the DSO path and hence it prints [unknown]. Per the latest kernel source code tree (which is 5.3.9 at the time of writing), you can see this here.
It is important to know that the determination of the DSO symbols happen with the help of the sampled event address. The function thread__resolve is responsible for doing exactly that. In older kernels, the thread__resolve method had another name - perf_event__preprocess_sample_addr.
Given the snapshot of your output, it looks like the address of the event that was sampled during perf record, is 0. This means that the address could not be resolved at all. It is an address in the kernel space (looking at the symbol [k] 0000000000000000) and perf in your case, could not resolve it.
The comments highlight setting perf_event_paranoid to a suitable value so that you can probe both kernel and user-space events successfully. Setting perf_event_paranoid to a value that allows you to correctly probe events in the kernel space should "be a step" towards correctly resolving the address.
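For completeness, a sketch of that knob; this is just the C equivalent of sysctl -w kernel.perf_event_paranoid=-1 and must run as root:
#include <stdio.h>
int main(void) {
    // -1 is the most permissive setting, allowing events to be
    // probed in both kernel and user space; see the kernel's
    // Documentation/admin-guide/sysctl/kernel.rst
    FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "w");
    if (!f) { perror("fopen"); return 1; }
    fputs("-1\n", f);
    return fclose(f) ? 1 : 0;
}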

AVR/GNU C Compiler and static memory allocation

Update - Rephrase question:
Since I know what the bug is: how can I know, at compile time, that a static allocation fails in an embedded target?
Older:
I have this simple and easy-to-understand C code below, running on an ATmega328P-AU with 2K of SRAM. I use a well-behaved UART library (I tried many during debugging) to get debug strings in my PC terminal.
There is a bug in this code: it freezes. All I get is this output...
Hello World - Loading
I should get a '+' for every loop iteration.
Can you explain why it freezes, and why the compiler does not inform me about statically allocating more memory than the uC has?
All the info you may need is in the code.
/**************************************************************************************************
Info
**************************************************************************************************/
/*
Device: Atmega328P-AU - No arduino
IDE: Atmel Studio 6.2
Compiler: AVR/GNU C Compiler : 4.8.1
F_CPU: 8000000 Hz defined in makefile
Fuses:
Extended: 0x07
High: 0xD9
Low: 0xE2
Lockbit: 0xFF
When compiled it show in build output these:
text data bss dec hex filename
1088 0 57 1145 479 Bug Catcher.elf
Done executing task "RunCompilerTask".
Task "RunOutputFileVerifyTask"
Program Memory Usage : 1088 bytes 3,3 % Full
Data Memory Usage : 57 bytes 2,8 % Full
Done executing task "RunOutputFileVerifyTask".
Done building target "CoreBuild" in project "Bug Catcher.cproj".
Target "PostBuildEvent" skipped, due to false condition; ('$(PostBuildEvent)' != '') was evaluated as ('' != '').
Target "Build" in file "C:\Program Files\Atmel\Atmel Studio 6.2\Vs\Avr.common.targets" from project "C:\Users\Tedi\Desktop\Bug Catcher\Bug Catcher\Bug Catcher.cproj" (entry point):
Done building target "Build" in project "Bug Catcher.cproj".
Done building project "Bug Catcher.cproj".
Build succeeded.
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========
*/
/**************************************************************************************************
Definitions
**************************************************************************************************/
#define BIG_NUMBER 1000
// Atmega328P - Pin 12
#define SOFT_UART_RX_DDR DDRB
#define SOFT_UART_RX_DDR_bit DDB0
#define SOFT_UART_RX_PORT PORTB
#define SOFT_UART_RX_PORT_bit PORTB0
#define SOFT_UART_RX_PIN PINB
#define SOFT_UART_RX_PIN_bit PINB0
// Atmega328P Pin 13
#define SOFT_UART_TX_DDR DDRB
#define SOFT_UART_TX_DDR_bit DDB1
#define SOFT_UART_TX_PORT PORTB
#define SOFT_UART_TX_PORT_bit PORTB1
#define SOFT_UART_TX_PIN PINB
#define SOFT_UART_TX_PIN_bit PINB1
/**************************************************************************************************
Includes
**************************************************************************************************/
#include "softuart.h"
#include <avr/io.h>
#include <avr/interrupt.h>
#include <util/delay.h>
#include <string.h>
/**************************************************************************************************
Main function
**************************************************************************************************/
int main()
{
/**********************************************************************************************
Setup
**********************************************************************************************/
softuart_init( &SOFT_UART_TX_DDR, SOFT_UART_TX_DDR_bit,
&SOFT_UART_TX_PORT, SOFT_UART_TX_PORT_bit,
&SOFT_UART_RX_DDR, SOFT_UART_RX_DDR_bit,
&SOFT_UART_RX_PIN, SOFT_UART_RX_PIN_bit );
sei();
softuart_puts_P( "\r\n\r\nHello World - Loading\r\n\r\n" ); // Can use custom UART function.
_delay_ms( 200 );
/**********************************************************************************************
Forever loop
**********************************************************************************************/
while(1)
{
char temp[BIG_NUMBER];
memset( temp, '\0', sizeof( temp ) );
{
char temp[BIG_NUMBER];
memset( temp, '\0', sizeof( temp ) );
{
char temp[BIG_NUMBER];
memset( temp, '\0', sizeof( temp ) );
}
}
softuart_puts_P("+"); // BUG!!!!! It never reaches here.
_delay_ms( 500 );
}
}
The linker allocates the static storage, in your case 57 bytes (the data plus bss segments). So if you have a too-big variable with static storage, you should see an error message from the linker.
The variable temp[1000] is an automatic variable: it is allocated at run time on the stack, and the RAM that is not statically allocated by the linker is used for the stack. Your bug is an easy case (the three nested arrays are live at once, 3000 bytes, more than the entire RAM of the device), but normally this kind of error is really, really hard to detect, because you only see it fail when the function is actually called. One solution is to check the available stack space at runtime. As a simple rule: don't allocate big stuff on the stack.
temp[1000] is used for the entire runtime of the program, so you don't lose anything by moving it into static storage. Put a "static" in front of it and you will (hopefully) see an error message from the linker.
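A sketch of that fix applied to the question's loop (the three shadowed temp arrays are given distinct names here, since all of them are alive at once): with roughly 3000 bytes of .bss requested on a 2 KiB part, the build should now fail at link time with a data-memory overflow instead of freezing at run time.
#include <string.h>
#define BIG_NUMBER 1000

int main(void)
{
    /* ... setup as in the question ... */
    while (1)
    {
        /* static: placed in .bss and counted by the linker,
           instead of growing the stack at run time */
        static char temp1[BIG_NUMBER];
        static char temp2[BIG_NUMBER];
        static char temp3[BIG_NUMBER];
        memset(temp1, '\0', sizeof(temp1));
        memset(temp2, '\0', sizeof(temp2));
        memset(temp3, '\0', sizeof(temp3));
        /* ... */
    }
}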

Way to print out compile-time-calculated constants

I'm doing some microcontroller programming and I have code along these lines:
#define F_CPU 8000000
#define F_ADC (F_CPU / 64.0)
#define T_ADC (1.0/F_ADC)
Is there a way to print out the calculated value of, say, T_ADC at compile time? I tried stringifying it:
#define STRINGIFY(s) XSTRINGIFY(s)
#define XSTRINGIFY(s) #s
#pragma message ("T_ADC " STRINGIFY(T_ADC))
But that just gives the macro expansion "(1.0/(8000000 / 64.0))", not the actual value.
This being a micro-controller program, it's awkward to do a printf at startup time. I'm using gcc and I'm happy to use any non-standard gcc features if that helps.
As @mbratch and @freddie said, the computation is done by the compiler, so you cannot get the result simply by using preprocessor directives.
The easiest way that comes to mind right now is to assign the macro to a global const, and then read the value of the const using a debugger, or by opening the binary image of the executable (you can get the address of the constant from the memory map file).
const float temp = T_ADC;
Note that you are forced to specify a C type, and this is an essential step, since the result of the macro depends on it.
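Putting that together, a tiny self-contained sketch (t_adc_value is an illustrative name): volatile keeps the optimizer from discarding the constant, so the computed value survives into the binary where a debugger or the map file can reveal it.
#define F_CPU 8000000
#define F_ADC (F_CPU / 64.0)
#define T_ADC (1.0 / F_ADC)

/* build with -g, then e.g.: gdb -batch -ex 'print t_adc_value' ./a.out */
const volatile float t_adc_value = T_ADC;

int main(void) { return 0; }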
I implemented a baud-rate calculation in the preprocessor for a microcontroller, but tweaked the integer divide so that it rounded (truncation has more error). Then I displayed the achieved error in a series of categories (low, med, and too much), but I stopped short of printing ±X.X% * due to the extra tedious coding effort.
It was along the lines of http://99-bottles-of-beer.net/language-c-c++-preprocessor-115.html but:
- tedious to do, as the effort is proportional to the number of digits/categories required and to the number of variables (nothing can be shared)
- fairly preprocessor-specific
- devoid of any compiler checks
As I don't have the code, the exercise/tediousness is left to the reader...
* Using scaled integer based calculations
It's not exactly what you're looking for, but it'll help.
/* definition to expand macro then apply to pragma message */
#define VALUE_TO_STRING(x) #x
#define VALUE(x) VALUE_TO_STRING(x)
#define VAR_NAME_VALUE(var) #var "=" VALUE(var)
#define F_CPU 8000000
#define F_ADC (F_CPU / 64.0)
#define T_ADC (1.0/F_ADC)
#pragma message VAR_NAME_VALUE(T_ADC) /* prints note: #pragma message: T_ADC=(1.0/(8000000 / 64.0)) */
This is called Stringification.
Edit: The preprocessor only does string replacement. You could use the pragma message and then use a simple script to do the computation. Continued from my comment above:
$ gcc a.c 2> out
$ python -c "print `cat out | cut -d = -f2`"
8e-06
