Use mbind to move pages in NUMA system - c

I am trying to get my head around the numa(3) library on Linux.
I have a large array allocated (many GB of memory). Threads running on random NUMA nodes write to it, as such pages are faulted in on random NUMA memory nodes (default NUMA policy).
At the end of threaded calculation I have a single-threaded job to sum up the results. For that I first compress the array removing a lot of elements and then want to move the remainder to the NUMA node of the master thread.
The move_pages syscall is not the right one for the job because it requires an array entry for each page - too much overhead.
The documentation is unclear about whether it is possible to force numa_tonode_memory to move faulted memory.
So the only way that I see is to use mbind with MPOL_MF_MOVE, but I can't get my head around creating a proper nodemask argument (or something else is failing). Here is as far as I got:
#define _GNU_SOURCE
#include <stdlib.h>
#include <sched.h>
#include <numa.h>
#include <numaif.h>
int master_node;
nodemask_t master_nodemask;
// initializer
// has to be called from master/main thread
/*
 * Pin the calling thread to the NUMA node it is currently running on and
 * record that node in master_node / master_nodemask.
 *
 * Has to be called from the master/main thread.  If the node cannot be
 * determined, master_node is set to -1 so that
 * numa_migrate_pages_to_master_node() turns into a no-op instead of acting
 * on a stale node number with an empty mask.
 */
void numa_lock_master_thread() {
    master_node = -1;

    int curcpu = sched_getcpu();
    if (curcpu < 0) { // has never failed before
        perror("sched_getcpu");
        return;
    }

    int node = numa_node_of_cpu(curcpu);
    if (node < 0) {
        perror("numa_node_of_cpu");
        return;
    }

    /* Failing to pin is reported but not fatal: the node is still a valid
     * migration target. */
    if (numa_run_on_node(node) != 0) {
        perror("numa_run_on_node");
    }

    struct bitmask *bindmask = numa_bitmask_alloc(numa_num_possible_nodes());
    numa_bitmask_clearall(bindmask);
    numa_bitmask_setbit(bindmask, node);
    copy_bitmask_to_nodemask(bindmask, &master_nodemask);
    numa_bitmask_free(bindmask);

    /* Publish the node only once the mask is fully set up. */
    master_node = node;
}
/*
 * Bind [addr, addr + len) to the master node and ask the kernel to move
 * pages that are already faulted in elsewhere (MPOL_MF_MOVE).
 *
 * addr: start of the range to migrate.
 * len:  length of the range in bytes.
 *
 * Does nothing when no master node has been recorded (master_node < 0).
 * Failures of mbind() are reported with perror().
 */
static inline void numa_migrate_pages_to_master_node(void * addr, unsigned long len) {
    if (master_node < 0)
        return;

    /*
     * The kernel only looks at (maxnode - 1) bits of the mask, so the value
     * has to be "number of bits + 1".  Nodes 0..numa_max_node() need
     * numa_max_node() + 1 bits, hence + 2 here.  Passing numa_max_node()
     * alone drops the highest node(s) from the mask.
     */
    unsigned long maxnode = (unsigned long)numa_max_node() + 2;

    if (mbind(addr, len, MPOL_BIND, master_nodemask.n, maxnode, MPOL_MF_MOVE)) {
        perror("mbind");
    }
}
from /usr/include/numa.h:
/* Fixed-size bitmap holding NUMA_NUM_NODES bits, packed into unsigned longs
 * (presumably one bit per node number). */
typedef struct {
unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)];
} nodemask_t;
Sometimes I get mbind: Bad address, sometimes the call succeeds, but subsequent memory accesses give a SIGSEGV.
addr is always a valid pointer returned by
mmap(NULL, (num_pages) * sysconf(_SC_PAGE_SIZE), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | (flags), -1, 0);
and len is page-aligned.
How can I make this work with as few syscalls as possible and without the large overhead that comes with move_pages?
And what is the proper way to set up the nodemask argument for mbind?

Related

understand membarrier function in linux

Example of using membarrier function from linux manual: https://man7.org/linux/man-pages/man2/membarrier.2.html
#include <stdlib.h>
static volatile int a, b;
/*
 * One side of the store-then-load pair: store a = 1, execute a full fence,
 * then load b into *read_b.  The statement order is the point of the
 * example and must not be changed.
 */
static void
fast_path(int *read_b)
{
a = 1;
/* x86 full barrier; the "memory" clobber also stops compiler reordering */
asm volatile ("mfence" : : : "memory");
*read_b = b;
}
/*
 * Mirror image of fast_path(): store b = 1, execute a full fence, then
 * load a into *read_a.
 */
static void
slow_path(int *read_a)
{
b = 1;
/* x86 full barrier; the "memory" clobber also stops compiler reordering */
asm volatile ("mfence" : : : "memory");
*read_a = a;
}
/*
 * Runs slow_path() and then fast_path() on the same thread and aborts if
 * both reads observed 0; otherwise exits with EXIT_SUCCESS.
 */
int
main(int argc, char **argv)
{
int read_a, read_b;
/*
* Real applications would call fast_path() and slow_path()
* from different threads. Call those from main() to keep
* this example short.
*/
slow_path(&read_a);
fast_path(&read_b);
/*
* read_b == 0 implies read_a == 1 and
* read_a == 0 implies read_b == 1.
*/
if (read_b == 0 && read_a == 0)
abort();
exit(EXIT_SUCCESS);
}
The code above transformed to use membarrier() becomes:
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>
static volatile int a, b;
/*
 * Issue the membarrier system call through syscall(2).
 *
 * cmd, flags and cpu_id are forwarded unchanged; the syscall result is
 * returned converted to int.
 */
static int
membarrier(int cmd, unsigned int flags, int cpu_id)
{
    long rc = syscall(__NR_membarrier, cmd, flags, cpu_id);

    return rc;
}
/*
 * Check that membarrier() is supported and that it offers
 * MEMBARRIER_CMD_GLOBAL.
 *
 * Returns 0 when usable, -1 otherwise (a diagnostic is written to stderr).
 */
static int
init_membarrier(void)
{
    int status = -1;
    int commands = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);

    if (commands < 0) {
        /* The query itself failed. */
        perror("membarrier");
    } else if ((commands & MEMBARRIER_CMD_GLOBAL) == 0) {
        fprintf(stderr,
                "membarrier does not support MEMBARRIER_CMD_GLOBAL\n");
    } else {
        status = 0;
    }
    return status;
}
/*
 * Fast side: store a = 1, then load b into *read_b, separated only by a
 * compiler barrier.  No fence instruction is emitted here.
 */
static void
fast_path(int *read_b)
{
a = 1;
/* empty asm with "memory" clobber: compiler-level barrier only */
asm volatile ("" : : : "memory");
*read_b = b;
}
/*
 * Slow side: store b = 1, call membarrier(MEMBARRIER_CMD_GLOBAL), then
 * load a into *read_a.  The return value of membarrier() is not checked.
 */
static void
slow_path(int *read_a)
{
b = 1;
membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0);
*read_a = a;
}
/*
 * Verifies membarrier support first (exits with EXIT_FAILURE if missing),
 * then runs slow_path() and fast_path() on the same thread and aborts if
 * both reads observed 0.
 */
int
main(int argc, char **argv)
{
int read_a, read_b;
if (init_membarrier())
exit(EXIT_FAILURE);
/*
* Real applications would call fast_path() and slow_path()
* from different threads. Call those from main() to keep
* this example short.
*/
slow_path(&read_a);
fast_path(&read_b);
/*
* read_b == 0 implies read_a == 1 and
* read_a == 0 implies read_b == 1.
*/
if (read_b == 0 && read_a == 0)
abort();
exit(EXIT_SUCCESS);
}
This "membarrier" description is taken from the Linux manual. I am still confused about how the "membarrier" function adds overhead to the slow side and removes overhead from the fast side, thus resulting in an overall performance increase as long as the slow side is infrequent enough that the overhead of the membarrier() calls does not outweigh the performance gain on the fast side.
Could you please help me to describe it in more detail.
Thanks!
This pair of writes-then-read-the-other-var is https://preshing.com/20120515/memory-reordering-caught-in-the-act/, a demo of StoreLoad reordering (the only kind x86 allows, given its program-order + store buffer with store forwarding memory model).
With only one local MFENCE you could still get reordering:
FAST using just mfence, not membarrier
a = 1 exec
read_b = b; // 0
b = 1;
mfence (force b=1 to be visible before reading a)
read_a = a; // 0
a = 1 visible (global vis. delayed by store buffer)
But consider what would happen if an mfence-on-every-core had to be part of every possible order, between the slow-path's store and its reload.
This ordering would no longer be possible. If read_b=b has already read a 0, then a=1 is already pending (see footnote 1), if it isn't visible already. It's impossible for it to stay private until after read_a = a because membarrier() makes sure a full barrier runs on every core, and SLOW waits for that to happen (membarrier to return) before reading a.
And there's no way to get 0,0 from having SLOW execute first; it runs membarrier itself so its store is definitely visible to other threads before it reads a.
footnote 1: Waiting to execute, or waiting in the store buffer to commit to L1d cache. The asm("":::"memory") ensures that, but is actually redundant because volatile itself guarantees that the accesses happen in asm in program order. And we basically need volatile for other reasons when hand-rolling atomics instead of using C11 _Atomic. (But generally don't do that unless you're actually writing kernel code. Use atomic_store_explicit(&a, 1, memory_order_release);).
Note it's actually the store buffer that creates StoreLoad reordering (the only kind x86 allows), not so much OoO exec. In fact, a store buffer also lets x86 execute stores out-of-order and then make them globally visible in program order (if it turns out they weren't the result of mis-speculation or something!).
Also note that in-order CPUs can do their memory accesses out of order. They start instructions (including loads) in order, but can let them complete out of order, e.g. by scoreboarding loads to allow hit-under-miss. See also How is load->store reordering possible with in-order commit?

Detecting stack overflows during runtime beforehand

I have a rather huge recursive function (also, I write in C), and while I have no doubt that the scenario where stack overflow happens is extremely unlikely, it is still possible. What I wonder is whether you can detect if stack is going to get overflown within a few iterations, so you can do an emergency stop without crashing the program.
In the C programming language itself, that is not possible. In general, you can't know easily that you ran out of stack before running out. I recommend you to instead place a configurable hard limit on the recursion depth in your implementation, so you can simply abort when the depth is exceeded. You could also rewrite your algorithm to use an auxiliary data structure instead of using the stack through recursion, this gives you greater flexibility to detect an out-of-memory condition; malloc() tells you when it fails.
However, you can get something similar with a procedure like this on UNIX-like systems:
Use setrlimit to set a soft stack limit lower than the hard stack limit
Establish signal handlers for both SIGSEGV and SIGBUS to get notified of stack overflows. Some operating systems produce SIGSEGV for these, others SIGBUS.
If you get such a signal and determine that it comes from a stack overflow, raise the soft stack limit with setrlimit and set a global variable to identify that this occurred. Make the variable volatile so the optimizer doesn't foil your plans.
In your code, at each recursion step, check if this variable is set. If it is, abort.
This may not work everywhere and requires platform-specific code to find out that the signal came from a stack overflow. Not all systems (notably, early 68000 systems) can continue normal processing after getting a SIGSEGV or SIGBUS.
A similar approach was used by the Bourne shell for memory allocation.
Here's a simple solution that works for win-32. It actually resembles what Wossname already posted, but is less icky :)
/*
 * Returns the current value of the ESP register as an unsigned int.
 * Uses MSVC-style 32-bit x86 inline assembly.
 */
unsigned int get_stack_address( void )
{
unsigned int r = 0;
/* copy the stack pointer into the local r */
__asm mov dword ptr [r], esp;
return r;
}
/*
 * Recurses until the stack pointer has moved more than 100 000 bytes away
 * from begin_address, then unwinds.
 *
 * x:             current recursion level.
 * begin_address: stack address sampled before the first call.
 */
void rec( int x, const unsigned int begin_address )
{
    const unsigned int used = begin_address - get_stack_address();

    // budget: 100 000 bytes of stack
    if ( used <= 100000 )
    {
        rec( x + 1, begin_address );
    }
    // otherwise: recursion level x, stack too high -- stop descending
}
/* Starts the recursion at level 0, using the current stack address as the
 * reference point. */
int main( void )
{
    rec( 0, get_stack_address() );
}
Here's a naive method, but it's a bit icky...
When you enter the function for the first time you could store the address of one of your variables declared in that function. Store that value outside your function (e.g. in a global). In subsequent calls compare the current address of that variable with the cached copy. The deeper you recurse the further apart these two values will be.
This will most likely cause compiler warnings (storing addresses of temporary variables) but it does have the benefit of giving you a fairly accurate way of knowing exactly how much stack you're using.
Can't say I really recommend this but it would work.
#include <stdio.h>
/* Address of `marker` in the outermost active recurse() call; NULL when no
 * measurement is in progress. */
char* start = NULL;

/*
 * Recurses until the local `marker` is at least 1000 bytes away from the
 * address recorded on entry to the outermost call, printing the distance
 * at every level.  Resets `start` once the limit is reached.
 *
 * The distance is computed as a long and negated by hand when negative, so
 * the block needs no <stdlib.h> and does not squeeze a pointer difference
 * through int abs().
 */
void recurse()
{
    char marker = '#';
    if (start == NULL)
        start = &marker;

    /* Sign depends on the direction the stack grows in. */
    long depth = (long)(&marker - start);
    if (depth < 0)
        depth = -depth;

    printf("depth: %ld\n", depth);
    if (depth < 1000)
        recurse();
    else
        start = NULL;
}
/* Runs one stack-depth measurement and exits successfully. */
int main()
{
recurse();
return 0;
}
An alternative method is to learn the stack limit at the start of the program, and each time in your recursive function to check whether this limit has been approached (within some safety margin, say 64 kb). If so, abort; if not, continue.
The stack limit on POSIX systems can be learned by using getrlimit system call.
Example code that is thread-safe: (note: this code assumes that the stack grows backwards, i.e. towards lower addresses, as on x86!)
#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>
void *stack_limit;
#define SAFETY_MARGIN (64 * 1024) // 64 kb
/*
 * Recurses until the address of a local variable is at or below
 * stack_limit, then reports the level that was reached.
 *
 * level: depth of the current call (0 for the first one).
 */
void recurse(int level)
{
    /* The address of a local stands in for the current stack position. */
    void *here = &here;

    if (here > stack_limit) {
        recurse(level + 1);
        return;
    }
    printf("stack limit reached at recursion level %d\n", level);
}
/*
 * Returns the soft RLIMIT_STACK limit in bytes.
 *
 * Falls back to 8 MB when the limit cannot be queried, is unlimited
 * (RLIM_INFINITY), or does not fit into a positive int -- without this
 * the conversion to int would hand the caller a negative or truncated
 * size.  The effective value is printed whenever getrlimit() succeeds.
 */
int get_max_stack_size(void)
{
    const int fallback = 1024 * 1024 * 8; // 8 MB is the default on many platforms
    struct rlimit rl;

    if (getrlimit(RLIMIT_STACK, &rl) != 0) {
        return fallback;
    }

    int size = fallback;
    if (rl.rlim_cur != RLIM_INFINITY) {
        int narrowed = (int)rl.rlim_cur;
        /* Accept the limit only if it survives the round trip through int. */
        if (narrowed > 0 && (rlim_t)narrowed == rl.rlim_cur) {
            size = narrowed;
        }
    }

    printf("max stack size: %d\n", size);
    return size;
}
/*
 * Derives stack_limit from the address of a local in main(), the maximum
 * stack size and SAFETY_MARGIN, then starts the recursion at level 0.
 */
int main (int argc, char *argv[])
{
    int anchor;
    char *base = (char *)&anchor;

    stack_limit = base - get_max_stack_size() + SAFETY_MARGIN;
    recurse(0);
    return 0;
}
Output:
max stack size: 8388608
stack limit reached at recursion level 174549

C - shared memory - dynamic array inside shared struct

i'm trying to share a struct like this
example:
/* Structure the question wants to share between processes. */
typedef struct {
int* a; /* pointer member; the question allocates its target with malloc */
int b;
int c;
} ex;
between processes, the problem is that when I initialize 'a' with a malloc, it becomes private to the heap of the process that do this(or at least i think this is what happens). Is there any way to create a shared memory (with shmget, shmat) with this struct that works?
EDIT: I'm working on Linux.
EDIT: I have a process that initialize the buffer like this:
/* Creating side: obtain a segment of sizeof(ex) bytes and attach it. */
key_t key = ftok("gr", 'p');
int mid = shmget(key, sizeof(ex), IPC_CREAT | 0666);
ex* e = NULL;
status b_status = init(&e, 8); //init gives initial values to b c and allocate space for 'a' with a malloc
/* NOTE(review): e is overwritten by the shmat() result below, so whatever
 * init() stored through &e is no longer reachable via e -- confirm intent. */
e = (ex*)shmat(mid, NULL, 0);
the other process attaches himself to the shared memory like this:
/* Consuming side: look up the same key and attach the existing segment. */
key_t key = ftok("gr", 'p');
int shmid = shmget(key, sizeof(ex), 0);
ex* e;
e = (ex*)shmat(shmid, NULL, 0);
and later get an element from a, in this case that in position 1
int i = get_el(e, 1);
First of all, to share the content pointed by your int *a field, you will need to copy the whole memory related to it. Thus, you will need a shared memory that can hold at least size_t shm_size = sizeof(struct ex) + get_the_length_of_your_ex();.
From now on, since you mentioned shmget and shmat, I will assume you run a Linux system.
The first step is the shared memory segment creation. It would be a good thing if you can determine an upper bound to the size of the int *a content. This way you would not have to create/delete the shared memory segment over and over again. But if you do so, an extra overhead to state how long is the actual data will be needed. I will assume that a simple size_t will do the trick for this purpose.
Then, after you created your segment, you must set the data correctly to make it hold what you want. Notice that while the physical address of the memory segment is always the same, when calling shmat you will get virtual pointers, which are only usable in the process that called shmat. The example code below should give you some tricks to do so.
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
/* Assume a cannot point towards an area larger than 4096 bytes. */
#define A_MAX_SIZE ((size_t)4096)

/*
 * Structure to be shared.  Only b and c are meaningful inside the shared
 * segment; a is a process-local pointer and must be re-pointed after
 * attaching.
 */
struct ex {
    int *a;
    int b;
    int c;
};
/*
 * Create a new shared memory segment large enough for a struct ex, a
 * size_t length field and up to A_MAX_SIZE bytes of array data.
 *
 * Returns the segment id, or a negative value when the key cannot be
 * derived or the segment cannot be created (IPC_EXCL: it must not exist
 * yet).
 */
int shm_create(void)
{
    /*
     * If you need to share other structures,
     * You'll need to pass the key_t as an argument
     */
    /* ftok() takes a path AND a project id; every process must use the
     * same pair. */
    key_t k = ftok("/a/path/of/yours", 'E');
    if ((key_t)-1 == k) {
        /* Path missing or inaccessible. */
        return -1;
    }

    int shm_id = shmget(k,
                        sizeof(struct ex) + A_MAX_SIZE + sizeof(size_t),
                        IPC_CREAT|IPC_EXCL|0666);
    if (0 > shm_id) {
        /* An error occurred, add desired error handling. */
    }
    return shm_id;
}
/*
* Fill the desired shared memory segment with the structure
*/
int shm_fill(int shmid, struct ex *p_ex)
{
void *p = shmat(shmid, NULL, 0);
void *tmp = p;
size_t data_len = get_my_ex_struct_data_len(p_ex);
if ((void*)(-1) == p) {
/* Add desired error handling */
return -1;
}
memcpy(tmp, p_ex, sizeof(struct ex));
tmp += sizeof(struct ex);
memcpy(tmp, &data_len, sizeof(size_t);
tmp += 4;
memcpy(tmp, p_ex->a, data_len);
shmdt(p);
/*
* If you want to keep the reference so that
* When modifying p_ex anywhere, you update the shm content at the same time :
* - Don't call shmdt()
* - Make p_ex->a point towards the good area :
* p_ex->a = p + sizeof(struct ex) + sizeof(size_t);
* Never ever modify a without detaching the shm ...
*/
return 0;
}
/* Get the ex structure from a shm segment */
int shm_get_ex(int shmid, struct ex *p_dst)
{
void *p = shmat(shmid, NULL, SHM_RDONLY);
void *tmp;
size_t data_len = 0;
if ((void*)(-1) == p) {
/* Error ... */
return -1;
}
data_len = *(size_t*)(p + sizeof(struct ex))
if (NULL == (tmp = malloc(data_len))) {
/* No memory ... */
shmdt(p);
return -1;
}
memcpy(p_dst, p, sizeof(struct ex));
memcpy(tmp, (p + sizeof(struct ex) + sizeof(size_t)), data_len);
p_dst->a = tmp;
/*
* If you want to modify "globally" the structure,
* - Change SHM_RDONLY to 0 in the shmat() call
* - Make p_dst->a point to the good offset :
* p_dst->a = p + sizeof(struct ex) + sizeof(size_t);
* - Remove from the code above all the things made with tmp (malloc ...)
*/
return 0;
}
/*
* Detach the given p_ex structure from a shm segment.
* This function is useful only if you use the shm segment
* in the way I described in comment in the other functions.
*/
/*
 * Detach the segment that p_ex->a points into and clear the pointer.
 * Only meaningful when p_ex->a was set to
 * segment base + sizeof(struct ex) + sizeof(size_t).
 */
void shm_detach_struct(struct ex *p_ex)
{
    /*
     * Here you could :
     * - alloc a local pointer
     * - copy the shm data into it
     * - detach the segment using the current p_ex->a pointer
     * - assign your local pointer to p_ex->a
     * This would save locally the data stored in the shm at the call
     * Or if you're lazy (like me), just detach the pointer and make p_ex->a = NULL;
     */
    if (NULL == p_ex || NULL == p_ex->a) {
        return;
    }
    /* Step back in bytes: subtracting from an int * would be scaled by
     * sizeof(int) and miss the segment base. */
    shmdt((char *)p_ex->a - sizeof(struct ex) - sizeof(size_t));
    p_ex->a = NULL;
}
Excuse my laziness, it would be space-optimized to not copy at all the value of the int *a pointer of the struct ex since it is completely unused in the shared memory, but I spared myself extra-code to handle this (and some pointer checkings like the p_ex arguments integrity).
But when you are done, you must find a way to share the shm ID between your processes. This could be done using sockets, pipes ... Or using ftok with the same input.
The memory you allocate to a pointer using malloc() is private to that process. So, when you try to access the pointer in another process(other than the process which malloced it) you are likely going to access an invalid memory page or a memory page mapped in another process address space. So, you are likely to get a segfault.
If you are using the shared memory, you must make sure all the data you want to expose to other processes is "in" the shared memory segment and not private memory segments of the process.
You could try, leaving the data at a specified offset in the memory segment, which can be concretely defined at compile time or placed in a field at some known location in the shared memory segment.
Eg:
If you are doing this
/* Attach the segment and address everything relative to its base. */
char *mem = shmat(shmid2, (void*)0, 0);
// So, the mystruct type is at offset 0.
mystruct *structptr = (mystruct*)mem;
// Now we have a structptr, use an offset to get some other_type.
/* The offset is read from a field stored inside the segment itself. */
other_type *other = (other_type*)(mem + structptr->offset_of_other_type);
Other way would be to have a fixed size buffer to pass the information using the shared memory approach, instead of using the dynamically allocated pointer.
Hope this helps.
Are you working in Windows or Linux?
In any case what you need is a memory mapped file. Documentation with code examples here,
http://msdn.microsoft.com/en-us/library/aa366551%28VS.85%29.aspx
http://menehune.opt.wfu.edu/Kokua/More_SGI/007-2478-008/sgi_html/ch03.html
You need to use shared memory/memory mapped files/whatever your OS gives you.
In general, IPC and sharing memory between processes is quite OS dependent, especially in low-level languages like C (higher-level languages usually have libraries for that - for example, even C++ has support for it using boost).
If you are on Linux, I usually use shmat for small amount, and mmap (http://en.wikipedia.org/wiki/Mmap) for larger amounts.
On Win32, there are many approaches; the one I prefer is usually using page-file backed memory mapped files (http://msdn.microsoft.com/en-us/library/ms810613.aspx)
Also, you need to pay attention to where you are using these mechanism inside your data structures: as mentioned in the comments, without using precautions the pointer you have in your "source" process is invalid in the "target" process, and needs to be replaced/adjusted (IIRC, pointers coming from mmap are already OK(mapped); at least, under windows pointers you get out of MapViewOfFile are OK).
EDIT: from your edited example:
What you do here:
e = (ex*)shmat(mid, NULL, 0);
(other process)
int shmid = shmget(key, sizeof(ex), 0);
ex* e = (ex*)shmat(shmid, NULL, 0);
is correct, but you need to do it for each pointer you have, not only for the "main" pointer to the struct. E.g. you need to do:
e->a = (int*)shmat(shmget(another_key, dim_of_a, IPC_CREAT | 0666), NULL, 0);
instead of creating the array with malloc.
Then, on the other process, you also need to do shmget/shmat for the pointer.
This is why, in the comments, I said that I usually prefer to pack the structs: so I do not need to go through the hassle of doing these operations for every pointer.
Convert the struct:
/* Packed variant: the array is stored inline as the last member. */
typedef struct {
int b;
int c;
int a[]; /* flexible array member; its storage is added to the allocation size */
} ex;
and then on parent process:
/* IPC_CREAT is required on the side that creates the segment. */
int mid = shmget(key, sizeof(ex) + arraysize*sizeof(int), IPC_CREAT | 0666);
it should work.
In general, it is difficult to work with dynamic arrays inside structs in c, but in this way you are able to allocate the proper memory (this will also work in malloc: How to include a dynamic array INSIDE a struct in C?)

Run-time mocking in C?

This has been pending for a long time in my list now. In brief - I need to run mocked_dummy() in the place of dummy() ON RUN-TIME, without modifying factorial(). I do not care on the entry point of the software. I can add up any number of additional functions (but cannot modify code within /*---- do not modify ----*/).
Why do I need this?
To do unit tests of some legacy C modules. I know there are a lot of tools available around, but if run-time mocking is possible I can change my UT approach (add reusable components) make my life easier :).
Platform / Environment?
Linux, ARM, gcc.
Approach that I'm trying with?
I know GDB uses trap/illegal instructions for adding up breakpoints (gdb internals).
Make the code self modifiable.
Replace dummy() code segment with illegal instruction, and return as immediate next instruction.
Control transfers to trap handler.
Trap handler is a reusable function that reads from a unix domain socket.
Address of mocked_dummy() function is passed (read from map file).
Mock function executes.
There are problems going ahead from here. I also found the approach is tedious and requires good amount of coding, some in assembly too.
I also found that, under gcc, each function call can be hooked / instrumented, but again this is not very useful, since the function that is intended to be mocked will get executed anyway.
Is there any other approach that I could use?
#include <stdio.h>
#include <stdlib.h>
/* Stand-in that should run instead of dummy(); prints its own name. */
void mocked_dummy(void)
{
    const char *self = __func__;

    printf("__%s__()\n", self);
}
/*---- do not modify ----*/
/* Function the question wants replaced at run time; prints its own name. */
void dummy(void)
{
printf("__%s__()\n",__func__);
}
/*
 * Prints its own name, multiplies together num, num-1, ..., 2 (result is 1
 * for num <= 1), calls dummy() and returns the product.
 */
int factorial(int num)
{
int fact = 1;
printf("__%s__()\n",__func__);
while (num > 1)
{
fact *= num;
num--;
}
/* the call the question wants redirected to mocked_dummy() */
dummy();
return fact;
}
/*---- do not modify ----*/
/*
 * Expects the address of a function taking and returning int as the first
 * command-line argument (decimal, or hex with a 0x prefix), calls it with
 * 5, then calls factorial(5) directly.
 *
 * Returns 1, like the original snippet, also when the argument is missing
 * or malformed.
 */
int main(int argc, char * argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: <program> <address-of-function>\n");
        return 1;
    }

    /* atoi() cannot report errors and overflows on addresses above
     * INT_MAX; parse as unsigned long instead. */
    char *end = NULL;
    unsigned long addr = strtoul(argv[1], &end, 0);
    if (end == argv[1] || *end != '\0') {
        fprintf(stderr, "invalid address: %s\n", argv[1]);
        return 1;
    }

    /* Explicit integer-to-function-pointer cast; the caller is responsible
     * for passing a valid address. */
    int (*fp)(int) = (int (*)(int))addr;
    printf("fp = %lx\n", addr);
    printf("factorial of 5 is = %d\n",fp(5));
    printf("factorial of 5 is = %d\n",factorial(5));
    return 1;
}
test-dept is a relatively recent C unit testing framework that allows you to do runtime stubbing of functions. I found it very easy to use - here's an example from their docs:
/* Example test: swaps malloc for always_failing_malloc, then checks the
 * string that stringify('h') returns. */
void test_stringify_cannot_malloc_returns_sane_result() {
replace_function(&malloc, &always_failing_malloc);
char *h = stringify('h');
assert_string_equals("cannot_stringify", h);
}
Although the downloads section is a little out of date, it seems fairly actively developed - the author fixed an issue I had very promptly. You can get the latest version (which I've been using without issues) with:
svn checkout http://test-dept.googlecode.com/svn/trunk/ test-dept-read-only
the version there was last updated in Oct 2011.
However, since the stubbing is achieved using assembler, it may need some effort to get it to support ARM.
This is a question I've been trying to answer myself. I also have the requirement that I want the mocking method/tools to be done in the same language as my application. Unfortunately this cannot be done in C in a portable way, so I've resorted to what you might call a trampoline or detour. This falls under the "Make the code self modifiable." approach you mentioned above. This is where we change the actual bytes of a function at runtime to jump to our mock function.
#include <stdio.h>
#include <stdlib.h>
// Additional headers
#include <stdint.h> // for uint32_t
#include <sys/mman.h> // for mprotect
#include <errno.h> // for errno
/* Jump target installed by set_run_mock(); prints its own name. */
void mocked_dummy(void)
{
    const char *self = __func__;

    printf("__%s__()\n", self);
}
/*---- do not modify ----*/
/* Prints its own name.  set_run_mock() overwrites the first five bytes of
 * this function's machine code at run time. */
void dummy(void)
{
printf("__%s__()\n",__func__);
}
/*
 * Prints its own name, multiplies together num, num-1, ..., 2 (result is 1
 * for num <= 1), calls dummy() and returns the product.
 */
int factorial(int num)
{
int fact = 1;
printf("__%s__()\n",__func__);
while (num > 1)
{
fact *= num;
num--;
}
dummy();
return fact;
}
/*---- do not modify ----*/
typedef void (*dummy_fun)(void);

/*
 * Redirect dummy() to mocked_dummy() at run time by overwriting the first
 * five bytes of dummy() with an x86 "JMP rel32" (opcode 0xE9 followed by a
 * little-endian 32-bit displacement).
 *
 * x86-only.  Assumes 4 KiB pages and that both functions lie within 2 GiB
 * of each other.  Exits the process if the code page cannot be made
 * writable.
 */
void set_run_mock()
{
    enum { PAGE_BYTES = 4096, JMP_LEN = 5 };

    dummy_fun run_ptr = dummy;
    dummy_fun mock_ptr = mocked_dummy;
    unsigned char *ptr = (unsigned char *)run_ptr;
    unsigned char *target = (unsigned char *)mock_ptr;
    unsigned char *pg = ptr - ((size_t)ptr % PAGE_BYTES);

    /*
     * The displacement is relative to the instruction after the jump.
     * Unsigned wrap-around yields the two's-complement encoding for
     * backward jumps, so no separate branch per direction is needed.
     */
    uint32_t off = (uint32_t)((size_t)target - (size_t)ptr - JMP_LEN);

    /* Cover everything up to the last patched byte, so a patch that
     * straddles a page boundary makes both pages writable. */
    if (mprotect(pg, (size_t)(ptr - pg) + JMP_LEN,
                 PROT_READ | PROT_WRITE | PROT_EXEC)) {
        int saved_errno = errno; /* perror() may change errno */
        perror("Couldn't mprotect");
        exit(saved_errno);
    }

    ptr[0] = 0xE9; //x86 JMP rel32
    ptr[1] = off & 0x000000FF;
    ptr[2] = (off & 0x0000FF00) >> 8;
    ptr[3] = (off & 0x00FF0000) >> 16;
    ptr[4] = (off & 0xFF000000) >> 24;
}
/*
 * Calls factorial(5) once unpatched, installs the jump with
 * set_run_mock(), then calls factorial(5) again.  The order of the three
 * calls is what demonstrates the redirection.
 */
int main(int argc, char * argv[])
{
// Run for realz
factorial(5);
// Set jmp
set_run_mock();
// Run the mock dummy
factorial(5);
return 0;
}
Portability explanation...
mprotect() - This changes the memory page access permissions so that we can actually write to memory that holds the function code. This isn't very portable, and in a WINAPI env, you may need to use VirtualProtect() instead.
The memory parameter for mprotect is aligned to the previous 4k page, this also can change from system to system, 4k is appropriate for vanilla linux kernel.
The method that we use to jmp to the mock function is to actually put down our own opcodes, this is probably the biggest issue with portability because the opcode I've used will only work on a little endian x86 (most desktops). So this would need to be updated for each arch you plan to run on (which could be semi-easy to deal with in CPP macros.)
The function itself has to be at least five bytes. This is usually the case because every function normally has at least 5 bytes in its prologue and epilogue.
Potential Improvements...
The set_run_mock() call could easily be set up to accept parameters for reuse. Also, you could save the five overwritten bytes from the original function to restore later in the code if you desire.
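I'm unable to test, but I've read that in ARM... you'd do something similar with the branch (B) instruction. Note that it is also PC-relative rather than absolute: for an unconditional branch the top byte of the 32-bit instruction word is 0xEA and the remaining 24 bits hold a signed word offset (relative to the address of the branch plus 8). On a little-endian system the 0xEA byte is the last of the four bytes in memory.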
Chenz
An approach that I have used in the past that has worked well is the following.
For each C module, publish an 'interface' that other modules can use. These interfaces are structs that contain function pointers.
/* Interface published by a module: a table of function pointers that
 * tests can repoint at mock implementations. */
struct Module1
{
    int (*getTemperature)(void);
    int (*setKp)(int Kp);
};
During initialization, each module initializes these function pointers with its implementation functions.
When you write the module tests, you can dynamically changes these function pointers to its mock implementations and after testing, restore the original implementation.
Example:
/* Mock implementation that can be assigned to the dummy pointer; prints
 * its own name. */
void mocked_dummy(void)
{
    const char *self = __func__;

    printf("__%s__()\n", self);
}
/*---- do not modify ----*/
/* Real implementation; prints its own name. */
void dummyFn(void)
{
printf("__%s__()\n",__func__);
}
/* Indirection through which factorial() calls; initially the real
 * implementation. */
static void (*dummy)(void) = dummyFn;
/*
 * Prints its own name, computes the product num * (num-1) * ... * 2
 * (1 for num <= 1), invokes whatever the dummy pointer currently refers
 * to, and returns the product.
 */
int factorial(int num)
{
    int product = 1;

    printf("__%s__()\n",__func__);
    for (; num > 1; num--)
    {
        product *= num;
    }
    dummy();
    return product;
}
/*---- do not modify ----*/
/*
 * Runs factorial(5) with the original dummy, swaps in mocked_dummy, runs
 * it again and restores the original.  Returns 1, as the original snippet
 * did.
 */
int main(int argc, char * argv[])
{
    /* Must be a function pointer: "void (*oldDummy)" would declare a
     * plain void *. */
    void (*oldDummy)(void) = dummy; /* save the old dummy */

    /* with the original dummy function */
    printf("factorial of 5 is = %d\n",factorial(5));

    /* with the mocked dummy */
    dummy = mocked_dummy; /* put in the mocked dummy */
    printf("factorial of 5 is = %d\n",factorial(5));
    dummy = oldDummy; /* restore the old dummy */
    return 1;
}
You can replace every function by the use of LD_PRELOAD. You have to create a shared library, which gets loaded by LD_PRELOAD. This is a standard function used to turn programs without support for SOCKS into SOCKS aware programs. Here is a tutorial which explains it.

C branch on static variable optimization

Let me preface this by saying I haven't profiled this code, nor is it a critical path. This is mostly for my own curiosity.
I have a function that declares/defines a static int to a known error value that will cause the code to take a branch. However, if the function succeeds, I know with certainty that the branch will never be taken again. Is there a compile time optimization for this? Specifically GNU/gcc/glibc?
So I have this:
/*
 * Opens "file" on first use and keeps the descriptor in a function-local
 * static for later calls.
 *
 * Returns NULL when the file cannot be opened.  The register lookup that
 * would use fd and addr is not part of this snippet, so the success path
 * returns a NULL placeholder as well.
 */
static unsigned long volatile *getReg(unsigned long addr){
    static int fd = -1;
    if (fd < 0){
        /* Parenthesized assignment: '<' binds tighter than '=', so without
         * the parentheses fd would receive the result of the comparison. */
        if ((fd = open("file", O_RDWR | O_SYNC)) < 0){
            return NULL;
        }
    }
    (void)addr;
    return NULL; /* placeholder: the original snippet fell off the end here */
}
So once the function completes successfully (if this function returns null, I exit the program), I know that fd will for all future calls be valid and will never take the first branch. I know there's the __builtin_expect() macro, so I could write
if (__builtin_expect((fd<0),0)){
But from what I understand that's only a HINT to the compiler, and it still has to perform the condition check. And I also realize it will in 99.9999% of the cases be more than enough so that any further performance increase is negligible.
I was wondering if there was a way of preventing even the first condition check (the fd <0 ) after the very first time it gets run.
The short answer is "no".
I mean, sure, you could maybe play tricks with pointers to functions, monkey-patching your code, etc., but that would almost certainly be slower than just doing the test.
Branches are only expensive when they are mis-predicted. __builtin_expect will arrange to ensure that this branch is only mis-predicted the first time.
You are talking about literally one or two cycles here, and possibly not even that, depending on what else the CPU is doing near this code.
[update]
If something like this really is being called millions or billions of times per second, you would deal with it by restructuring your code to initialize fd early and then use it repeatedly without bothering to test. For example, you might add an initGlobalState(); call near the top of main() and open the file then. (You would want a corresponding destroyGlobalState(); to close it again.)
And of course, a file descriptor is a horrible example, because anything you are doing to it will take vastly more than one or two cycles anyway.
In C++, constructors, destructors, and the RAII idiom makes this sort of approach very natural, by the way.
Split the function in two, in their own source file ... and let the caller worry about it :)
/* Descriptor shared by getReg() and getRegSetup(); assigned by
 * getRegSetup(). */
static int fd;
/* Placeholder for the per-call work; performs no check on fd and
 * currently returns 0. */
unsigned long volatile *getReg(unsigned long addr) {
/* do stuff with fd and addr */
return 0;
}
/*
 * One-time setup: opens "file" and stores the descriptor in fd.
 * Returns 0 on success, 1 if the open failed.
 */
int getRegSetup(void) {
    int handle = open("file", O_RDWR | O_SYNC);

    fd = handle;
    if (handle >= 0) {
        /* continue processing */
        return 0; /* ok */
    }
    return 1; /* error */
}
The caller then does
/* ... */
/* Setup is performed once; getReg() is only reached after it succeeded. */
if (getRegSetup()) {
/* error */
} else {
/* keep calling getReg(42) for as long as it returns non-NULL */
do {
ptr = getReg(42);
} while (ptr);
}
/* ... */
Well one of the ways to fix this would be to use a function pointer to call the method. Initialize the function ptr to your long function and at the end of the first call set it to the version without additional initialization.
That said, it sounds like an absolute maintenance nightmare and is surely not worth to avoid one branch - but you get rid of the branch.. (and certainly get rid of any chance that the function is inlined which depending on how long the function is will be almost certainly detrimental)
__builtin_expect is only a hint. It helps compiler to generate better code. For example, re-arrange jump labels so that mainline code is continually aligned in memory, which makes it more friendly for code cache lines, easier to fetch from main memory etc. Running profile guided optimization is even better.
I don't see any locking in your code, so I assume this function is not supposed to be called from multiple threads at the same time. In this case you have to move fd out of the function scope, so that double checked locking is not applied. Then, re-arrange the code a bit (that's what GCC supposed to do with branch hints, but you know...). Plus, you can copy a file descriptor from main memory / cache line into a register if you access it often. The code will look something like this:
/* Cached descriptor for "file"; -1 until the first successful open. */
static int g_fd = -1;

/*
 * Opens "file" on first use, caches the descriptor in g_fd and then runs
 * the (elided) main work.
 *
 * Returns NULL if the file cannot be opened; the success path is a
 * placeholder that also returns NULL.
 */
static unsigned long volatile *getReg(unsigned long addr)
{
    int fd = g_fd;

    /* 0 is a valid descriptor, so "not opened yet" is fd < 0, not
     * fd <= 0. */
    if (__builtin_expect ((fd < 0), 0))
    {
        fd = open("file", O_RDWR | O_SYNC);
        if (__builtin_expect ((fd < 0), 0))
        {
            return NULL;
        }
        g_fd = fd;
    }

    (void)addr;
    return NULL; // Do important stuff here.
}
But please don't take this seriously. System calls and file I/O are so bad so optimizing stuff like this doesn't make any sense (with some exceptions).
And if you really want to call it once, then you better off moving file open into a separate function that is called once, and before everything else. And yes, take a look at GCC`s profile feedback and LTO. That will help you achieve good results without spending too much time on stuff like this.
For anyone curious, this is what I came up with. Note that this is a module to a larger, long running program. Also, that it hasn't been reviewed, and is basically a bad hack anyway.
/*
 * Returns a pointer to addr inside a MAP_SIZE window of /dev/mem.
 *
 * The descriptor and the current mapping are cached in function-local
 * statics; the window is remapped only when addr falls into a different
 * page than the previous call.  Failure to open or map is reported with
 * longjmp(mem_err, errno), so a matching setjmp(mem_err) must be active.
 */
__attribute__((noinline)) static unsigned int volatile *get_mem(unsigned int addr) {
    static void *map = 0 ;
    static unsigned prevPage = -1U ;
    static int fd = -1;
    unsigned page = addr & ~MAP_MASK ;

    if ( unlikely(fd < 0) ) {
        if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) < 0) {
            longjmp(mem_err, errno);
        }
    }
    if ( page != prevPage ) {
        if ( map ) {
            /* Best effort: a failed munmap is ignored, as in the original. */
            (void)munmap(map, MAP_SIZE);
            /* Forget the old window so a failed remap below cannot leave a
             * dangling pointer behind for the next call. */
            map = 0;
            prevPage = -1U;
        }
        /* Map into a temporary: MAP_FAILED must never end up in the
         * cached pointer. */
        void *fresh = mmap(0, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, page );
        if (unlikely(fresh == MAP_FAILED)) longjmp(mem_err, errno);
        map = fresh;
        prevPage = page ;
    }
    return (unsigned int volatile *)((char *)map+(addr & MAP_MASK));
}
/*
 * Read-modify-write of one register field: clears the bits selected by
 * r->mask << r->shift and ORs in val << r->shift.
 */
static void set_reg(const struct reg_info * const r, unsigned int val)
{
    unsigned int volatile * const reg = get_mem(r->addr);
    unsigned int current = *reg; /* single volatile read */

    *reg = (current & (~(r->mask << r->shift))) | (val << r->shift);
}
// This isn't in the final piece. There are several entry points into this module. Just an example
/*
 * Example entry point: finds the register whose name equals `name` and
 * writes value to it.
 *
 * Returns value on success, -1 if a memory error was signalled through
 * mem_err or if no register matches.
 * NOTE(review): i, n, name and regs are not declared in this snippet;
 * presumably they are file-scope -- confirm.
 */
static int entryPoint(unsigned int value){
    if (setjmp(mem_err)!=0) {
        // Serious error
        return -1;
    }
    for (i=0; i<n; i++) {
        /* Same result as comparing both lengths and then strncmp over
         * strlen(name). */
        if (strcmp(regs[i].name, name) == 0) {
            set_reg(&regs[i], value);
            return value;
        }
    }
    /* No register matched: return a defined value instead of falling off
     * the end. */
    return -1;
}
This obviously isn't an answer to the question, since it checks the condition on every call.

Resources