achieve GCC cas function for version 4.1.2 and earlier - c

My new company project, they want the code run for the 32-bit, the compile server is a CentOS 5.0 with GCC 4.1.1, that was the nightmare.
There are lots of functions using in the project like __sync_fetch_and_add was given in GCC 4.1.2 and later.
I was told can not upgrade GCC version, so I have to make another solution after Googling for several hours.
When I wrote a demo to test, I just got the wrong answer, the code blow want to replace function __sync_fetch_and_add
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static int count = 0;
int compare_and_swap(int* reg, int oldval, int newval)
{
register char result;
#ifdef __i386__
__asm__ volatile ("lock; cmpxchgl %3, %0; setz %1"
: "=m"(*reg), "=q" (result)
: "m" (*reg), "r" (newval), "a" (oldval)
: "memory");
return result;
#elif defined(__x86_64__)
__asm__ volatile ("lock; cmpxchgq %3, %0; setz %1"
: "=m"(*reg), "=q" (result)
: "m" (*reg), "r" (newval), "a" (oldval)
: "memory");
return result;
#else
#error:architecture not supported and gcc too old
#endif
}
void *test_func(void *arg)
{
int i = 0;
for(i = 0; i < 2000; ++i) {
compare_and_swap((int *)&count, count, count + 1);
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t id[10];
int i = 0;
for(i = 0; i < 10; ++i){
pthread_create(&id[i], NULL, test_func, NULL);
}
for(i = 0; i < 10; ++i) {
pthread_join(id[i], NULL);
}
//10*2000=20000
printf("%d\n", count);
return 0;
}
Whent I got the wrong result:
[root#centos-linux-7 workspace]# ./asm
17123
[root#centos-linux-7 workspace]# ./asm
14670
[root#centos-linux-7 workspace]# ./asm
14604
[root#centos-linux-7 workspace]# ./asm
13837
[root#centos-linux-7 workspace]# ./asm
14043
[root#centos-linux-7 workspace]# ./asm
16160
[root#centos-linux-7 workspace]# ./asm
15271
[root#centos-linux-7 workspace]# ./asm
15280
[root#centos-linux-7 workspace]# ./asm
15465
[root#centos-linux-7 workspace]# ./asm
16673
I realize in this line
compare_and_swap((int *)&count, count, count + 1);
count + 1 was wrong!
Then how can I implement the same function as __sync_fetch_and_add. The compare_and_swap function works when the third parameter is constant.
By the way, compare_and_swap function is that right? I just Googled for that, not familiar with assembly.
I got despair with this question.
………………………………………………………………………………………………………………………………………………………………………………………………………………………
after seeing the answer below,I use while and got the right answer,but seems confuse more.
here is the code:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static unsigned long count = 0;
int sync_add_and_fetch(int* reg, int oldval, int incre)
{
register char result;
#ifdef __i386__
__asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (oldval + incre), "a" (oldval) : "memory");
return result;
#elif defined(__x86_64__)
__asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (newval + incre), "a" (oldval) : "memory");
return result;
#else
#error:architecture not supported and gcc too old
#endif
}
void *test_func(void *arg)
{
int i=0;
int result = 0;
for(i=0;i<2000;++i)
{
result = 0;
while(0 == result)
{
result = sync_add_and_fetch((int *)&count, count, 1);
}
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t id[10];
int i = 0;
for(i=0;i<10;++i){
pthread_create(&id[i],NULL,test_func,NULL);
}
for(i=0;i<10;++i){
pthread_join(id[i],NULL);
}
//10*2000=20000
printf("%u\n",count);
return 0;
}
the answer goes right to 20000,so i think when you use sync_add_and_fetch function,you should goes with a while loop is stupid,so I write like this:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static unsigned long count = 0;
int compare_and_swap(int* reg, int oldval, int incre)
{
register char result;
#ifdef __i386__
__asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (oldval + incre), "a" (oldval) : "memory");
return result;
#elif defined(__x86_64__)
__asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (newval + incre), "a" (oldval) : "memory");
return result;
#else
#error:architecture not supported and gcc too old
#endif
}
void sync_add_and_fetch(int *reg,int oldval,int incre)
{
int ret = 0;
while(0 == ret)
{
ret = compare_and_swap(reg,oldval,incre);
}
}
void *test_func(void *arg)
{
int i=0;
for(i=0;i<2000;++i)
{
sync_add_and_fetch((int *)&count, count, 1);
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t id[10];
int i = 0;
for(i=0;i<10;++i){
pthread_create(&id[i],NULL,test_func,NULL);
}
for(i=0;i<10;++i){
pthread_join(id[i],NULL);
}
//10*2000=20000
printf("%u\n",count);
return 0;
}
but when i run this code with ./asm after g++ -g -o asm asm.cpp -lpthread.the asm just stuck for more than 5min,see top in another terminal:
3861 root 19 0 102m 888 732 S 400 0.0 2:51.06 asm
I just confused,is this code not the same?

The 64-bit compare_and_swap is wrong as it swaps 64 bits but int is only 32 bits.
compare_and_swap should be used in a loop which retries it until is succeeds.

Your result look right to me. lock cmpxchg succeeds most of the time, but will fail if another core beat you to the punch. You're doing 20k attempts to cmpxchg count+1, not 20k atomic increments.
To write __sync_fetch_and_add with inline asm, you'll want to use lock xadd. It's specifically designed to implement fetch-add.
Implementing other operations, like fetch-or or fetch-and, require a CAS retry loop if you actually need the old value. So you could make a version of the function that doesn't return the old value, and is just a sync-and without the fetch, using lock and with a memory destination. (Compiler builtins can make this optimization based on whether the result is needed or not, but an inline asm implementation doesn't get a chance to choose asm based on that information.)
For efficiency, remember that and, or, add and many other instructions can use immediate operands, so a "re"(src) constraint would be appropriate (not "ri" for int64_t on x86-64, because that would allow immediates too large. https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html). But cmpxchg, xadd, and xchg can't use immediates, of course.
I'd suggest looking at compiler output for modern gcc (e.g. on http://godbolt.org/) for functions using the builtin, to see what compilers do.
But beware that inline asm can compile correctly given one set of surrounding code, but not the way you expect given different code. e.g. if the surrounding code copied a value after using CAS on it (probably unlikely), the compiler might decide to give the asm template two different memory operands for "=m"(*reg) and "m"(*reg), but your asm template assumes they will always be the same address.
IDK if gcc4.1 supports it, but "+m"(*reg) would declare a read/write memory operand. Otherwise perhaps you can use a matching constraint to say that the input is in the same location as an earlier operand, like "0"(*reg). But that might only work for registers, not memory, I didn't check.
"a" (oldval) is a bug: cmpxchg writes EAX on failure.
It's not ok to tell the compiler you leave a reg unmodified, and then write an asm template that does modify it. You will get unpredictable behaviour from stepping on the compiler's toes.
See c inline assembly getting "operand size mismatch" when using cmpxchg for a safe inline-asm wrapper for lock cmpxchg. It's written for gcc6 flag-output, so you'll have to back-port that and maybe a few other syntax details to crusty old gcc4.1.
That answer also addresses returning the old value so it doesn't have to be separately loaded.
(Using ancient gcc4.1 sounds like a bad idea to me, especially for writing multi-threaded code. So much room for error from porting working code with __sync builtins to hand-rolled asm. The risks of using a newer compiler, like stable gcc5.5 if not gcc7.4, are different but probably smaller.)
If you're going to rewrite code using __sync builtins, the sane thing would be to rewrite it using C11 stdatomic.h, or GNU C's more modern __atomic builtins that are intended to replace __sync.
The Linux kernel does successfully use inline asm for hand-rolled atomics, though, so it's certainly possible.

If you truly are in such a predicament, I would start with the following header file:
#ifndef SYNC_H
#define SYNC_H
#if defined(__x86_64__) || defined(__i386__)
static inline int sync_val_compare_and_swap_int(int *ptr, int oldval, int newval)
{
__asm__ __volatile__( "lock cmpxchgl %[newval], %[ptr]"
: "+a" (oldval), [ptr] "+m" (*ptr)
: [newval] "r" (newval)
: "memory" );
return oldval;
}
static inline int sync_fetch_and_add_int(int *ptr, int val)
{
__asm__ __volatile__( "lock xaddl %[val], %[ptr]"
: [val] "+r" (val), [ptr] "+m" (*ptr)
:
: "memory" );
return val;
}
static inline int sync_add_and_fetch_int(int *ptr, int val)
{
const int old = val;
__asm__ __volatile__( "lock xaddl %[val], %[ptr]"
: [val] "+r" (val), [ptr] "+m" (*ptr)
:
: "memory" );
return old + val;
}
static inline int sync_fetch_and_sub_int(int *ptr, int val) { return sync_fetch_and_add_int(ptr, -val); }
static inline int sync_sub_and_fetch_int(int *ptr, int val) { return sync_add_and_fetch_int(ptr, -val); }
/* Memory barrier */
static inline void sync_synchronize(void) { __asm__ __volatile__( "mfence" ::: "memory"); }
#else
#error Unsupported architecture.
#endif
#endif /* SYNC_H */
The same extended inline assembly works for both x86 and x86-64. Only the int type is implemented, and you do need to replace possible __sync_synchronize() calls with sync_synchronize(), and each __sync_...() call with sync_..._int().
To test, you can use e.g.
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include "sync.h"
#define THREADS 16
#define PERTHREAD 8000
void *test_func1(void *sumptr)
{
int *const sum = sumptr;
int n = PERTHREAD;
while (n-->0)
sync_add_and_fetch_int(sum, n + 1);
return NULL;
}
void *test_func2(void *sumptr)
{
int *const sum = sumptr;
int n = PERTHREAD;
while (n-->0)
sync_fetch_and_add_int(sum, n + 1);
return NULL;
}
void *test_func3(void *sumptr)
{
int *const sum = sumptr;
int n = PERTHREAD;
int oldval, curval, newval;
while (n-->0) {
curval = *sum;
do {
oldval = curval;
newval = curval + n + 1;
} while ((curval = sync_val_compare_and_swap_int(sum, oldval, newval)) != oldval);
}
return NULL;
}
static void *(*worker[3])(void *) = { test_func1, test_func2, test_func3 };
int main(void)
{
pthread_t thread[THREADS];
pthread_attr_t attrs;
int sum = 0;
int t, result;
pthread_attr_init(&attrs);
pthread_attr_setstacksize(&attrs, 65536);
for (t = 0; t < THREADS; t++) {
result = pthread_create(thread + t, &attrs, worker[t % 3], &sum);
if (result) {
fprintf(stderr, "Failed to create thread %d of %d: %s.\n", t+1, THREADS, strerror(errno));
exit(EXIT_FAILURE);
}
}
pthread_attr_destroy(&attrs);
for (t = 0; t < THREADS; t++)
pthread_join(thread[t], NULL);
t = THREADS * PERTHREAD * (PERTHREAD + 1) / 2;
if (sum == t)
printf("sum = %d (as expected)\n", sum);
else
printf("sum = %d (expected %d)\n", sum, t);
return EXIT_SUCCESS;
}
Unfortunately, I don't have an ancient version of GCC to test, so this has only been tested with GCC 5.4.0 and GCC-4.9.3 for x86 and x86-64 (using -O2) on Linux.
If you find any bugs or issues in the above, please let me know in a comment so I can verify and fix as needed.

Related

Inline assembly in C with a jump and two return statements

I would like to have a void function which prints whether carry (overflow) happened and print the value of the (possibly overflowed) summand.
This is my try, but it does want to compile:
#include <stdio.h>
typedef unsigned long ulong;
void
func()
{
ulong a = 3;
ulong b = 1;
ulong c;
__asm__(
"\taddq\t%2, %q0\n" /* Add b to a and store in c */
"\tjc\tcarry"
: "=r" (c) /* Outputs */
: "0" (a), "rme" (b) /* Inputs */
);
printf("no carry\nc = %lu\n", c);
return;
__asm__("carry:");
printf("carry\nc = %lu\n", c);
return;
}
int
main()
{
func();
}
However, it runs when I remove the first return statement, but then it prints twice if no carry happened.
How can I do this with two return statements?
You need to use asm goto for that. To do that, add goto after __asm__, change your label to be a C label, pass the label after the clobbers, and then use a %l to refer to it. Here's your program with those fixes applied:
#include <stdio.h>
typedef unsigned long ulong;
void
func()
{
ulong a = 3;
ulong b = 1;
ulong c;
__asm__ goto(
"\taddq\t%2, %q0\n" /* Add b to a and store in c */
"\tjc\t%l[carry]"
: "=r" (c) /* Outputs */
: "0" (a), "rme" (b) /* Inputs */
:
: carry
);
printf("no carry\nc = %lu\n", c);
return;
carry:
printf("carry\nc = %lu\n", c);
return;
}
int
main()
{
func();
}
Though as was mentioned in the comments, for this use case in particular, you should just use __builtin_uaddl_overflow instead, unless your goal is just to learn how to jump out of inline assembly.

On ARM macOS when explicitly raise()-ing a signal, some return addresses are garbled on the stack

Here's a simple program for ARM macOS that installs a signal handler for SIGSEGV, then generates one. In the signal handler function, the stack is walked with the usual frame pointer chasing algorithm, then the symbolized version is printed out:
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <execinfo.h>
#include <stdlib.h>
void handler(int signum, siginfo_t* siginfo, void* context)
{
__darwin_ucontext* ucontext = (__darwin_ucontext*) context;
__darwin_mcontext64* machineContext = ucontext->uc_mcontext;
uint64_t programCounter = machineContext->__ss.__pc;
uint64_t framePointer = machineContext->__ss.__fp;
void* bt[100];
int n = 0;
while (framePointer != 0) {
bt[n] = (void*)programCounter;
programCounter = *(uint64_t*)(framePointer + 8);
framePointer = *(uint64_t*)(framePointer);
++n;
}
char** symbols = backtrace_symbols(bt, n);
printf ("Call stack:\n");
for (int i = 0; i < n; ++i) {
printf ("\t %s\n", symbols[i]);
}
free (symbols);
abort ();
}
void Crash ()
{
raise (SIGSEGV);
//*(volatile int*)0 = 0;
}
int main()
{
struct sigaction sigAction;
sigAction.sa_sigaction = handler;
sigAction.sa_flags = SA_SIGINFO;
sigaction (SIGSEGV, &sigAction, nullptr);
Crash ();
}
This works fine when a "regular" SIGSEGV happens, but when it's raised explicitly, return values on the stack seem garbled, specifically, the upper part seems to contain garbage:
Call stack:
0 libsystem_kernel.dylib 0x0000000185510e68 __pthread_kill + 8
1 libsystem_c.dylib 0x116a000185422e14 raise + [...] // Should be 0x0000000185422e14
2 SignalHandlerTest 0x8f6a000104bc3eb8 _Z5Crashv + [...] // Should be 0x0000000104bc3eb8
3 SignalHandlerTest 0x0000000104bc3ef8 main + 56
4 libdyld.dylib 0x0000000185561450 start + 4
The behavior is the same regardless of which signal is raised. What am I missing?
As #Codo has correctly identified, this is PAC.
The upper bits of the address are not garbled, but rather contain a salted hash of the register's lower bits.
And contrary to your claims, this happens with regular segfaults too. For example, calling fprintf(NULL, "a"); results in:
Call stack:
0 libsystem_c.dylib 0x000000019139d8a0 flockfile + 28
1 libsystem_c.dylib 0x1d550001913a5870 vfprintf_l + 2113595600120315944
2 libsystem_c.dylib 0x341c80019139efd0 fprintf + 3755016926808506440
3 t 0x5f29000100483e9c Crash + 6857011907648290844
4 t 0x0000000100483edc main + 56
5 libdyld.dylib 0x00000001914b1430 start + 4
This is because all system binaries, including libraries, are compiled for the arm64e ABI and will make use of PAC. Now, your binary is running as a regular old arm64 binary and would crash if it passed an unsigned function pointer to a library function, or got a signed one returned. So the kernel actually disables 3 of the 4 keys that your process can use (IA, IB, DA and DB). But one of those, IB, is used solely for stack frames and so that one is left enabled even in arm64 binaries.
The reason why some return addresses are still not signed though is:
The main + 56 and start + 4 were pushed by your code, which is arm64 and hence doesn't sign them.
The flockfile + 28 is the instruction that crashed, whose address was never pushed to the stack, but extracted from the thread state.
So everything's working exactly as it's supposed to.
Edit:
After attempting to use this to aid me in debugging myself, I find the PAC'ed addresses to be annoying after all. You commented about ptrauth_strip in ptrauth.h, but that will actually not work inside an arm64 process (it's aliased to a macro that does nothing), nor will __builtin_ptrauth_strip (the compiler will error out).
The compiler won't even let you use a raw xpaci instruction when targeting arm64, but nothing on the hardware level prevents the instruction from working, so you can still manually inject the opcode.
Based on this, I wrote a signal handler that properly strips PAC signatures from an arm64 process:
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <execinfo.h>
#ifdef __arm64__
extern void* xpaci(uint64_t pc);
__asm__
(
"_xpaci:\n"
" mov x1, x30\n"
" mov x30, x0\n"
" .4byte 0xd50320ff\n" // xpaclri
" mov x0, x30\n"
" ret x1\n"
);
#else
static inline void* xpaci(uint64_t pc)
{
return (void*)pc;
}
#endif
static void handler(int signum, siginfo_t *siginfo, void *ctx)
{
_STRUCT_MCONTEXT64 *mctx = ((_STRUCT_UCONTEXT*)ctx)->uc_mcontext;
#ifdef __arm64__
uint64_t orig_pc = mctx->__ss.__pc;
uint64_t orig_fp = mctx->__ss.__fp;
#elif defined(__x86_64__)
uint64_t orig_pc = mctx->__ss.__rip;
uint64_t orig_fp = mctx->__ss.__rbp;
#else
# error "Unknown arch"
#endif
uint64_t pc = orig_pc;
uint64_t fp = orig_fp;
size_t n = 0;
while(1)
{
if(!xpaci(pc))
{
break;
}
++n;
if(!fp)
{
break;
}
pc = ((uint64_t*)fp)[1];
fp = ((uint64_t*)fp)[0];
}
void **bt = malloc(n * sizeof(void*));
if(!bt)
{
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(-1);
}
pc = orig_pc;
fp = orig_fp;
for(size_t i = 0; i < n; ++i)
{
bt[i] = xpaci(pc);
if(!fp)
{
break;
}
pc = ((uint64_t*)fp)[1];
fp = ((uint64_t*)fp)[0];
}
char **sym = backtrace_symbols(bt, n);
fprintf(stderr, "Caught signal with call stack:\n");
for(size_t i = 0; i < n; ++i)
{
fprintf(stderr, "%s\n", sym[i]);
}
free(sym);
free(bt);
exit(-1);
}
It uses xpaclri rather than xpaci, since the former is a NOP on arm64 (non-arm64e) hardware while the latter would be undefined.

exchange function in system call table in x86

i am trying to redefine an systemcall for sys_open and to track user behaviour with that.
I use a Linux Kernel 4.13.0-041300.
This is my code so far
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/syscalls.h>
MODULE_LICENSE ("GPL");
//this is where the original sys_open call position will be saved
asmlinkage long (*original_open)(const char __user *filename, int flags, umode_t mode);
unsigned long **sys_call_table;
//this is to track how often my replaced function was called...
static int zeug = 0;
//this is my open function that i want to be replaced in the sys_call_table
asmlinkage long replaced_open(const char __user *filename, int flags, umode_t mode)
{
printk ("replaced wurde aufgerufen...\n");
zeug++;
return original_open(filename, flags, mode);
}
static void enable_page_protection(void)
{
unsigned long value;
asm volatile("mov %%cr0, %0" : "=r" (value));
if((value & 0x00010000))
return;
asm volatile("mov %0, %%cr0" : : "r" (value | 0x00010000));
}
static void disable_page_protection(void)
{
unsigned long value;
asm volatile("mov %%cr0, %0" : "=r" (value));
if(!(value & 0x00010000))
return;
asm volatile("mov %0, %%cr0" : : "r" (value & ~0x00010000));
}
//the function to get the system_call_table
static unsigned long **aquire_sys_call_table(void)
{
unsigned long int offset = PAGE_OFFSET;
unsigned long **sct;
while (offset < ULLONG_MAX) {
sct = (unsigned long **)offset;
if (sct[__NR_close] == (unsigned long *) sys_close)
return sct;
offset += sizeof(void *);
}
return NULL;
}
static int __init minit (void)
{
printk ("minit: startet...\n");
if(!(sys_call_table = aquire_sys_call_table()))
return -1;
printk ("minit: sys_call_table ersetzt...\n");
disable_page_protection();
{
//here i print the function name of the current function in sys_call_table
printk ("minit: eintrag vor ersetzen:%pF\n", sys_call_table[__NR_open]);
//here i store the real sys_open function and change to my func
original_open =(void * )xchg(&sys_call_table[__NR_open],(unsigned long *)replaced_open);
}
enable_page_protection();
return 0;
}
static void mexit (void)
{
printk ("mexit gestartet.\n");
printk ("Open was called %d times...\n",zeug);
if(!sys_call_table) return;
//here i print the stored function again
printk ("bei exit:%pF\n", sys_call_table[__NR_open]);
disable_page_protection();
{
//change back to original sys_open function
xchg(&sys_call_table[__NR_open], (unsigned long *)original_open);
}
printk ("nach zurücksetzen:%pF\n", sys_call_table[__NR_open]);
enable_page_protection();
}
module_init(minit);
module_exit(mexit);
My plan:
After insmod this module to kernel, every systemcall of sys_open will be "redirected" to my function replaced_open. This function will count its calls and then call the original open function.
After rmmod of my module the original system_call open will be used again.
It seems, that the replacing works. So after insmmod I get the result replaced_open+0x0/0x40 [kroot].
That means, the original function sys_open was replaced to my replaced_open right?
and after removing my module I get the message SyS_open+0x0/0x20.
So it seems like replacing works.
My problem is: I don't see any printed messages from my replaced_open function. Also it seems that counting doesn't work.
It feels like the function wasn't replaced properly.
Do you have any help for me?

Clock Cycles Count Variation Cortex A53 AArch64

I try to count cpu clock cycles for my function on ARM Cortex-A53 using following function:
#include <sys/time.h>
readticks(unsigned int *result, int enabled)
{
struct timeval t;
unsigned int cc;
unsigned int val;
if (!enabled) {
// program the performance-counter control-register:
asm volatile("msr pmcr_el0, %0" : : "r" (17));
//enable all counters
asm volatile("msr PMCNTENSET_EL0, %0" : : "r" (0x8000000f));
//clear the overflow
asm volatile("msr PMOVSCLR_EL0, %0" : : "r" (0x8000000f));
enabled = 1;
}
//read the coutner value
asm volatile("mrs %0, PMCCNTR_EL0" : "=r" (cc));
gettimeofday(&t,(struct timezone *) 0);
result[0] = cc;
result[1] = t.tv_usec;
result[2] = t.tv_sec;
}
and here is my user space application:
#include <stio.h>
#include <inttypes.h>
#include <time.h>
int main(){
unsigned int init[3] = {0};
unsigned int start[3] = {0};
unsigned int end[3] = {0};
unsigned int overhead = 0;
readticks(init, 0);
readticks(start, 1);
readticks(end, 1);
overhead = end[0] - start[0];
readticks(init, 0);
readticks(start, 1);
foo(); //This is my function
readticks(end, 1);
end[0] = end[0] - start[0] - overhead;
printf("clock cycles= %d\n", end[0]);
return 0;
}
When I run my code for several times, I have got different clock cycles with relatively high variation (almost 5000). My code should be run around 4000 clock cycles, but I have got 4500 - 9500 clock cycles. Is there any way around that gives me more accurate clock cycles count?
use the following macro
#define mfcp(rn) ({u32 rval = 0U; \
__asm__ __volatile__(\
"mrc " rn "\n"\
: "=r" (rval)\
);\
rval;\
})
#endif
call mfcp with counter register
uint64_t t1,t2;
t1 = mfcp(CNTPCT_EL0);
// your code
t2 = mfcp(CNTPCT_EL0);

Deadlock and race condition in mutex implementation

I'm trying to implement a mutex in C using the atomic assembly instruction "bts" to atomically set a bit and return the original value.
However, when I run the following code, it occasionally deadlocks and often shows race conditions:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
typedef unsigned char mutex;
#define MUTEX_FREE 0
#define MUTEX_BUSY 1
// adapted from http://www.acm.uiuc.edu/sigops/roll_your_own/i386/atomic.html
mutex testAndSet(mutex *m) {
int result;
asm ("bts $0, %1; sbbl %0, %0"
:"=r" (result)
:"m" (*m)
:"memory");
return (result & 1);
}
void P(mutex *m) {
// Must use atomic testAndSet to avoid race conditions
while(testAndSet(m) == MUTEX_BUSY)
usleep(10);
}
void V(mutex *m) {
*m = MUTEX_FREE;
}
//////////////
// Test:
//////////////
const int NTHREADS = 100;
const int NINCS = 100;
int counter = 0;
mutex m = MUTEX_FREE;
void criticalSection() {
int i;
for(i=0;i<NINCS;i++) {
P(&m);
counter++;
V(&m);
}
}
int main() {
int i;
pthread_t threads[NTHREADS];
for(i=0; i<NTHREADS; i++) {
pthread_create(&threads[i], NULL, (void *) &criticalSection, NULL);
}
for(i=0; i<NTHREADS; i++) {
pthread_join(threads[i], NULL);
}
printf("got counter=%d, expected=%d\n", counter, NTHREADS*NINCS);
}
The code seems to work if I use the "xchgb" instruction instead of "bts" as follows:
mutex testAndSet(mutex *m) {
unsigned char result = MUTEX_BUSY;
asm ("xchgb %1, %0"
:"=m" (*m), "=r" (result)
:"1" (result)
:"memory");
return result;
}
Where is the race condition in the original code? Shouldn't the "bts" instruction be atomic, guaranteeing thread safety?
Furthermore, is my modified solution actually correct?
(I'm running OS X 10.8 and compiling with gcc.)
Try using the LOCK prefix to lock the memory bus:
asm ("lock bts $0, %1; ...");
The xchg instruction worked because that always asserts the LOCK# signal regardless of the presence or absence of the LOCK prefix.

Resources