Consider the following code:
// String literals
#define _def0Impl(a0) #a0
#define _def0(a0) _def0Impl(a0)
// Labels
#define _asm_label(tag) tag: asm volatile (_def0(tag) ":")
// Assume 32 bits
typedef unsigned int uptr;
int main (int argc, char *argv[]) {
register int ctr, var;
uptr tbl[0x4];
ctr = 0x0;
var = 0x0;
// Push some tasks to tbl ...
// Suppose that tbl holds {&&tag0, &&tag1, &&tag2, &&tag1}
// Suppose that ctr holds 0xC
// tag* may be exported somewhere else.
ctr = 0x3 * sizeof(uptr);
tbl[0x0] = &&tag0;
tbl[0x1] = &&tag1;
tbl[0x2] = &&tag2;
tbl[0x3] = &&tag1;
// Run tasks table
goto **(void **)(((uptr)&tbl[0x0]) + ctr); // jump to the label stored at tbl[ctr / sizeof(uptr)]
_asm_label(tag2);
// Task I
ctr -= sizeof(uptr);
var += 0x1;
goto **(void **)(((uptr)&tbl[0x0]) + ctr);
_asm_label(tag1);
// Task II
ctr -= sizeof(uptr);
var -= 0x1;
goto **(void **)(((uptr)&tbl[0x0]) + ctr);
_asm_label(tag0);
// Continue execution
return var;
}
Can I re-write this implementation with inline assembly?
Old statement
Consider the following code:
#define _asm_label(tag) asm volatile(tag ":")
// PowerPC for example
#define _asm_jump(tag) asm volatile ("b " tag)
#define _asm_bar() asm volatile ("" ::: "cc", "memory")
int main(int argc, char *argv[]) {
register int var;
var = 0;
_asm_jump("bar");
_asm_bar(); // Boundary
var += 1;
_asm_label("bar");
_asm_bar(); // Boundary
var += 1;
return var;
}
With -O0 gcc generates:
li 30,0
b bar
# 0 "" 2
addi 30,30,1
bar:
# 0 "" 2
addi 30,30,1
mr 9,30
mr 3,9 # r3 = 0x1
But with -O2:
b bar
# 0 "" 2
bar:
# 0 "" 2
lwz 0,12(1) # restore link register
li 3,2 # incorrect
The output is incorrect since the statements get optimized out.
Are there any ways to make a "barrier" of optimization in GCC?
Edit : Attempt #1
Adding volatile to var.
With -O2:
li 9,0
stw 9,8(1)
# 10 "attempt1.c" 1
b bar
# 0 "" 2
lwz 9,8(1)
addi 9,9,1
stw 9,8(1)
# 15 "attempt1.c" 1
bar:
# 0 "" 2
lwz 9,8(1)
lwz 0,28(1)
addi 9,9,1
stw 9,8(1)
In this case, var is put into stack (r1 + 0x8).
However, putting volatile on var stops all optimization involving var.
I am thinking about making use of asm goto, but it is only available in gcc >= 4.5, IIRC.
The output is incorrect
The output is completely fine; your code is not correct.
Are there any ways to make a "barrier" of optimization in GCC?
The best you can get is
__asm volatile ("" ::: "memory", <more-clobbers>)
However, that doesn't fix your wrong code. The code is wrong because the inline asm has side effects you don't tell the compiler about; this will almost certainly bite you sooner or later. If jumping is what you want, then like so:
int func (void)
{
int var = 0;
__asm volatile goto ("b %l0" :::: labl);
var += 1;
labl:;
var += 1;
return var;
}
Generated code:
func:
# 5 "b.c" 1
b .L3
# 0 "" 2
li 3,2
blr
.p2align 4,,15
.L3:
.L2:
li 3,1
blr
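asm goto also accepts input operands (though no output operands before GCC 11), so the same mechanism handles conditional branches. Here is a minimal sketch of my own in the same PowerPC flavor; note that labels are numbered after the inputs, so the first label is %l1 here:
static int func2 (int x)
{
    __asm volatile goto ("cmpwi %0, 0\n\t"
                         "beq %l1"
                         : /* no outputs */
                         : "r" (x)
                         : "cc"
                         : iszero);
    return 1;   /* taken when x != 0 */
iszero:;
    return 0;
}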
Related
I recently dabbled in low-level programming, and want to make a function somesyscall that accepts (CType rax, CType rbx, CType rcx, CType rdx). The struct CType looks like:
/*
TYPES:
0 int
1 string
2 bool
*/
typedef struct {
void* val;
int typev;
} CType;
the function is a bit messy, but in theory should work:
#include <errno.h>
#include <stdbool.h>
#include "ctypes.h"
//define functions to set registers
#define seteax(val) asm("mov %0, %%rax" :: "g" (val) : "%rax")
#define setebx(val) asm("mov %0, %%rbx" :: "g" (val) : "%rbx")
#define setecx(val) asm("mov %0, %%rcx" :: "g" (val) : "%rcx")
#define setedx(val) asm("mov %0, %%rdx" :: "g" (val) : "%rdx")
///////////////////////////////////
#define setregister(value, register) \
switch (value.typev) { \
case 0: { \
register(*((double*)value.val)); \
break; \
} \
case 1: { \
register(*((char**)value.val)); \
break; \
} \
case 2: { \
register(*((bool*)value.val)); \
break; \
} \
}
static inline long int somesyscall(CType a0, CType a1, CType a2, CType a3) {
//set the registers
setregister(a0, seteax);
setregister(a1, setebx);
setregister(a2, setecx);
setregister(a3, setedx);
///////////////////
asm("int $0x80"); //interrupt
//fetch back the rax
long int raxret;
asm("mov %%rax, %0" : "=r" (raxret));
return raxret;
}
when I run with:
#include "syscall_unix.h"
int main() {
CType rax;
rax.val = 39;
rax.typev = 0;
CType rbx;
rbx.val = 0;
rbx.typev = 0;
CType rcx;
rcx.val = 0;
rcx.typev = 0;
CType rdx;
rdx.val = 0;
rdx.typev = 0;
printf("%ld", somesyscall(rax, rbx, rcx, rdx));
}
and compile (and run binary) with
clang test.c
./a.out
I get a segfault. However, everything looks correct to me. Am I doing anything wrong here?
After macro expansion you will have something like
long int raxret;
asm("mov %0, %%rax" :: "g" (a0) : "%rax");
asm("mov %0, %%rbx" :: "g" (a1) : "%rbx");
asm("mov %0, %%rcx" :: "g" (a2) : "%rcx");
asm("mov %0, %%rdx" :: "g" (a3) : "%rdx");
asm("int $0x80");
asm("mov %%rax, %0" : "=r" (raxret));
This doesn't work because you haven't told the compiler that it's not allowed to reuse rax, rbx, rcx, and rdx for something else during the sequence of asm statements. For instance, the register allocator might decide to copy a2 from the stack to rax and then use rax as the input operand for the mov %0, %%rcx instruction -- clobbering the value you put in rax.
(asm statements with no outputs are implicitly volatile, so the first five can't reorder relative to each other, but the final one can move anywhere: for example, it could be moved after later code, to wherever the compiler finds it convenient to generate raxret in a register of its choice. RAX might no longer hold the system-call return value at that point. You need to tell the compiler that the output comes from the asm statement that actually produces it, without assuming any registers survive between asm statements.)
There are two different ways to tell the compiler not to do that:
Put only the int instruction in an asm, and express all of the requirements for what goes in what register with constraint letters:
asm volatile ("int $0x80"
: "=a" (raxret) // outputs
: "a" (a0), "b" (a1), "c" (a2), "d" (a3) // pure inputs
: "memory", "r8", "r9", "r10", "r11" // clobbers
// 32-bit int 0x80 system calls in 64-bit code zero R8..R11
// for native "syscall", clobber "rcx", "r11".
);
This is possible for this simple example but not always possible in general, because there aren't constraint letters for every single register, especially not on CPUs other than x86.
// use the native 64-bit syscall ABI
// remove the r8..r11 clobbers for 32-bit mode
Put only the int instruction in an asm, and express the requirements for what goes in what register with explicit register variables:
register long rax asm("rax") = a0;
register long rbx asm("rbx") = a1;
register long rcx asm("rcx") = a2;
register long rdx asm("rdx") = a3;
// Note that int $0x80 only looks at the low 32 bits of input regs
// so `uint32_t` would be more appropriate than long
// but really you should just use "syscall" in 64-bit code.
asm volatile ("int $0x80"
: "+r" (rax) // read-write: in=call num, out=retval
: "r" (rbx), "r" (rcx), "r" (rdx) // read-only inputs
: "memory", "r8", "r9", "r10", "r11"
);
return rax;
This will work regardless of which registers you need to use. It's also probably more compatible with the macros you're trying to use to erase types.
Incidentally, if this is 64-bit x86/Linux then you should be using syscall rather than int $0x80, and the arguments belong in the ABI-standard incoming-argument registers (rdi, rsi, rdx, rcx, r8, r9 in that order), not in rbx, rcx, rdx etc. The system call number still goes in rax, though. (Use call numbers from #include <asm/unistd.h> or <sys/syscall.h>, which will be appropriate for the native ABI of the mode you're compiling for, another reason not to use int $0x80 in 64-bit mode.)
Also, the asm statement for the system-call instruction should have a "memory" clobber and be declared volatile; almost all system calls access memory somehow.
(As a micro-optimization, I suppose you could have a list of system calls that don't read memory, write memory, or modify the virtual address space, and avoid the memory clobber for them. It would be a pretty short list and I'm not sure it would be worth the trouble. Or use the syntax shown in How can I indicate that the memory *pointed* to by an inline ASM argument may be used? to tell GCC which memory might be read or written, instead of a "memory" clobber, if you write wrappers for specific syscalls.
Some of the no-pointer cases include getpid where it would be a lot faster to call into the VDSO to avoid a round trip to kernel mode and back, like glibc does for the appropriate syscalls. That also applies to clock_gettime which does take pointers.)
Incidentally, beware of the actual kernel interfaces not matching up with the interfaces presented by the C library's wrappers. This is generally documented in the NOTES section of the man page, e.g. for brk(2) and getpriority(2).
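Putting that advice together, a raw 3-argument wrapper for the native 64-bit ABI might look like the sketch below (my own example, not the questioner's code; "D", "S" and "d" are the constraint letters for rdi, rsi and rdx):
#include <asm/unistd.h>  // for __NR_* call numbers
static inline long syscall3(long num, long a1, long a2, long a3)
{
    long ret;
    asm volatile ("syscall"
                  : "=a" (ret)                              // return value in rax
                  : "a" (num), "D" (a1), "S" (a2), "d" (a3) // number and args
                  : "rcx", "r11", "memory");                // syscall clobbers rcx, r11
    return ret;
}
Usage would be something like syscall3(__NR_write, 1, (long)"hi\n", 3);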
I want to compile assembly code (MIPS) within C, but I came across a problem with register names. The error message is:
ABS.c:8:2: error: unknown register name ‘$s1’ in ‘asm’
and my code (ABS.c) are as follow:
#include<stdio.h>
#include<stdlib.h>
int ABS(int x)
{
int x_abs;
__asm__ __volatile__(
"move $s1, %1\n"
"bgez $s1, DONE\n"
"NOP\n"
"mult $s1, #-1\n"
"mflo $s1\n"
"DONE: move %0, $s1\n"
: "=r"(x_abs)
: "r"(x)
: "$s1");
return x_abs;
}
int main()
{
int result = ABS(2);
printf("%d", result);
system("pause");
return 0;
}
I wonder how I could fix this problem. Thanks!
All registers in MIPS are accessible through symbolic names or numbers. I had the same issue; I solved it by using numbers instead of symbolic names, e.g. move $17, %1 instead of move $s1, %1, and "$17" instead of "$s1" in the clobber list.
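For example, the ABS function above might be rewritten as the sketch below. Besides the numeric register name ($17 is $s1), it uses a numeric local label instead of the global DONE (which would collide if the asm were emitted twice) and negates with subu, since mult does not take an immediate operand:
#include <stdio.h>
int ABS(int x)
{
    int x_abs;
    __asm__ __volatile__(
        "move $17, %1\n\t"      /* $17 == $s1 */
        "bgez $17, 1f\n\t"      /* skip negation if x >= 0 */
        "nop\n\t"
        "subu $17, $0, $17\n"   /* $17 = 0 - $17 */
        "1: move %0, $17\n\t"
        : "=r"(x_abs)
        : "r"(x)
        : "$17");
    return x_abs;
}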
Apparently, it is possible to atomically increment two integers with compare-and-swap instructions. This talk claims that such an algorithm exists but it does not detail what it looks like.
How can this be done?
(Note that the obvious solution of incrementing the integers one after the other is not atomic. Also, stuffing multiple integers into one machine word does not count because it would restrict the possible range.)
Makes me think of a sequence lock. Not very accurate (I'm writing this from memory), but something along the lines of:
Let x, y and s be 64-bit integers.
To increment:
atomic s++ (i.e. an atomic increment using a 64-bit CAS op)
memory barrier
atomic x++
atomic y++
atomic s++
memory barrier
To read:
do {
S1 = load s
X = load x
Y = load y
memory barrier
S2 = load s
} while (S1 != S2)
Also see https://en.wikipedia.org/wiki/Seqlock
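For reference, here is roughly how that recipe maps onto C11 <stdatomic.h> (my own translation, not from the talk). One detail the pseudocode above glosses over: a reader must also retry while s is odd, because an odd count means a writer is mid-update:
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t s, x, y;

void incr_both(void)
{
    atomic_fetch_add_explicit(&s, 1, memory_order_relaxed); /* s odd: write begins */
    atomic_thread_fence(memory_order_release);              /* order s before x, y */
    atomic_fetch_add_explicit(&x, 1, memory_order_relaxed);
    atomic_fetch_add_explicit(&y, 1, memory_order_relaxed);
    atomic_fetch_add_explicit(&s, 1, memory_order_release); /* s even: write done */
}

void read_both(uint64_t *px, uint64_t *py)
{
    uint64_t s1, s2;
    do {
        s1 = atomic_load_explicit(&s, memory_order_acquire);
        *px = atomic_load_explicit(&x, memory_order_relaxed);
        *py = atomic_load_explicit(&y, memory_order_relaxed);
        atomic_thread_fence(memory_order_acquire);          /* order x, y before s2 */
        s2 = atomic_load_explicit(&s, memory_order_relaxed);
    } while (s1 != s2 || (s1 & 1));                         /* retry on torn read */
}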
If SSE2 is available, you can use paddq to add two 64-bit integers to two other 64-bit integers in one instruction.
#include "emmintrin.h"
//initialize your values somewhere:
//const __m128i ones = _mm_set1_epi64x(1);
//volatile register __m128i vars =
// _mm_set_epi64x(24,7);
static inline __m128i inc_both(__m128i vars, __m128i ones){
return _mm_add_epi64(vars,ones);
}
This should compile to
paddq %xmm0, %xmm1
Since it is static inline, it may use other xmm registers, though. If there is significant register pressure, the ones operand may become a memory operand like ones(%rip).
Note: this can be used for adding values other than 1, and there are similar instructions for most other math, bitwise and compare operations, should you need them.
You can wrap it in an inline asm macro (note that the lock prefix cannot be applied to paddq, so this is a single instruction but not a guaranteed-atomic read-modify-write):
#define inc64x2(vars) asm volatile( \
"paddq %1, %0\n":"+x"(vars):"x"(ones) \
);
The arm neon equivalent is something like: vaddq_s64(...), but there is a great article about arm/x86 equivalents here.
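Hypothetical usage of the inc64x2 macro (assuming, as the macro does, a vector of ones named ones in scope):
#include <emmintrin.h>
long long demo(void)
{
    const __m128i ones = _mm_set1_epi64x(1);
    __m128i vars = _mm_set_epi64x(24, 7);  /* high lane 24, low lane 7 */
    inc64x2(vars);                         /* vars is now {25, 8} */
    return _mm_cvtsi128_si64(vars);        /* returns the low lane: 8 */
}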
I've got a solution I've tested. Contained herein is a soup-to-nuts proof-of-concept program.
The algorithm uses a CAS-guarded "thread id gate" as the third integer. I watched the video talk twice, and I believe this qualifies. It may not be the algorithm the presenter was thinking of, but it does work.
The X and Y values can be anywhere in memory and the program places them far enough away from each other that they are on different cache lines. It doesn't really matter.
A quick description of the algorithm:
Each thread has a unique id number or tid (non-zero), taken from one's favorite source: pthread_t, getpid, gettid, or one made up by whatever means you want. In the program, it just assigns them sequentially starting from 1.
Each thread will call the increment function with this number.
The increment function will spin on a global gate variable using CAS with an old value of 0 and a new value of tid.
When the CAS succeeds, the thread now "owns" things. In other words, if the gate is zero, it's up for grabs. A non-zero value is the tid of the owner and the gate is locked.
Now, the owner is free to increment the X and Y values with simple x += 1 and y += 1.
After that, the increment function releases by doing a store of 0 into the gate.
Here is the diagnostic/proof-of-concept program with everything. The algorithm itself has no restrictions, but I coded it for my machine.
Some caveats:
It assumes gcc/clang
It assumes a 64 bit x86_64 arch.
This was coded using nothing but inline asm and needs no [nor uses any] compiler atomic support for clarity, simplicity, and transparency.
This was built under linux, but should work on any "reasonable" x86 machine/OS (e.g. BSD, OSX should be fine, cygwin probably, and mingw maybe)
Other arches are fine if they support CAS, I just didn't code for them (e.g. arm might work if you code the CAS with ldex/stex pairs)
There are enough abstract primitives that this would/should be easy.
No attempt at Windows compatibility [if you want it, do your own port but send me no tears--or comments :-)].
The makefile and program have been defaulted to best values
Some x86 CPUs may need to use different defaults (e.g. need fence instructions). See the makefile.
Anyway, here it is:
// caslock -- prove cas lock algorithm
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#define systls __thread
// repeat the madness only once
#ifdef __clang__
#define inline_common inline
#else
#define inline_common static inline
#endif
#define inline_always inline_common __attribute__((__always_inline__))
#define inline_never __attribute__((__noinline__))
// WARNING: inline CAS fails for gcc but works for clang!
#if _USE_CASINLINE_
#define inline_cas inline_always
#else
#define inline_cas inline_never
#endif
typedef unsigned int u32;
typedef unsigned long long u64;
#ifndef LOOPMAX
#define LOOPMAX 1000000
#endif
#ifndef TIDMAX
#define TIDMAX 20
#endif
#if _USE_VPTR_
typedef volatile u32 *xptr32_p;
typedef volatile u64 *xptr64_p;
#else
typedef u32 *xptr32_p;
typedef u64 *xptr64_p;
#endif
#if _USE_TID64_
typedef u64 tid_t;
#define tidload(_xptr) loadu64(_xptr)
#define tidcas(_xptr,_oval,_nval) casu64(_xptr,_oval,_nval)
#define tidstore(_xptr,_nval) storeu64(_xptr,_nval)
#else
typedef u32 tid_t;
#define tidload(_xptr) loadu32(_xptr)
#define tidcas(_xptr,_oval,_nval) casu32(_xptr,_oval,_nval)
#define tidstore(_xptr,_nval) storeu32(_xptr,_nval)
#endif
tid_t tidgate; // gate control
tid_t readycnt; // number of threads ready
tid_t donecnt; // number of threads complete
// ensure that the variables are nowhere near each other
u64 ary[100];
#define kickoff ary[32] // sync to fire threads
#define xval ary[31] // the X value
#define yval ary[87] // the Y value
int inctype; // increment algorithm to use
tid_t tidmax; // maximum number of tasks
u64 loopmax; // loop maximum for each task
// task control
struct tsk {
tid_t tsk_tid; // task id
u32 tsk_casmiss; // cas miss count
};
typedef struct tsk tsk_t;
tsk_t *tsklist; // task list
systls tsk_t *tskcur; // current task block
// show progress
#define PGR(_pgr) \
do { \
fputs(_pgr,stdout); \
fflush(stdout); \
} while (0)
// NOTE: some x86 arches need fence instructions
// 0 -- no fence instructions
// 1 -- use mfence
// 2 -- use lfence/sfence
#if _USE_BARRIER_ == 0
#define BARRIER_RELEASE ""
#define BARRIER_ACQUIRE ""
#define BARRIER_ALL ""
#elif _USE_BARRIER_ == 1
#define BARRIER_ACQUIRE "\tmfence\n"
#define BARRIER_RELEASE "\tmfence\n"
#define BARRIER_ALL "\tmfence\n"
#elif _USE_BARRIER_ == 2
#define BARRIER_ACQUIRE "\tlfence\n"
#define BARRIER_RELEASE "\tsfence\n"
#define BARRIER_ALL "\tmfence\n"
#else
#error caslock: unknown barrier type
#endif
// barrier_acquire -- acquire barrier
inline_always void
barrier_acquire(void)
{
__asm__ __volatile__ (
BARRIER_ACQUIRE
:
:
: "memory");
}
// barrier_release -- release barrier
inline_always void
barrier_release(void)
{
__asm__ __volatile__ (
BARRIER_RELEASE
:
:
: "memory");
}
// barrier -- barrier
inline_always void
barrier(void)
{
__asm__ __volatile__ (
BARRIER_ALL
:
:
: "memory");
}
// casu32 -- compare and exchange four bytes
// RETURNS: 1=ok, 0=fail
inline_cas int
casu32(xptr32_p xptr,u32 oldval,u32 newval)
{
char ok;
__asm__ __volatile__ (
" lock\n"
" cmpxchg %[newval],%[xptr]\n"
" sete %[ok]\n"
: [ok] "=r" (ok),
[xptr] "=m" (*xptr)
: "a" (oldval),
[newval] "r" (newval)
: "memory");
return ok;
}
// casu64 -- compare and exchange eight bytes
// RETURNS: 1=ok, 0=fail
inline_cas int
casu64(xptr64_p xptr,u64 oldval,u64 newval)
{
char ok;
__asm__ __volatile__ (
" lock\n"
" cmpxchg %[newval],%[xptr]\n"
" sete %[ok]\n"
: [ok] "=r" (ok),
[xptr] "=m" (*xptr)
: "a" (oldval),
[newval] "r" (newval)
: "memory");
return ok;
}
// loadu32 -- load value with barrier
// RETURNS: loaded value
inline_always u32
loadu32(const xptr32_p xptr)
{
u32 val;
barrier_acquire();
val = *xptr;
return val;
}
// loadu64 -- load value with barrier
// RETURNS: loaded value
inline_always u64
loadu64(const xptr64_p xptr)
{
u64 val;
barrier_acquire();
val = *xptr;
return val;
}
// storeu32 -- store value with barrier
inline_always void
storeu32(xptr32_p xptr,u32 val)
{
*xptr = val;
barrier_release();
}
// storeu64 -- store value with barrier
inline_always void
storeu64(xptr64_p xptr,u64 val)
{
*xptr = val;
barrier_release();
}
// qsleep -- do a quick sleep
inline_always void
qsleep(int bigflg)
{
struct timespec ts;
if (bigflg) {
ts.tv_sec = 1;
ts.tv_nsec = 0;
}
else {
ts.tv_sec = 0;
ts.tv_nsec = 1000;
}
nanosleep(&ts,NULL);
}
// incby_tidgate -- increment by using thread id gate
void
incby_tidgate(tid_t tid)
// tid -- unique id for accessing entity (e.g. thread id)
{
tid_t *gptr;
tid_t oval;
gptr = &tidgate;
// acquire the gate
while (1) {
oval = 0;
// test mode -- just do a nop instead of CAS to prove diagnostic
#if _USE_CASOFF_
*gptr = oval;
break;
#else
if (tidcas(gptr,oval,tid))
break;
#endif
++tskcur->tsk_casmiss;
}
#if _USE_INCBARRIER_
barrier_acquire();
#endif
// increment the values
xval += 1;
yval += 1;
#if _USE_INCBARRIER_
barrier_release();
#endif
// release the gate
// NOTE: CAS will always provide a barrier
#if _USE_CASPOST_ && (_USE_CASOFF_ == 0)
oval = tidcas(gptr,tid,0);
#else
tidstore(gptr,0);
#endif
}
// tskcld -- child task
void *
tskcld(void *arg)
{
tid_t tid;
tid_t oval;
u64 loopcur;
tskcur = arg;
tid = tskcur->tsk_tid;
// tell master thread that we're fully ready
while (1) {
oval = tidload(&readycnt);
if (tidcas(&readycnt,oval,oval + 1))
break;
}
// wait until we're given the starting gun
while (1) {
if (loadu64(&kickoff))
break;
qsleep(0);
}
// do the increments
for (loopcur = loopmax; loopcur > 0; --loopcur)
incby_tidgate(tid);
barrier();
// tell master thread that we're fully complete
while (1) {
oval = tidload(&donecnt);
if (tidcas(&donecnt,oval,oval + 1))
break;
}
return (void *) 0;
}
// tskstart -- start a child task
void
tskstart(tid_t tid)
{
pthread_attr_t attr;
pthread_t thr;
int err;
tsk_t *tsk;
tsk = tsklist + tid;
tsk->tsk_tid = tid;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,1);
err = pthread_create(&thr,&attr,tskcld,tsk);
pthread_attr_destroy(&attr);
if (err)
printf("tskstart: error -- err=%d\n",err);
}
// tskall -- run a single test
void
tskall(void)
{
tid_t tidcur;
tsk_t *tsk;
u64 incmax;
u64 val;
int err;
xval = 0;
yval = 0;
kickoff = 0;
readycnt = 0;
donecnt = 0;
tidgate = 0;
// prealloc the task blocks
tsklist = calloc(tidmax + 1,sizeof(tsk_t));
// start all tasks
PGR(" St");
for (tidcur = 1; tidcur <= tidmax; ++tidcur)
tskstart(tidcur);
// wait for all tasks to be fully ready
PGR(" Sw");
while (1) {
if (tidload(&readycnt) == tidmax)
break;
qsleep(1);
}
// the starting gun -- all tasks are waiting for this
PGR(" Ko");
storeu64(&kickoff,1);
// wait for all tasks to be fully done
PGR(" Wd");
while (1) {
if (tidload(&donecnt) == tidmax)
break;
qsleep(1);
}
PGR(" Done\n");
// check the final count
incmax = loopmax * tidmax;
// show per-task statistics
for (tidcur = 1; tidcur <= tidmax; ++tidcur) {
tsk = tsklist + tidcur;
printf("tskall: tsk=%llu tsk_casmiss=%d (%.3f%%)\n",
(u64) tidcur,tsk->tsk_casmiss,(double) tsk->tsk_casmiss / loopmax);
}
err = 0;
// check for failure
val = loadu64(&xval);
if (val != incmax) {
printf("tskall: xval fault -- xval=%lld incmax=%lld\n",val,incmax);
err = 1;
}
// check for failure
val = loadu64(&yval);
if (val != incmax) {
printf("tskall: yval fault -- yval=%lld incmax=%lld\n",val,incmax);
err = 1;
}
if (! err)
printf("tskall: SUCCESS\n");
free(tsklist);
}
// main -- master control
int
main(void)
{
loopmax = LOOPMAX;
tidmax = TIDMAX;
inctype = 0;
tskall();
return 0;
}
Here is the Makefile. Sorry for the extra boilerplate:
# caslock/Makefile -- make file for caslock
#
# options:
# LOOPMAX -- maximum loops / thread
#
# TIDMAX -- maximum number of threads
#
# BARRIER -- generate fence/barrier instructions
# 0 -- none
# 1 -- use mfence everywhere
# 2 -- use lfence for acquire, sfence for release
#
# CASOFF -- disable CAS to prove diagnostic works
# 0 -- normal mode
# 1 -- inhibit CAS during X/Y increment
#
# CASINLINE -- inline the CAS functions
# 0 -- do _not_ inline
# 1 -- inline them (WARNING: this fails for gcc but works for clang!)
#
# CASPOST -- increment gate release mode
# 0 -- use fenced store
# 1 -- use CAS store (NOTE: not really required)
#
# INCBARRIER -- use extra barriers around increments
# 0 -- rely on CAS for barrier
# 1 -- add extra safety barriers immediately before increment of X/Y
#
# TID64 -- use 64 bit thread "id"s
# 0 -- use 32 bit
# 1 -- use 64 bit
#
# VPTR -- use volatile pointers in function definitions
# 0 -- use ordinary pointers
# 1 -- use volatile pointers (NOTE: not really required)
ifndef _CASLOCK_MK_
_CASLOCK_MK_ = 1
OLIST += caslock.o
ifndef LOOPMAX
LOOPMAX = 1000000
endif
ifndef TIDMAX
TIDMAX = 20
endif
ifndef BARRIER
BARRIER = 0
endif
ifndef CASINLINE
CASINLINE = 0
endif
ifndef CASOFF
CASOFF = 0
endif
ifndef CASPOST
CASPOST = 0
endif
ifndef INCBARRIER
INCBARRIER = 0
endif
ifndef TID64
TID64 = 0
endif
ifndef VPTR
VPTR = 0
endif
CFLAGS += -DLOOPMAX=$(LOOPMAX)
CFLAGS += -DTIDMAX=$(TIDMAX)
CFLAGS += -D_USE_BARRIER_=$(BARRIER)
CFLAGS += -D_USE_CASINLINE_=$(CASINLINE)
CFLAGS += -D_USE_CASOFF_=$(CASOFF)
CFLAGS += -D_USE_CASPOST_=$(CASPOST)
CFLAGS += -D_USE_INCBARRIER_=$(INCBARRIER)
CFLAGS += -D_USE_TID64_=$(TID64)
CFLAGS += -D_USE_VPTR_=$(VPTR)
STDLIB += -lpthread
ALL += caslock
CLEAN += caslock
OVRPUB := 1
ifndef OVRTOP
OVRTOP := $(shell pwd)
OVRTOP := $(dir $(OVRTOP))
endif
endif
# ovrlib/rules.mk -- rules control
#
# options:
# GDB -- enable debug symbols
# 0 -- normal
# 1 -- use -O0 and define _USE_GDB_=1
#
# CLANG -- use clang instead of gcc
# 0 -- use gcc
# 1 -- use clang
#
# BNC -- enable benchmarks
# 0 -- normal mode
# 1 -- enable benchmarks for function enter/exit pairs
ifdef OVRPUB
ifndef SDIR
SDIR := $(shell pwd)
STAIL := $(notdir $(SDIR))
endif
ifndef GENTOP
GENTOP := $(dir $(SDIR))
endif
ifndef GENDIR
GENDIR := $(GENTOP)/$(STAIL)
endif
ifndef ODIR
ODIR := $(GENDIR)
endif
PROTOLST := true
PROTOGEN := #true
endif
ifndef SDIR
$(error rules: SDIR not defined)
endif
ifndef ODIR
$(error rules: ODIR not defined)
endif
ifndef GENDIR
$(error rules: GENDIR not defined)
endif
ifndef GENTOP
$(error rules: GENTOP not defined)
endif
ifndef _RULES_MK_
_RULES_MK_ = 1
CLEAN += *.proto
CLEAN += *.a
CLEAN += *.o
CLEAN += *.i
CLEAN += *.dis
CLEAN += *.TMP
QPROTO := $(shell $(PROTOLST) -i -l -O$(GENTOP) $(SDIR)/*.c $(CPROTO))
HDEP += $(QPROTO)
###VPATH += $(GENDIR)
###VPATH += $(SDIR)
ifdef INCLUDE_MK
-include $(INCLUDE_MK)
endif
ifdef GSYM
CFLAGS += -gdwarf-2
endif
ifdef GDB
CFLAGS += -gdwarf-2
DFLAGS += -D_USE_GDB_
else
CFLAGS += -O2
endif
ifndef ZPRT
DFLAGS += -D_USE_ZPRT_=0
endif
ifdef BNC
DFLAGS += -D_USE_BNC_=1
endif
ifdef CLANG
CC := clang
endif
DFLAGS += -I$(GENTOP)
DFLAGS += -I$(OVRTOP)
CFLAGS += -Wall -Werror
CFLAGS += -Wno-unknown-pragmas
CFLAGS += -Wempty-body
CFLAGS += -fno-diagnostics-color
# NOTE: we now need this to prevent inlining (enabled at -O2)
ifndef CLANG
CFLAGS += -fno-inline-small-functions
endif
# NOTE: we now need this to prevent inlining (enabled at -O3)
CFLAGS += -fno-inline-functions
CFLAGS += $(DFLAGS)
endif
all: $(PREP) proto $(ALL)
%.o: %.c $(HDEP)
$(CC) $(CFLAGS) -c -o $*.o $<
%.i: %.c
cpp $(DFLAGS) -P $*.c > $*.i
%.s: %.c
$(CC) $(CFLAGS) -S -o $*.s $<
# build a library (type (2) build)
$(LIBNAME):: $(OLIST)
ar rv $@ $(OLIST)
.PHONY: proto
proto::
$(PROTOGEN) -i -v -O$(GENTOP) $(SDIR)/*.c $(CPROTO)
.PHONY: clean
clean::
rm -f $(CLEAN)
.PHONY: help
help::
egrep '^#' Makefile
caslock:: $(OLIST) $(LIBLIST) $(STDLIB)
$(CC) $(CFLAGS) -o caslock $(OLIST) $(LIBLIST) $(STDLIB)
NOTE: I may have blown some of the asm constraints because when doing the CAS function as an inline, compiling with gcc produces incorrect results. However, clang works fine with inline. So, the default is that the CAS function is not inline. For consistency, I didn't use a different default for gcc/clang, even though I could.
Here's the disassembly of the relevant function with inline as built by gcc (this fails):
00000000004009c0 <incby_tidgate>:
4009c0: 31 c0 xor %eax,%eax
4009c2: f0 0f b1 3d 3a 1a 20 lock cmpxchg %edi,0x201a3a(%rip) # 602404 <tidgate>
4009c9: 00
4009ca: 0f 94 c2 sete %dl
4009cd: 84 d2 test %dl,%dl
4009cf: 75 23 jne 4009f4 <L01>
4009d1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
4009d8:L00 64 48 8b 14 25 f8 ff mov %fs:0xfffffffffffffff8,%rdx
4009df: ff ff
4009e1: 83 42 04 01 addl $0x1,0x4(%rdx)
4009e5: f0 0f b1 3d 17 1a 20 lock cmpxchg %edi,0x201a17(%rip) # 602404 <tidgate>
4009ec: 00
4009ed: 0f 94 c2 sete %dl
4009f0: 84 d2 test %dl,%dl
4009f2: 74 e4 je 4009d8 <L00>
4009f4:L01 48 83 05 dc 17 20 00 addq $0x1,0x2017dc(%rip) # 6021d8 <ary+0xf8>
4009fb: 01
4009fc: 48 83 05 94 19 20 00 addq $0x1,0x201994(%rip) # 602398 <ary+0x2b8>
400a03: 01
400a04: c7 05 f6 19 20 00 00 movl $0x0,0x2019f6(%rip) # 602404 <tidgate>
400a0b: 00 00 00
400a0e: c3 retq
Here's the disassembly of the relevant function with inline as built by clang (this succeeds):
0000000000400990 <incby_tidgate>:
400990: 31 c0 xor %eax,%eax
400992: f0 0f b1 3d 3a 1a 20 lock cmpxchg %edi,0x201a3a(%rip) # 6023d4 <tidgate>
400999: 00
40099a: 0f 94 c0 sete %al
40099d: eb 1a jmp 4009b9 <L01>
40099f: 90 nop
4009a0:L00 64 48 8b 04 25 f8 ff mov %fs:0xfffffffffffffff8,%rax
4009a7: ff ff
4009a9: ff 40 04 incl 0x4(%rax)
4009ac: 31 c0 xor %eax,%eax
4009ae: f0 0f b1 3d 1e 1a 20 lock cmpxchg %edi,0x201a1e(%rip) # 6023d4 <tidgate>
4009b5: 00
4009b6: 0f 94 c0 sete %al
4009b9:L01 84 c0 test %al,%al
4009bb: 74 e3 je 4009a0 <L00>
4009bd: 48 ff 05 e4 17 20 00 incq 0x2017e4(%rip) # 6021a8 <ary+0xf8>
4009c4: 48 ff 05 9d 19 20 00 incq 0x20199d(%rip) # 602368 <ary+0x2b8>
4009cb: c7 05 ff 19 20 00 00 movl $0x0,0x2019ff(%rip) # 6023d4 <tidgate>
4009d2: 00 00 00
4009d5: c3 retq
4009d6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4009dd: 00 00 00
I'm new to GCC inline assembly.
Why does this code output "1" instead of "5"?
code:
#include <stdio.h>
static inline int atomic_add(volatile int *mem, int add)
{
asm volatile(
"lock xadd %0, (%1);"
: "=a"(add)
: "r"(mem), "a"(add)
: "memory"
);
return add;
}
int main(void)
{
int a=1;
int b=5;
printf ( "%d\n", atomic_add(&a, b) );
return 0;
}
run:
$ ./a.out
1 # why not 5?
many thx. :)
The variable add starts out with the value 5 and *mem starts out with 1.
The lock xadd %0, (%1) assembly template gets compiled by gcc to:
lock xadd %eax, (%edx)
GCC has to use eax because your constraints indicate that %0 should use %eax. Your constraint also ties %eax to the variable add. I believe that GCC is free to use whatever register it wants for the other operand (in my test it happened to use %edx).
So:
%eax starts with 5, and %edx points to a memory location that has the value 1
the xadd instruction exchanges the two operands and places the sum in the destination, so after executing, %eax holds 1 and the memory pointed to by %edx contains 6
Your constraint also indicates that %eax should be stored back into the variable add, so add gets 1. And that is what is returned from the function.
In x86, XADD is the Exchange and Add instruction. So the register holding the add parameter becomes 1 after the lock xadd instruction; add is then returned by atomic_add(), which is why you see 1 printed instead of 5.
For atomic_add() you probably want to just use lock add instead of lock xadd:
#include <stdio.h>
static inline int atomic_add(volatile int *mem, int add)
{
asm volatile(
"lock add %0, (%1);"
: "=a"(add)
: "r"(mem), "a"(add)
: "memory"
);
return add;
}
int main(void)
{
int a=1;
int b=5;
printf ( "%d\n", atomic_add(&a, b) );
return 0;
}
And this prints 5 like you expect:
$ ./a.out
5
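If what you actually do want is the fetch-and-add behavior of xadd (return the old value, leave the sum in memory), the usual shape is a read-write register operand for the addend plus a read-write memory operand for the target. A sketch (names are mine):
static inline int atomic_fetch_add_asm(volatile int *mem, int add)
{
    asm volatile(
        "lock xadd %0, %1"
        : "+r"(add), "+m"(*mem)
        :
        : "memory");
    return add;  /* the old value of *mem; *mem now holds old + add */
}
With a == 1, atomic_fetch_add_asm(&a, 5) returns 1 and leaves a == 6.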
I have compiled gcc 4.6.0 for MMIX according to http://www.bitrange.com/mmix/install.html. When I try their simple hello world, or any other call to printf with more than the first string, only the first string gets printed. E.g.
lada#:~/f/c> cat hellommix.c
#include <stdio.h>
#include <stdlib.h>
int main (int argc, char **argv)
{
printf ("hello, %s\n", argc > 1 ? argv[1] : "world");
exit (0);
}
lada#:~/f/c> mmix-gcc hellommix.c
lada#:~/f/c> mmix ./a.out "abc"
hello, lada#:~/f/c>
The generated assembly looks like this:
# 1 "hellommix.c"
! mmixal:= 8H LOC Data_Section
.text ! mmixal:= 9H LOC 8B
.section .rodata
.p2align 2
LOC #+(4-#)&3
LC:0 IS #
BYTE "world",#0
.p2align 2
LOC #+(4-#)&3
LC:1 IS #
BYTE "hello, %s",#a,#0
.text ! mmixal:= 9H LOC 8B
.p2align 2
LOC #+(4-#)&3
.global main
main IS #
SUBU $254,$254,24
STOU $253,$254,16
ADDU $253,$254,24
GET $2,rJ
SET $3,$0
SUBU $0,$253,24
STOU $1,$0,0
SUBU $0,$253,12
STTU $3,$0,0
SUBU $0,$253,12
LDT $0,$0,0
SLU $0,$0,32
SR $0,$0,32
CMP $0,$0,1
BNP $0,L:2
SUBU $0,$253,24
LDO $0,$0,0
LDO $0,$0,8
JMP L:3
L:2 IS #
GETA $0,LC:0
L:3 IS #
GETA $5,LC:1
SET $6,$0
PUSHJ $4,printf
PUT rJ,$2
SETL $5,0
PUSHJ $4,exit
.data ! mmixal:= 8H LOC 9B
Try these:
Put an fflush(stdout); before exiting (though normally, POSIX's man 3 exit says that all buffers are flushed; maybe it's something MMIX-specific).
Dump all arguments, just to see what's there:
for (int x=0; x!=argc; ++x) {
printf ("arg %d: \"%s\"\n", x, argv[x]);
}
Add setbuf(stdout, NULL); right after the variable declarations, at the top of main before any output. This disables stdout buffering, so everything printf writes goes out immediately.
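Applied to the hello-world program from the question, that looks like this (same source, plus the setbuf call):
#include <stdio.h>
#include <stdlib.h>
int main (int argc, char **argv)
{
    setbuf(stdout, NULL); /* make stdout unbuffered so output appears immediately */
    printf("hello, %s\n", argc > 1 ? argv[1] : "world");
    exit(0);
}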