I want to calculate x * y % 998244353 using GCC inline asm, so I wrote the following code:
// Intended to return x * y % 998244353: `mull` multiplies EAX by EBX, `divl`
// then divides by ECX, and the value left in EDX is returned through t.
// NOTE(review): "eax" and "edx" are listed as clobbers while also being bound
// to operands ("a"(x) and "=d"(t)); GCC documents that a clobber may not
// overlap an operand, which matches the "impossible constraints" error quoted
// below.
// NOTE(review): both instructions overwrite EAX, yet x is declared input-only;
// presumably it should be an in/out operand ("+a") — confirm against the GCC
// Extended Asm documentation.
int Modmul(int x, int y)
{
int t;
__asm__ __volatile__ ("mull %%ebx\n\tdivl %%ecx\n\t" : "=d"(t) : "a"(x), "b"(y), "c"(998244353) : "eax", "edx", "memory");
return t;
}
However, it fails to compile with the error "'asm' operand has impossible constraints". If I remove the clobber list (: "eax", "edx", "memory"), it compiles but gives the wrong answer. Why?
BTW, I replaced ebx with r0 and ecx with r1:
// Variant that refers to operands by number instead of naming EBX/ECX.
// NOTE(review): operands are numbered with outputs first, so %0 is the
// "=d"(t) output and %1 is the "a"(x) input; the two "r" operands would be
// %2 and %3. As written the template presumably expands to `mull %edx` /
// `divl %eax`, a likely source of the runtime error mentioned below (divl
// faults on quotient overflow or a zero divisor) — confirm.
// NOTE(review): EAX is modified by both instructions but is declared
// input-only, and no clobbers are listed.
int Modmul(int x, int y)
{
int t;
__asm__ __volatile__ ("mull %0\n\tdivl %1\n\t" : "=d"(t) : "a"(x), "r"(y), "r"(998244353));
return t;
}
It can pass the compilation, but it got runtime error, why?
Related
This question already has answers here:
Referencing memory operands in .intel_syntax GNU C inline assembly
(1 answer)
Calling printf in extended inline ASM
(1 answer)
Is this assembly function call safe/complete?
(2 answers)
Calling a function in gcc inline assembly
(1 answer)
Closed 1 year ago.
I am currently playing around with inline assembly and I've gotten a bit stuck. I have managed to call a function with no parameters, but I get stuck when it comes to calling one with two parameters.
My code below should call a function (add) that adds two predefined numbers together, and then call a second one (add_parameter) with two parameters which should be added together.
#include <stdio.h>
// File-scope integers; main below refers to them from inline asm as
// `offset c` and `offset d`.
int c = 4;
int d = 5;
/* Prints the sum of the constants 1 and 2 as "Result: <sum>" plus a newline. */
void add() {
    printf("Result: %d\n", 1 + 2);
}
/* Prints a + b as "Result: <sum>" plus a newline. */
void add_parameter(int a, int b) {
    printf("Result: %d\n", a + b);
}
// Calls add() from inline asm, then attempts to push two arguments and call
// add_parameter() the same way.
// NOTE(review): `mov eax, offset c` is Intel operand syntax; unless the file
// is built with -masm=intel the assembler reads it as AT&T, which presumably
// produces the "too many memory references for `mov'" errors reported below
// — confirm.
// NOTE(review): `offset c` / `offset d` load the addresses of c and d rather
// than their values, and the push/pop pairs move the stack pointer across
// separate asm statements without informing the compiler.
int main()
{
__asm__ __volatile__ ( "call add" );
// Single-statement attempt, kept for reference. The adjacent string literals
// carry no "\n" separators, so they would be concatenated into one line.
// __asm__ __volatile__(
// "mov eax, offset c"
// "push eax"
// "mov eax, offset d"
// "push eax"
// "call add_parameter"
// "pop ebx"
// "pop ebx"
// );
// One-instruction-per-statement attempt.
__asm__ __volatile__ ( "mov eax, offset c" );
__asm__ __volatile__ ( "push eax" );
__asm__ __volatile__ ( "mov eax, offset d" );
__asm__ __volatile__ ( "push eax" );
__asm__ __volatile__ ( "call add_parameter" );
// The two pops discard the pushed arguments.
__asm__ __volatile__ ( "pop ebx" );
__asm__ __volatile__ ( "pop ebx" );
return 0;
}
My problem at the moment is that when I try to compile the program I get an error that says
p_function.c:31: Error: too many memory references for `mov'
p_function.c:33: Error: too many memory references for `mov'
In my program I've tried two approaches: one with a single asm statement containing all of the assembly code, and one where I split each line into its own asm statement.
Unfortunately I am not sure which one of these approaches is correct, let alone the most effective, but I get the same error regardless of which approach I use.
How can I fix this problem and call the function add_parameter?
Thanks
Are the following 16 byte atomic operations correctly implemented? Are there any better alternatives?
/*
 * Two pointer-sized words, `low` first and `high` second, used as the operand
 * type of the 16-byte atomic helpers that follow.
 */
typedef struct {
    uintptr_t low, high;
} uint128_atomic;
// Reads the 16-byte value at *atomic by zeroing EAX, EBX, ECX and EDX and then
// issuing `lock cmpxchg16b` on it; returns whatever the "=A" output receives.
// NOTE(review): the xor instructions name 32-bit registers while the clobber
// list names rbx/rcx — presumably relying on 32-bit writes zero-extending on
// x86-64; confirm.
// NOTE(review): *atomic is passed as an "m" input although cmpxchg16b is a
// read-modify-write instruction. A later revision in this file passes the
// zeros through "b"/"c" operands instead of xor-ing inside the template.
uint128_atomic load_relaxed(uint128_atomic const *atomic)
{
uint128_atomic ret;
asm volatile("xor %%eax, %%eax\n"
"xor %%ebx, %%ebx\n"
"xor %%ecx, %%ecx\n"
"xor %%edx, %%edx\n"
"lock; cmpxchg16b %1"
: "=A"(ret)
: "m"(*atomic)
: "cc", "rbx", "rcx");
return ret;
}
// Attempts to replace *atomic with `desired` when it equals *expected, using
// `lock cmpxchg16b`; returns the zero-flag result captured by `setz`.
// NOTE(review): the "+m" operand names atomic->ui, but the uint128_atomic
// shown above only declares `low` and `high` — confirm the intended operand
// (presumably *atomic).
// NOTE(review): e.low / e.high are input-only ("a", "d"); the answer further
// down says these must be "+a" / "+d". *expected is never written back here.
bool cmpexch_weak_relaxed(
uint128_atomic *atomic,
uint128_atomic *expected,
uint128_atomic desired)
{
bool matched;
uint128_atomic e = *expected;
asm volatile("lock; cmpxchg16b %1\n"
"setz %0"
: "=q"(matched), "+m"(atomic->ui)
: "a"(e.low), "d"(e.high), "b"(desired.low), "c"(desired.high)
: "cc");
return matched;
}
// Stores val into *atomic with a single `lock cmpxchg16b`, using a plain read
// of *atomic as the comparison value.
// NOTE(review): there is no retry and the outcome of the comparison is not
// checked, so a failed compare is not detected here; the later revision in
// this file loops on cmpexch_weak_relaxed instead.
// NOTE(review): "a"/"d" are input-only; the answer further down asks for
// "+a"/"+d" in this function.
void store_relaxed(uint128_atomic *atomic, uint128_atomic val)
{
uint128_atomic old = *atomic;
asm volatile("lock; cmpxchg16b %0"
: "+m"(*atomic)
: "a"(old.low), "d"(old.high), "b"(val.low), "c"(val.high)
: "cc");
}
For a full working example, check out:
https://godbolt.org/g/CemfSg
Updated implementation can be found here: https://godbolt.org/g/vGNQG5
I came up with the following implementation after applying all the suggestions from @PeterCordes, @David Wohlferd and @prl. Thanks a lot!
/*
 * 16-byte-aligned pair of volatile 64-bit words, `low` first and `high`
 * second; operand type of the cmpxchg16b-based helpers that follow.
 */
typedef struct _uint128_atomic uint128_atomic;
struct __attribute__((aligned(16))) _uint128_atomic {
    volatile uint64_t low;
    volatile uint64_t high;
};
/*
 * Compare-and-exchange on a 16-byte value using `lock cmpxchg16b`.
 *
 * atomic:   object to update (read and written by the instruction).
 * expected: value *atomic is compared against; when the comparison fails it
 *           is overwritten with the value the instruction left in RDX:RAX.
 * desired:  value stored into *atomic when the comparison succeeds.
 *
 * Returns true when the exchange happened, false otherwise.
 */
bool
cmpexch_weak_relaxed(
    uint128_atomic *atomic,
    uint128_atomic *expected,
    uint128_atomic desired)
{
    bool matched;
    uint128_atomic e = *expected;

    /*
     * "=@ccz" is GCC's flag-output constraint: `matched` receives ZF directly,
     * so no setz is needed. (A '#' in that position would make GCC ignore the
     * rest of the constraint.) `__asm__` is used instead of `asm` so the code
     * also builds under strict -std= modes.
     */
    __asm__ volatile("lock cmpxchg16b %1"
        : "=@ccz"(matched), "+m"(*atomic), "+a"(e.low), "+d"(e.high)
        : "b"(desired.low), "c"(desired.high)
        : "cc");

    /* On failure RDX:RAX (bound to e) was updated; report it to the caller. */
    if (!matched)
        *expected = e;
    return matched;
}
/*
 * Reads the 16-byte value at *atomic with `lock cmpxchg16b`, comparing against
 * zero and offering zero as the replacement, and returns the value left in
 * the "+A" operand.
 *
 * NOTE(review): cmpxchg16b is a read-modify-write instruction, yet *atomic is
 * declared as an "m" input through a const pointer; presumably this faults if
 * the object lives in read-only memory — confirm against the Intel SDM.
 */
uint128_atomic
load_relaxed(uint128_atomic const *atomic)
{
    uint128_atomic ret = {0, 0};

    /*
     * The zeros for RBX/RCX are typed as 64-bit so the whole register is
     * defined; an `int` operand only specifies the low 32 bits. `__asm__` is
     * used instead of `asm` so the code also builds under strict -std= modes.
     */
    __asm__ volatile("lock cmpxchg16b %1"
        : "+A"(ret)
        : "m"(*atomic), "b"((uint64_t)0), "c"((uint64_t)0)
        : "cc");
    return ret;
}
/*
 * Stores val into *atomic by retrying cmpexch_weak_relaxed until it reports
 * success. The comparison value starts as a plain read of *atomic and is
 * passed by address to each attempt.
 */
void
store_relaxed(uint128_atomic *atomic, uint128_atomic val)
{
    uint128_atomic seen = *atomic;

    for (;;) {
        if (cmpexch_weak_relaxed(atomic, &seen, val)) {
            break;
        }
    }
}
Please keep in mind that the implementation is GCC-specific and will not work on clang. Clang's implementation of GCC's inline assembly is suboptimal at best, and garbage at worst.
The GCC implementation can also be found on Godbolt's Compiler Explorer here.
A suboptimal, but working, clang implementation can be found here.
Why don't you just use the C11 atomic intrinsics?
#include <stdatomic.h>
/*
 * Relaxed atomic load of a 128-bit object via the C11 <stdatomic.h> interface.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so any call the compiler chose not to inline would
 * fail to link.
 */
static inline __uint128_t load_relaxed(_Atomic __uint128_t *obj)
{
    return atomic_load_explicit(obj, memory_order_relaxed);
}
/*
 * Relaxed weak compare-and-exchange on a 128-bit object via <stdatomic.h>.
 *
 * obj:      atomic object to update.
 * expected: comparison value, passed by address to
 *           atomic_compare_exchange_weak_explicit.
 * desired:  replacement value.
 *
 * Returns the result of atomic_compare_exchange_weak_explicit.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so any call the compiler chose not to inline would
 * fail to link.
 */
static inline _Bool cmpexch_weak_relaxed(_Atomic __uint128_t *obj,
                                         __uint128_t *expected,
                                         __uint128_t desired)
{
    return atomic_compare_exchange_weak_explicit(obj, expected, desired,
                                                 memory_order_relaxed,
                                                 memory_order_relaxed);
}
This compiles to more-or-less the assembly you wrote, using clang 4.0.1 and -march=native. But, unlike what you wrote, the compiler actually understands what's going on, so code generation around these functions will be correct. There is, as far as I know, no way to annotate a GNU-style assembly insert to tell the compiler that it has the semantics of an atomic operation.
No, you need "+a" and "+d" in cmpexch_weak_relaxed and store_relaxed.
Other than that, I don't see any problems. (I compared to my own implementations in working software.)
As far as improvements, I suggest
/*
 * Reads the 16-byte value at *atomic with `lock cmpxchg16b`, comparing against
 * zero and offering zero as the replacement, and returns the value left in
 * the "+A" operand. The zeros are supplied through operands so the compiler
 * sets up the registers, rather than xor-ing them inside the template.
 *
 * NOTE(review): cmpxchg16b is a read-modify-write instruction, yet *atomic is
 * declared as an "m" input through a const pointer — confirm this is
 * acceptable for the intended callers.
 */
uint128_atomic load_relaxed(uint128_atomic const *atomic)
{
    uint128_atomic ret = { 0, 0 };

    /*
     * The zeros for RBX/RCX are pointer-width so the whole register is
     * defined; an `int` operand only specifies the low 32 bits. `__asm__` is
     * used instead of `asm` so the code also builds under strict -std= modes.
     */
    __asm__ volatile("lock; cmpxchg16b %1"
        : "+A"(ret)
        : "m"(*atomic), "b"((uintptr_t)0), "c"((uintptr_t)0)
        : "cc");
    return ret;
}
(I see that David Wohlferd also made this suggestion in a comment.)
In this code:
// Increments i and the operand bound to a[i] inside one asm statement, then
// prints a[i].
// NOTE(review): the "+r"(a[i]) operand is presumably bound while i is still 0,
// so the asm increments a[0] (and i); the printf then reads a[1], which the
// asm never touched — consistent with the output of 2 described below.
int a[2]={5,2},i=0;
asm volatile
(
"incl %1\n"
"incl %0"
:"+r"(a[i]),"+r"(i)
:
:
);
printf("%d\n",a[i]);
I'm trying to increase a[1] by 1 (for a result of 2+1=3) but the output shows 2, which means it hasn't changed. What's the problem and how can I fix it?
I want to write something like this:
#include <stdint.h>
// Saves RSP into the local `rsp`, overwrites RSP with 0x13, moves x into RDX,
// runs `mulx` on y into z and w, restores RSP, and returns z + w.
// NOTE(review): %[rsp] is written by the first mov but declared as an "m"
// input; presumably it should be an output ("=m") — confirm.
// NOTE(review): while RSP holds 0x13, any memory operand the compiler
// addressed relative to RSP would no longer point at its slot — this appears
// to be the situation the text below describes.
// NOTE(review): declared `inline` without `static`/`extern`, which in C
// provides no external definition — confirm the build relies on inlining.
inline uint64_t with_rsp(uint64_t x, uint64_t y) {
uint64_t z, w;
uint64_t rsp;
asm ("mov %%rsp, %[rsp]\t\n"
"mov $0x13, %%rsp\t\n"
"mov %[x], %%rdx\t\n"
"mulx %[y], %[z], %[w]\t\n"
"mov %[rsp], %%rsp\t\n"
: [z] "=&r" (z), [w] "=&r" (w)
: [x] "r" (x), [y] "r" (y), [rsp] "m" (rsp)
: "rdx"
);
return z + w;
}
// Saves RBP into the local `rbp`, overwrites RBP with 0x13, moves x into RDX,
// runs `mulx` on y into z and w, restores RBP, and returns z + w.
// NOTE(review): %[rbp] is written by the first mov but declared as an "m"
// input; presumably it should be an output ("=m") — confirm.
// NOTE(review): while RBP holds 0x13, any memory operand the compiler
// addressed relative to RBP would no longer point at its slot — this appears
// to be the situation the text below describes.
// NOTE(review): declared `inline` without `static`/`extern`, which in C
// provides no external definition — confirm the build relies on inlining.
inline uint64_t with_rbp(uint64_t x, uint64_t y) {
uint64_t z, w;
uint64_t rbp;
asm ("mov %%rbp, %[rbp]\t\n"
"mov $0x13, %%rbp\t\n"
"mov %[x], %%rdx\t\n"
"mulx %[y], %[z], %[w]\t\n"
"mov %[rbp], %%rbp\t\n"
: [z] "=&r" (z), [w] "=&r" (w)
: [x] "r" (x), [y] "r" (y), [rbp] "m" (rbp)
: "rdx"
);
return z + w;
}
/*
 * Multiplies 15 by 3 through with_rsp() when inline_asm_uses_rbp() is true,
 * otherwise through with_rbp(), and returns the result as the exit status.
 */
int main() {
    const uint64_t lhs = 15;
    const uint64_t rhs = 3;

    if (inline_asm_uses_rbp()) {
        return with_rsp(lhs, rhs);
    }
    return with_rbp(lhs, rhs);
}
Ideally, the if statement should compile away at compile-time (but I don't think I can do this with preprocessor macros, because those get evaluated before the code is assembled). So I'm fine with needing some sort of jump to get it to work, though I'd prefer to not need that.
The reason I need this is that I have some inline assembly that needs to be able to use 15 registers, plus some memory locations on the stack, and gcc is choosing rsp-based offsets in some locations where the function is inlined, and it's choosing rbp-based offsets in other locations. (A separate assembly module isn't a good match for this because I'd like to avoid the overhead of a function call.)
I am trying to refer to register literals; I need this because I want to use instructions like mulx which require one of their inputs to be in rdx.
I've tried this:
#include <stdint.h>
// Moves x into RDX and runs `mulx` on y into z and w; returns z + w.
// NOTE(review): under the default AT&T syntax a bare `rdx` is presumably
// parsed as a symbol name rather than a register, which matches the
// "undefined reference to `rdx'" link error quoted below; the text notes it
// only works with -masm=intel.
int main() {
uint64_t x = 15, y = 3, z, w;
asm ("mov %[x], rdx\t\n"
"mulx %[y], %[z], %[w]\t\n"
: [z] "=&r" (z), [w] "=&r" (w)
: [x] "r" (x), [y] "r" (y)
: "rdx"
);
return z + w;
}
which only works with -masm=intel and otherwise gives
$ gcc foo.c
/tmp/ccGealWM.o: In function `main':
foo.c:(.text+0x20): undefined reference to `rdx'
collect2: error: ld returned 1 exit status
and I've tried
#include <stdint.h>
// Same as the previous attempt but with the register written as `%rdx`.
// NOTE(review): in an asm statement that has operands, a single % starts an
// operand reference, so a literal register name needs a doubled percent sign
// (%%rdx, as the with_rsp/with_rbp snippets above do); this matches the
// "operand number missing after %-letter" error quoted below.
int main() {
uint64_t x = 15, y = 3, z, w;
asm ("mov %[x], %rdx\t\n"
"mulx %[y], %[z], %[w]\t\n"
: [z] "=&r" (z), [w] "=&r" (w)
: [x] "r" (x), [y] "r" (y)
: "rdx"
);
return z + w;
}
which gives
$ gcc foo.c
foo.c: In function ‘main’:
foo.c:5:7: error: invalid 'asm': operand number missing after %-letter
asm ("mov %[x], %rdx\t\n"
^