I originally had the following C code:
volatile register uint16_t counter asm("r12");
__uint24 getCounter() {
__uint24 res = counter;
res = (res << 8) | TCNT0;
return res;
}
This function runs in some hot places and is inlined, and I'm trying to cram a lot of stuff into an ATtiny13, so it came time to optimize it.
That function compiles to:
getCounter:
movw r24,r12
ldi r26,0
clr r22
mov r23,r24
mov r24,r25
in r25,0x32
or r22,r25
ret
I came up with this assembly:
inline __uint24 getCounter() {
//__uint24 res = counter;
//res = (res << 8) | TCNT0;
uint32_t result;
asm(
"in %A[result],0x32" "\n\t"
"movw %C[result],%[counter]" "\n\t"
"mov %B[result],%C[result]" "\n\t"
"mov %C[result],%D[result]" "\n\t"
: [result] "=r" (result)
: [counter] "r" (counter)
:
);
return (__uint24) result;
}
The reason for uint32_t is to "allocate" the fourth consecutive register and for the compiler to understand it is clobbered (since I cannot do something like "%D[result]" in the clobber list)
Is my assembly correct? From my testing it seems like it is.
Is there a way to allow the compiler to optimize getCounter() better so there's not need for confusing assembly?
Is there a better way to do this in assembly?
Related
This question already has answers here:
Can I modify input operands in gcc inline assembly
(1 answer)
Count the number of set bits in a 32-bit integer
(65 answers)
Inline assembly reusing same register when it shouldn't [duplicate]
(2 answers)
Closed 1 year ago.
A simple implementation of the popcnt function in C:
int popcnt(uint64_t x) {
int s = 0;
for (int i = 0; i < 64; i++) {
if ((x << i) & 1 == 1) s++;
}
return s;
}
I am using inline assembly language (x86-64) to implement popcnt,
int asm_popcnt(uint64_t x) {
int i = 0, sum = 0;
uint64_t tmp = 0;
asm ( ".Pct: \n\t"
"movq %[xx], %[tm]\n\t"
"andq $0x1, %[tm]\n\t"
"test %[tm], %[tm]\n\t"
"je .Grt \n\t"
"incl %[ss] \n\t"
".Grt: \n\t"
"shrq $0x1, %[xx]\n\t"
"incl %[ii] \n\t"
"cmpl $0x3f, %[ii]\n\t"
"jle .Pct \n\t"
: [ss] "+r"(sum)
: [xx] "r"(x) , [ii] "r"(i),
[tm] "r"(tmp)
);
return sum;
}
but received WA (online judge)
I tested all powers of 2 (from 0x1 to (0x1 << 63)) on my computer and it returned 1, which indicates that my asm_popcnt can identify all bits of any 64_bits integer since all other integers are just combinations of 0x1, 0x2, 0x4, etc.(for example, 0x11a = 0x2 "or" 0x8 "or" 0x10 "or" 0x100). Therefore there shouldn't be cases for OJ to return a "WA". Is there anything wrong in my code? The jump instruction?
I'm trying to write a simple function using in-line assembly and use it in a C program
The mem_io_read is a function that reads a memory address bypassing cache (event though the address is located in a cacheable memory region). It's for aarch64 machine.
static inline int mem_io_read(unsigned long paddr)
{
unsigned long val;
register pa;
__asm__ __volatile__("mov %0, %1\n\t" : "=r" (pa) : "r"(paddr)); <-- move paddr to a register pa
__asm__ __volatile__("ldnp %0, [%1]\n\t" : "=r" (val) : "r" (pa)); <-- load data from addr in pa
return val;
}
main()
{
...
uint32_t SCP_WR_ADDR = &scp_wait; // where test1val was located. //x06000000;
uint32_t chk_scp_rd_data = 0;
// Send flag for proceeding SCP test
(*(volatile uint32_t *)(SCP_WR_ADDR)) = 0x87654321; <-- send signal to the other processor (scp)
// Receives flag from SCP
while(chk_scp_rd_data != 0x12345678) <--- read back until the value is changed (reverse order)
{
chk_scp_rd_data = mem_io_read(SCP_WR_ADDR);
}
}
When I compile this using gcc, I get this error
/tmp/ccCpQGc5.s: Assembler messages:
/tmp/ccCpQGc5.s:26: Error: operand 2 must be an integer register -- `ldnp x0,[x0]'
I can't figure out what is wrong here. Please help.
ADD : from Peter Cordes's comment, I changed it to this one. It is compiled ok.
static int inline mem_io_read(unsigned long paddr)
{
int val, val1;
__asm__ __volatile__("ldnp %0, %1, [%2]\n\t" : "=r" (val), "=r" (val1) : "r" (paddr) : "memory");
return val;
}
I have an array with 32bit values (nativeParameters with length nativeParameterCount) and a pointer to the function (void* to a cdecl function, here method->nativeFunction) thats supposed to be called. Now I'm trying to do this:
// Push parameters for call
if (nativeParameterCount != 0) {
uint32_t count = 0;
pushParameter:
uint32_t value = nativeParameters[nativeParameterCount - count - 1];
asm("push %0" : : "r"(value));
if (++count < nativeParameterCount) goto pushParameter;
}
// Call method
asm("call *%0" : : "r"(method->nativeFunction));
// Return value
uint32_t eax;
uint32_t edx;
asm("push %eax");
asm("push %edx");
asm("pop %0" : "=r"(edx));
asm("pop %0" : "=r"(eax));
uint64_t returnValue = eax;
// If the typesize of the methods return type is >4 bytes, or with EDX
Type returnType = method->returnType.type;
if (TYPE_SIZES[returnType] > 4) {
returnValue |= (((uint64_t) edx) << 32);
}
// Clean stack
asm("add %%esp, %0" : : "r"(parameterByteSize));
Is this approach suitable to perform a native call (assuming that all target functions accept only 32bit values as parameters)? Can I be sure that it doesn't destroy the stack or mess with registers, or somehow else influence the normal flow? Also, are there other ways of doing this?
Instead of doing this manually yourself, you might want to use the dyncall libary which does all this handling for you.
I want to call a system call (prctl) in assembly inline and retrieve the result of the system call. But I cannot make it work.
This is the code I am using:
int install_filter(void)
{
long int res =-1;
void *prg_ptr = NULL;
struct sock_filter filter[] = {
BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
/* If a trap is not generate, the application is killed */
BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
};
struct sock_fprog prog = {
.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
.filter = filter,
};
prg_ptr = &prog;
no_permis();
__asm__ (
"mov %1, %%rdx\n"
"mov $0x2, %%rsi \n"
"mov $0x16, %%rdi \n"
"mov $0x9d, %%rax\n"
"syscall\n"
"mov %%rax, %0\n"
: "=r"(res)
: "r"(prg_ptr)
: "%rdx", "%rsi", "%rdi", "%rax"
);
if ( res < 0 ){
perror("prctl");
exit(EXIT_FAILURE);
}
return 0;
}
The address of the filter should be the input (prg_ptr) and I want to save the result in res.
Can you help me?
For inline assembly, you don't use movs like this unless you have to, and even then you have to do ugly shiffling. That's because you have no idea what registers arguments arrive in. Instead, you should use:
__asm__ __volatile__ ("syscall" : "=a"(res) : "d"(prg_ptr), "S"(0x2), "D"(0x16), "a"(0x9d) : "memory");
I also added __volatile__, which you should use for any asm with side-effects other than its output, and a memory clobber (memory barrier), which you should use for any asm with side-effects on memory or for which reordering it with respect to memory accesses would be invalid. It's good practice to always use both of these for syscalls unless you know you don't need them.
If you're still having problems, use strace to observe the syscall attempt and see what's going wrong.
I need to find the architecture type of a CPU. I do not have access to /proc/cpuinfo, as the machine is running syslinux. I know there is a way to do it with inline ASM, however I believe my syntax is incorrect as my variable iedx is not being set properly.
I'm drudging along with ASM, and by no means an expert. If anyone has any tips or can point me in the right direction, I would be much obliged.
static int is64Bit(void) {
int iedx = 0;
asm("mov %eax, 0x80000001");
asm("cpuid");
asm("mov %0, %%eax" : : "a" (iedx));
if ((iedx) && (1 << 29))
{
return 1;
}
return 0;
}
How many bugs can you fit in so few lines ;)
Try
static int is64bit(void) {
int iedx = 0;
asm volatile ("movl $0x80000001, %%eax\n"
"cpuid\n"
: "=d"(iedx)
: /* No Inputs */
: "eax", "ebx", "ecx"
);
if(iedx & (1 << 29))
{
return 1;
}
return 0;
}