Integer overflow not consistent - c

Pardon me if this question has been posed before. I looked for answers to similar questions, but I'm still puzzled with my problem. So I will shoot the question anyway.
I'm using a C library called libexif for image data. I run my application (which uses this library) both on my Linux desktop and my MIPS board.
For a particular image file when I try to fetch the created time, I was getting an error/invalid value. On debugging further I saw that for this particular image file, I was not getting the tag (EXIF_TAG_DATE_TIME) as expected.
This library has several utility functions. Most functions are structured like below
/*
 * Read a 16-bit value from the two bytes at buf, combining them in the
 * byte order selected by `order`, and return it as a signed int16_t.
 * Returns 0 when buf is NULL or when `order` matches neither case label.
 */
int16_t
exif_get_sshort (const unsigned char *buf, ExifByteOrder order)
{
/* NULL buffer: nothing to read. */
if (!buf) return 0;
switch (order) {
case EXIF_BYTE_ORDER_MOTOROLA:
/* buf[0] is the high byte, buf[1] the low byte. */
return ((buf[0] << 8) | buf[1]);
case EXIF_BYTE_ORDER_INTEL:
/* buf[1] is the high byte, buf[0] the low byte. */
return ((buf[1] << 8) | buf[0]);
}
/* Won't be reached */
return (0);
}
/*
 * Unsigned counterpart of exif_get_sshort(): fetches the same two bytes via
 * exif_get_sshort() and masks the result to its low 16 bits before returning
 * it as a uint16_t.
 */
uint16_t
exif_get_short (const unsigned char *buf, ExifByteOrder order)
{
return (exif_get_sshort (buf, order) & 0xffff);
}
When the library tries to investigate the presence of tags in raw data, it calls exif_get_short() and assigns the value returned to a variable which is of type enum (int).
In the error case, exif_get_short(), which is supposed to return an unsigned value (34665), returns a negative number (-30871), which messes up the whole tag extraction from the image data.
34665 is outside the range representable by int16_t (the maximum is 32767), and therefore leads to an overflow. When I make this slight modification in the code, everything seems to work fine:
uint16_t
exif_get_short (const unsigned char *buf, ExifByteOrder order)
{
int temp = (exif_get_sshort (buf, order) & 0xffff);
return temp;
}
But since this is a pretty stable library and in use for quite some time, it led me to believe that I may be missing something here. Moreover this is the general way the code is structured for other utility functions as well. Ex: exif_get_long() calls exif_get_slong(). I would then have to change all utility functions.
What is confusing me is that when I run this piece of code on my Linux desktop for the error file, I see no problems and things work fine with the original library code. This led me to believe that perhaps the UINT16_MAX and INT16_MAX macros have different values on my desktop and MIPS board. But unfortunately, that's not the case. Both print identical values on the board and desktop. If this piece of code fails, it should fail also on my desktop.
What am I missing here? Any hints would be much appreciated.
EDIT:
The code which calls exif_get_short() goes something like this:
ExifTag tag;
...
tag = exif_get_short (d + offset + 12 * i, data->priv->order);
switch (tag) {
...
...
The type ExifTag is as follows:
typedef enum {
EXIF_TAG_GPS_VERSION_ID = 0x0000,
EXIF_TAG_INTEROPERABILITY_INDEX = 0x0001,
...
...
}ExifTag ;
The cross compiler being used is mipsisa32r2el-timesys-linux-gnu-gcc
CFLAGS = -pipe -mips32r2 -mtune=74kc -mdspr2 -Werror -O3 -Wall -W -D_REENTRANT -fPIC $(DEFINES)
I'm using libexif within Qt - Qt Media hub (actually libexif comes along with Qt Media hub)
EDIT2: Some additional observations:
I'm observing something bizarre. I have put print statements in exif_get_short(). Just before return
printf("return_value %d\n %u\n",exif_get_sshort (buf, order) & 0xffff, exif_get_sshort (buf, order) & 0xffff);
return (exif_get_sshort (buf, order) & 0xffff);
I see the following o/p:
return_value 34665 34665
I then also inserted print statements in the code which calls exif_get_short()
....
tag = exif_get_short (d + offset + 12 * i, data->priv->order);
printf("TAG %d %u\n",tag,tag);
I see the following o/p:
TAG -30871 4294936425
EDIT3 : Posting assembly code for exif_get_short() and exif_get_sshort() taken on MIPS board
.file 1 "exif-utils.c"
.section .mdebug.abi32
.previous
.gnu_attribute 4, 1
.abicalls
.text
# int16_t exif_get_sshort(const unsigned char *buf, ExifByteOrder order)
# GCC output for MIPS32r2, o32 ABI.
# In:  $4 = buf, $5 = order
# Out: $2 = result, sign-extended from 16 to 32 bits by `seh`
# The body is emitted under `.set noreorder`, so the instruction that follows
# each branch/jump is its delay slot and executes before control transfers.
	.align	2
	.globl	exif_get_sshort
	.ent	exif_get_sshort
	.type	exif_get_sshort, @function
exif_get_sshort:
	.set	nomips16
	.frame	$sp,0,$31		# vars= 0, regs= 0/0, args= 0, gp= 0
	.mask	0x00000000,0
	.fmask	0x00000000,0
	.set	noreorder
	.set	nomacro
	beq	$4,$0,$L2		# buf == NULL -> return 0
	nop
	beq	$5,$0,$L3		# order == 0 -> bytes are [high, low]
	nop
	li	$2,1			# 0x1
	beq	$5,$2,$L8		# order == 1 -> bytes are [low, high]
	nop
$L2:					# NULL buffer, or order matched neither case
	j	$31
	move	$2,$0			# delay slot: return value = 0
$L3:
	lbu	$2,0($4)		# $2 = buf[0]
	lbu	$3,1($4)		# $3 = buf[1]
	sll	$2,$2,8
	or	$2,$2,$3		# $2 = (buf[0] << 8) | buf[1]
	j	$31
	seh	$2,$2			# delay slot: sign-extend the halfword into all of $2
$L8:
	lbu	$2,1($4)		# $2 = buf[1]
	lbu	$3,0($4)		# $3 = buf[0]
	sll	$2,$2,8
	or	$2,$2,$3		# $2 = (buf[1] << 8) | buf[0]
	j	$31
	seh	$2,$2			# delay slot: sign-extend the halfword into all of $2
	.set	macro
	.set	reorder
	.end	exif_get_sshort
# uint16_t exif_get_short(const unsigned char *buf, ExifByteOrder order)
# GCC -O3 output for MIPS32r2, o32 ABI, position-independent (.abicalls).
# In:  $4 = buf, $5 = order (passed through untouched)
# Out: $2 = whatever exif_get_sshort leaves there
# The whole function is a tail call: it loads the address of exif_get_sshort
# and jumps to it. No `andi $2,$2,0xffff` is executed afterwards, so the
# upper 16 bits of $2 are returned exactly as exif_get_sshort set them.
	.align	2
	.globl	exif_get_short
	.ent	exif_get_short
	.type	exif_get_short, @function
exif_get_short:
	.set	nomips16
	.frame	$sp,0,$31		# vars= 0, regs= 0/0, args= 0, gp= 0
	.mask	0x00000000,0
	.fmask	0x00000000,0
	.set	noreorder
	.cpload	$25
	.set	nomacro
	lw	$25,%call16(exif_get_sshort)($28)	# $25 = address of exif_get_sshort
	jr	$25			# tail call; $31 still holds our caller's return address
	nop				# delay slot
	.set	macro
	.set	reorder
	.end	exif_get_short
Just for completeness, the ASM code taken from my linux machine
.file "exif-utils.c"
.text
# int16_t exif_get_sshort(const unsigned char *buf, ExifByteOrder order)
# GCC output for x86-64 Linux (System V ABI), AT&T syntax.
# In:  %rdi = buf, %esi = order
# Out: %eax = the two bytes combined; no sign-extension instruction is
#      emitted, so the value is left zero-extended in %eax
	.p2align 4,,15
	.globl	exif_get_sshort
	.type	exif_get_sshort, @function
exif_get_sshort:
.LFB1:
	.cfi_startproc
	xorl	%eax, %eax		# default return value 0
	testq	%rdi, %rdi
	je	.L2			# buf == NULL -> return 0
	testl	%esi, %esi
	jne	.L8			# order != 0 -> check the other case
	movzbl	(%rdi), %edx		# edx = buf[0]
	movzbl	1(%rdi), %eax		# eax = buf[1]
	sall	$8, %edx
	orl	%edx, %eax		# eax = (buf[0] << 8) | buf[1]
	ret
	.p2align 4,,10
	.p2align 3
.L8:
	cmpl	$1, %esi
	jne	.L2			# order matched neither case -> return 0 (eax still 0)
	movzbl	1(%rdi), %edx		# edx = buf[1]
	movzbl	(%rdi), %eax		# eax = buf[0]
	sall	$8, %edx
	orl	%edx, %eax		# eax = (buf[1] << 8) | buf[0]
.L2:
	rep
	ret
	.cfi_endproc
.LFE1:
	.size	exif_get_sshort, .-exif_get_sshort
# uint16_t exif_get_short(const unsigned char *buf, ExifByteOrder order)
# GCC output for x86-64 Linux (System V ABI), AT&T syntax.
# The function is a bare tail call through the PLT: arguments stay in
# %rdi/%esi and the result is returned in %eax exactly as exif_get_sshort
# left it (no masking instruction is emitted here).
	.p2align 4,,15
	.globl	exif_get_short
	.type	exif_get_short, @function
exif_get_short:
.LFB2:
	.cfi_startproc
	jmp	exif_get_sshort@PLT	# tail call instead of call/ret
	.cfi_endproc
.LFE2:
	.size	exif_get_short, .-exif_get_short
EDIT4: Hopefully my last update :-)
ASM code with compiler option set to -O1
# uint16_t exif_get_short(buf, order) as compiled at -O1 for MIPS o32:
# a real call to exif_get_sshort followed by an explicit mask of the result.
# Emitted under `.set noreorder`, so the instruction after each jump is its
# delay slot.
exif_get_short:
.set nomips16
.frame $sp,32,$31 # vars= 0, regs= 1/0, args= 16, gp= 8
.mask 0x80000000,-4
.fmask 0x00000000,0
.set noreorder
.cpload $25
.set nomacro
addiu $sp,$sp,-32 # allocate a 32-byte stack frame
sw $31,28($sp) # save the return address across the call
.cprestore 16
lw $25,%call16(exif_get_sshort)($28) # $25 = address of exif_get_sshort
jalr $25 # call it; the result comes back in $2
nop # delay slot
lw $28,16($sp) # reload $28 from the slot named by .cprestore
andi $2,$2,0xffff # the C-level "& 0xffff": clears bits 16..31 of the result
lw $31,28($sp) # restore the return address
j $31 # return to the caller
addiu $sp,$sp,32 # delay slot: release the stack frame
.set macro
.set reorder
.end exif_get_short

One thing the MIPS assembly shows (though I'm not an expert in MIPS assembly, so there's a decent chance I'm missing something or otherwise wrong) is that the exif_get_short() function is just an alias for the exif_get_sshort() function. All that exif_get_short() does is jump to the address of the exif_get_sshort() function.
The exif_get_sshort() function sign extends the 16 bit value it's returning to the full 32-bit register used for the return. There's nothing wrong with that - it's actually probably what the MIPS ABI specifies (I'm not sure).
However, since the exif_get_short() function just jumps to the exif_get_sshort() function, it has no opportunity to clear the upper 16 bits of the register.
So when the 16 bit value 0x8769 is being returned from the buffer (whether from exif_get_sshort() or exif_get_short()), the $2 register used to return the function result contains 0xffff8769, which can have the following interpretations:
as a 32-bit signed int: -30871
as a 32-bit unsigned int: 4294936425
as a 16-bit signed int16_t: -30871
as a 16-bit unsigned uint16_t: 34665
If the compiler is supposed to ensure that the $2 return register has a top 16-bit set to zero for a uint16_t return type, then it has a bug in the code it's emitting for exif_get_short() - instead of jumping to exif_get_sshort(), it should call exif_get_sshort() and clear the upper half of $2 before returning.
From the description of the behavior you're seeing, it looks like the code calling exif_get_short() expects that the $2 register used for the return value will have the upper 16 bits cleared so that the entire 32-bit register can be used as-is for the 16-bit uint16_t value.
I'm not sure what the MIPS ABI specifies (but I'd guess that it specifies that the upper 16 bits of the $2 register should be cleared by exif_get_short()), but there seems to be either a code generation bug where exif_get_short() doesn't ensure $2 is entirely correct before it returns, or a bug where the caller of exif_get_short() assumes that the full 32 bits of $2 are valid when only 16 bits are.

This is broken on so many levels, I don't know where to begin. Just look at what is done here:
The unsigned chars are read from the buffer.
They are assigned to a signed int16_t in exif_get_sshort.
This is assigned to an unsigned uint16_t in exif_get_short.
This is finally assigned to an enum which is of type signed int.
I'd say it's a miracle it works at all.
First, the assignment from the chars to int16_t is done with the values, not with the representation:
return ((buf[0] << 8) | buf[1]);
Which already throws you into the pit of undefined behaviour when the result actually is negative. Plus, it only works when the signed int representation of the implementation is the same as the one used in the file format (two's complement, I guess). It will fail for one's complement and sign-magnitude. So check what's the case for the MIPS implementation.
The clean way would be the other way around: Assign two chars from the buffer to an uint16_t, which will be well defined operations, and use this to return an int16_t. You could then care for further corrections of the value for different representations, if necessary.
Additionally, here:
if (!buf) return 0;
0 is a very bad choice of a return value, because it is a valid enum constant:
EXIF_TAG_GPS_VERSION_ID = 0x0000,
If this is expected to be the default for invalidity, then the constant should be returned, not the magic number. Though this seems to be a generic function to return an int16_t, thus some other error mechanism should be used here.
For your specific question, well, follow the flow of conversions between signed and unsigned on your MIPS implementation, including the default promotions done, and examine all the intermediate values to find the point where it breaks. Your MIPS uses 32 bit ints, not 16 bit, right? Check INT_MAX and UINT_MAX.

Related

For GNU Assembly x64 AT&T syntax: How to add 2 quad numbers? [duplicate]

I have written a Assembly program to display the factorial of a number following AT&T syntax. But it's not working. Here is my code
.text
.globl _start
_start:
movq $5,%rcx
movq $5,%rax
Repeat: #function to calculate factorial
decq %rcx
cmp $0,%rcx
je print
imul %rcx,%rax
cmp $1,%rcx
jne Repeat
# Now result of factorial stored in rax
print:
xorq %rsi, %rsi
# function to print integer result digit by digit by pushing in
#stack
loop:
movq $0, %rdx
movq $10, %rbx
divq %rbx
addq $48, %rdx
pushq %rdx
incq %rsi
cmpq $0, %rax
jz next
jmp loop
next:
cmpq $0, %rsi
jz bye
popq %rcx
decq %rsi
movq $4, %rax
movq $1, %rbx
movq $1, %rdx
int $0x80
addq $4, %rsp
jmp next
bye:
movq $1,%rax
movq $0, %rbx
int $0x80
.data
num : .byte 5
This program is printing nothing, I also used gdb to visualize it work fine until loop function but when it comes in next some random value start entering in various register. Help me to debug so that it could print factorial.
As @ped7g points out, you're doing several things wrong: using the int 0x80 32-bit ABI in 64-bit code, and passing character values instead of pointers to the write() system call.
Here's how to print an integer in x86-64 Linux, the simple and somewhat-efficient (see footnote 1) way, using the same repeated division / modulo by 10.
System calls are expensive (probably thousands of cycles for write(1, buf, 1)), and doing a syscall inside the loop steps on registers so it's inconvenient and clunky as well as inefficient. We should write the characters into a small buffer, in printing order (most-significant digit at the lowest address), and make a single write() system call on that.
But then we need a buffer. The maximum length of a 64-bit integer is only 20 decimal digits, so we can just use some stack space. In x86-64 Linux, we can use stack space below RSP (up to 128B) without "reserving" it by modifying RSP. This is called the red-zone. If you wanted to pass the buffer to another function instead of a syscall, you would have to reserve space with sub $24, %rsp or something.
Instead of hard-coding system-call numbers, using GAS makes it easy to use the constants defined in .h files. Note the mov $__NR_write, %eax near the end of the function. The x86-64 SystemV ABI passes system-call arguments in similar registers to the function-calling convention. (So it's totally different from the 32-bit int 0x80 ABI, which you shouldn't use in 64-bit code.)
// building with gcc foo.S will use CPP before GAS so we can use headers
#include <asm/unistd.h> // This is a standard Linux / glibc header file
// includes unistd_64.h or unistd_32.h depending on current mode
// Contains only #define constants (no C prototypes) so we can include it from asm without syntax errors.
// -----------------------------------------------------------------------
// void print_uint64(uint64_t value)
// Prints `value` in decimal followed by a newline with a single write(2).
// ABI:   x86-64 System V (Linux); leaf function, red zone used as buffer
// In:    rdi = value
// Out:   rax = return value of write() (or -errno)
// Clobb: rcx, rdx, rsi, rdi, r11, flags
// -----------------------------------------------------------------------
        .p2align 4
        .globl  print_uint64            # exported name must match the label below
print_uint64:
        lea     -1(%rsp), %rsi          # We use the 128B red-zone as a buffer to hold the string
                                        # a 64-bit integer is at most 20 digits long in base 10, so it fits.
        movb    $'\n', (%rsi)           # store the trailing newline byte. (Right below the return address).
        # If you need a null-terminated string, leave an extra byte of room and store '\n\0'. Or push $'\n'
        mov     $10, %ecx               # same as mov $10, %rcx but 2 bytes shorter
        # note that newline (\n) has ASCII code 10, so we could actually have stored the newline with movb %cl, (%rsi) to save code size.
        mov     %rdi, %rax              # function arg arrives in RDI; we need it in RAX for div
.Ltoascii_digit:                        # do{
        xor     %edx, %edx
        div     %rcx                    # rax = rdx:rax / 10. rdx = remainder
        # store digits in MSD-first printing order, working backwards from the end of the string
        add     $'0', %edx              # integer to ASCII. %dl would work, too, since we know this is 0-9
        dec     %rsi
        mov     %dl, (%rsi)             # *--p = (value%10) + '0';
        test    %rax, %rax
        jnz     .Ltoascii_digit         # } while(value != 0)
        # If we used a loop-counter to print a fixed number of digits, we would get leading zeros
        # The do{}while() loop structure means the loop runs at least once, so we get "0\n" for input=0
        # Then print the whole string with one system call
        mov     $__NR_write, %eax       # call number from asm/unistd_64.h
        mov     $1, %edi                # fd=1
        # %rsi = start of the buffer
        mov     %rsp, %rdx
        sub     %rsi, %rdx              # length = one_past_end - start
        syscall                         # write(fd=1 /*rdi*/, buf /*rsi*/, length /*rdx*/); 64-bit ABI
        # rax = return value (or -errno)
        # rcx and r11 = garbage (destroyed by syscall/sysret)
        # all other registers = unmodified (saved/restored by the kernel)
        # we don't need to restore any registers, and we didn't modify RSP.
        ret
To test this function, I put this in the same file to call it and exit:
.p2align 4
.globl _start
_start:
mov $10120123425329922, %rdi
# mov $0, %edi # Yes, it does work with input = 0
call print_uint64
xor %edi, %edi
mov $__NR_exit, %eax
syscall # sys_exit(0)
I built this into a static binary (with no libc):
$ gcc -Wall -static -nostdlib print-integer.S && ./a.out
10120123425329922
$ strace ./a.out > /dev/null
execve("./a.out", ["./a.out"], 0x7fffcb097340 /* 51 vars */) = 0
write(1, "10120123425329922\n", 18) = 18
exit(0) = ?
+++ exited with 0 +++
$ file ./a.out
./a.out: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), statically linked, BuildID[sha1]=69b865d1e535d5b174004ce08736e78fade37d84, not stripped
Footnote 1: See Why does GCC use multiplication by a strange number in implementing integer division? for avoiding div r64 for division by 10, because that's very slow (21 to 83 cycles on Intel Skylake). A multiplicative inverse would make this function actually efficient, not just "somewhat". (But of course there'd still be room for optimizations...)
Related: Linux x86-32 extended-precision loop that prints 9 decimal digits from each 32-bit "limb": see .toascii_digit: in my Extreme Fibonacci code-golf answer. It's optimized for code-size (even at the expense of speed), but well-commented.
It uses div like you do (because that's smaller than using a fast multiplicative inverse). It uses the loop instruction for the outer loop (over multiple integers for extended precision), again for code-size at the cost of speed.
It uses the 32-bit int 0x80 ABI, and prints into a buffer that was holding the "old" Fibonacci value, not the current.
Another way to get efficient asm is from a C compiler. For just the loop over digits, look at what gcc or clang produce for this C source (which is basically what the asm is doing). The Godbolt Compiler explorer makes it easy to try with different options and different compiler versions.
See gcc7.2 -O3 asm output which is nearly a drop-in replacement for the loop in print_uint64 (because I chose the args to go in the same registers):
/*
 * Write the decimal digits of val as ASCII characters, ending just before
 * p_end and growing towards lower addresses (most-significant digit ends up
 * at the lowest address). At least one digit is always written, so val == 0
 * produces "0". No terminator is stored.
 */
void itoa_end(unsigned long val, char *p_end) {
    enum { RADIX = 10 };
    for (;;) {
        unsigned long digit = val % RADIX;  /* least-significant digit first */
        val /= RADIX;
        p_end -= 1;
        *p_end = (char)('0' + digit);
        if (val == 0)
            break;
    }
    /* A caller could now do: write(1, p_end, orig-current); */
}
I tested performance on a Skylake i7-6700k by commenting out the syscall instruction and putting a repeat loop around the function call. The version with mul %rcx / shr $3, %rdx is about 5 times faster than the version with div %rcx for storing a long number-string (10120123425329922) into a buffer. The div version ran at 0.25 instructions per clock, while the mul version ran at 2.65 instructions per clock (although requiring many more instructions).
It might be worth unrolling by 2, and doing a divide by 100 and splitting up the remainder of that into 2 digits. That would give a lot better instruction-level parallelism, in case the simpler version bottlenecks on mul + shr latency. The chain of multiply/shift operations that brings val to zero would be half as long, with more work in each short independent dependency chain to handle a 0-99 remainder.
Related:
NASM version of this answer, for x86-64 or i386 Linux How do I print an integer in Assembly Level Programming without printf from the c library?
How to convert a binary integer number to a hex string? - Base 16 is a power of 2, conversion is much simpler and doesn't require div.
Several things:
0) I guess this is 64b linux environment, but you should have stated so (if it is not, some of my points will be invalid)
1) int 0x80 is 32b call, but you are using 64b registers, so you should use syscall (and different arguments)
2) int 0x80, eax=4 requires the ecx to contain address of memory, where the content is stored, while you give it the ASCII character in ecx = illegal memory access (the first call should return error, i.e. eax is negative value). Or using strace <your binary> should reveal the wrong arguments + error returned.
3) why addq $4, %rsp? Makes no sense to me, you are damaging rsp, so the next pop rcx will pop wrong value, and in the end you will run way "up" into the stack.
... maybe some more, I didn't debug it, this list is just by reading the source (so I may be even wrong about something, although that would be rare).
BTW your code is working. It just doesn't do what you expected. But work fine, precisely as the CPU is designed and precisely what you wrote in the code. Whether that does achieve what you wanted, or makes sense, that's different topic, but don't blame the HW or assembler.
... I can do a quick guess how the routine may be fixed (just partial hack-fix, still needs rewrite for syscall under 64b linux):
next:
        # Print the digits that were pushed on the stack, one per iteration.
        # %rsi = number of digits still to print; each digit occupies one
        # 8-byte stack slot with the most recently pushed one at (%rsp).
        cmpq    $0, %rsi
        jz      bye
        movq    %rsp,%rcx       # make ecx point to stack memory (with stored char)
                                # this will work if you are lucky enough that rsp fits into 32b
                                # if it is beyond 4GiB logical address, then you have bad luck (syscall needed)
        decq    %rsi
        movq    $4, %rax        # 32-bit ABI: eax=4 is write
        movq    $1, %rbx        # fd = 1
        movq    $1, %rdx        # length = 1 byte
        int     $0x80
        addq    $8, %rsp        # now rsp += 8; is needed, because there's no POP
        jmp     next
Again didn't try myself, just writing it from head, so let me know how it changed situation.

How are char arrays / strings stored in binary files?

When I compile this code using different compilers and inspect the output in a hex editor I am expecting to find the string "Nancy" somewhere.
#include <stdio.h>
int main()
{
char temp[6] = "Nancy";
printf("%s", temp);
return 0;
}
The output file for gcc -o main main.c looks like this:
The output for g++ -o main main.c, I can't see to find "Nancy" anywhere.
Compiling the same code in visual studio (MSVC 1929) I see the full string in a hex editor:
Why do I get some random bytes in the middle of the string in (1)?
There is no single rule about how a compiler stores data in the output files it produces.
Data can be stored in a “constant” section.
Data can be built into the “immediate” operands of instructions, in which data is encoded in various fields of the bits that encode an instruction.
Data can be computed from other data by instructions generated by the compiler.
I suspect the case where you see “Nanc” in one place and “y” in another is the compiler using a load instruction (may be written with “mov”) that loads the bytes forming “Nanc” as an immediate operand and another load instruction that loads the bytes forming “y” with a trailing null character, along with other instructions to store the loaded data on the stack and pass its address to printf.
You have not provided enough information to diagnose the g++ case: You did not name the compiler or its version number or provide any part of the generated output.
I reproduced it using gcc 9.3.0 (Linux Mint 20.2) on an x86-64 (Intel) system.
Result of hexdump -C:
Note the byte sequence is the same.
So I use gcc -S -c:
# gcc -S output for teststr.c (x86-64 Linux, AT&T syntax).
# main builds the local array temp[6] at -14(%rbp) with two immediate stores
# and passes its address to printf; the only string placed in .rodata is the
# "%s" format.
	.file	"teststr.c"
	.text
	.section	.rodata
.LC0:
	.string	"%s"
	.text
	.globl	main
	.type	main, @function
main:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$16, %rsp
	movq	%fs:40, %rax			# load the stack-protector guard value
	movq	%rax, -8(%rbp)			# and keep a copy in the frame
	xorl	%eax, %eax
	movl	$1668178254, -14(%rbp)		# NOTE THIS PART HERE: 0x636E614E = bytes 'N','a','n','c'
	movw	$121, -10(%rbp)			# AND HERE: 0x0079 = 'y' followed by '\0'
	leaq	-14(%rbp), %rax			# rax = &temp[0]
	movq	%rax, %rsi			# 2nd printf argument: temp
	leaq	.LC0(%rip), %rdi		# 1st printf argument: "%s"
	movl	$0, %eax			# variadic call: no vector-register arguments
	call	printf@PLT
	movl	$0, %eax			# return value 0
	movq	-8(%rbp), %rdx
	xorq	%fs:40, %rdx			# guard copy unchanged?
	je	.L3
	call	__stack_chk_fail@PLT
.L3:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.ident	"GCC: (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:
The highlighted value 1668178254 is hex 636E614E or "cnaN" (which, due to the endian reversal as x86 is a little-endian system, becomes "Nanc") in ASCII encoding, and 121 is hex 79, or "y".
So it uses two move instructions instead of a loop copy from a byte string section of the file given it's a short string, and the intervening "garbage" is (I believe) the following movw instruction. Likely a way to optimize the initialization, versus looping byte-by-byte through memory, even though no optimization flag was "officially" given to the compiler - that's the thing, the compiler can do what it wants to do in this regard. Microsoft's compiler, then, seems to be more "pedantic" in how it compiles because it does, in fact, apparently forgo that optimization in favor of putting the string together contiguously.
Generally a compiled program is split into different types of "section". The assembler file will use directives to switch between them.
Code (".text")
Static read-only data (".section .rodata")
Initialised global or static variables (".data")
Uninitialised (or zero-initialized) global or static variables (".bss")
String literals in C can be used in two different ways.
As a pointer to constant data.
As an initialiser for an array.
If a string literal is used as a pointer then it is likely the compiler will place the string data in the read-only data section.
If a string literal is used to initialise a global/static array then it is likely the compiler will place the array in the initialised data section (or the read-only data section if the array is declared as const).
However in your case the array you are initialising is an automatic local variable. So it can't be pre-initialised before program start. The compiler must include code to initialise it each time your function runs.
The compiler might choose to do that by storing the string in a read-only data location and then using a copy routine (either inlined or a call) to copy it to the local array. (In that case there will be a contiguous copy of the whole thing, otherwise there won't be.) It may choose to simply generate instructions to set the elements of the array one by one. It may choose to generate instructions that set several array elements at the same time (e.g. 4 bytes and then 2 bytes, including the terminating '\0').
P.S. I've noticed some people posting https://godbolt.org/ links on other answers to this question. The Compiler Explorer is a useful tool, but be aware that it hides the section-switching directives from the assembler output by default.

Issue with unsigned long long int with sparc assembly code on sparc64

I have an issue with the C code below into which I have included Sparc Assembly. The code is compiled and running on Debian 9.0 Sparc64. It does a simple summation and print the result of this sum which equals to nLoop.
The problem is that for an initial number of iterations greater than 1e+9, the final sum at the end is systematically equal to 1410065408 : I don't understand why since I put explicitly unsigned long long int type for sum variable and so sum can be in [0, +18,446,744,073,709,551,615] range.
For example, for nLoop = 1e+9, I expect sum to be equal to 1e+9.
Does issue come rather from included Assembly Sparc code which could not handle 64 bits variables (in input or output) ?
#include <stdio.h>
#include <stdlib.h>
/*
 * Runs a countdown loop written in inline SPARC assembly: %g1 starts at
 * nLoop and is decremented each iteration while %g2 counts the iterations;
 * the final %g2 is stored into sum and printed.
 */
int main (int argc, char *argv[])
{
/* i is declared but not referenced anywhere in this function. */
int i;
// Init sum
unsigned long long int sum = 0ULL;
// Number of iterations
unsigned long long int nLoop = 10000000000ULL;
// Loop with Sparc assembly into C source
// %g1 = iterations remaining, %g2 = iterations done so far
asm volatile ("clr %%g1\n\t"
"clr %%g2\n\t"
"mov %1, %%g1\n" // %1 = input parameter
"loop:\n\t"
"add %%g2, 1, %%g2\n\t"
"subcc %%g1, 1, %%g1\n\t"
"bne loop\n\t" // NOTE(review): plain bne tests the 32-bit condition codes (%icc), so the loop ends as soon as the low 32 bits of %g1 are zero; a 64-bit counter needs "bne %xcc, loop"
"nop\n\t" // branch delay slot
"mov %%g2, %0\n" // %0 = output parameter
: "=r" (sum) // output
: "r" (nLoop) // input
: "g1", "g2"); // clobbers
// Print results
printf("Sum = %llu\n", sum);
return 0;
}
How to fix this problem of range and allow to use 64 bits variables into Sparc Assembly code ?
PS: I tried to compile with gcc -m64, issue remains.
Update 1
As requested by @zwol, below is the SPARC assembly output generated with: gcc -O2 -m64 -S loop.c -o loop.s
.file "loop.c"
.section ".text"
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC0:
.asciz "Sum = %llu\n"
.section .text.startup,"ax",#progbits
.align 4
.global main
.type main, #function
.proc 04
main:
.register %g2, #scratch
save %sp, -176, %sp
sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %l7
call __sparc_get_pc_thunk.l7
add %l7, %lo(_GLOBAL_OFFSET_TABLE_+4), %l7
sethi %hi(9764864), %o1
or %o1, 761, %o1
sllx %o1, 10, %o1
#APP
! 13 "loop.c" 1
clr %g1
clr %g2
mov %o1, %g1
loop:
add %g2, 1, %g2
subcc %g1, 1, %g1
bne loop
nop
mov %g2, %o1
! 0 "" 2
#NO_APP
mov 0, %i0
sethi %gdop_hix22(.LC0), %o0
xor %o0, %gdop_lox10(.LC0), %o0
call printf, 0
ldx [%l7 + %o0], %o0, %gdop(.LC0)
return %i7+8
nop
.size main, .-main
.ident "GCC: (Debian 7.3.0-15) 7.3.0"
.section .text.__sparc_get_pc_thunk.l7,"axG",#progbits,__sparc_get_pc_thunk.l7,comdat
.align 4
.weak __sparc_get_pc_thunk.l7
.hidden __sparc_get_pc_thunk.l7
.type __sparc_get_pc_thunk.l7, #function
.proc 020
__sparc_get_pc_thunk.l7:
jmp %o7+8
add %o7, %l7, %l7
.section .note.GNU-stack,"",#progbits
UPDATE 2:
As suggested by @Martin Rosenau, I made the following modifications:
loop:
add %g2, 1, %g2
subcc %g1, 1, %g1
bpne %icc, loop
bpne %xcc, loop
nop
mov %g2, %o1
But at the compilation, I get :
Error: Unknown opcode: `bpne'
What could be the reason for this compilation error ?
subcc %%g1, 1, %%g1
bne loop
Your problem is the bne instruction:
Unlike the x86-64 CPU Sparc64 CPUs don't have different instructions for 32- and 64-bit subtraction:
If you want subtract 1 from 0x12345678 the result is 0x12345677. If you subtract 1 from 0xF00D12345678 the result is 0xF00D12345677 so if you only use the lower 32 bits of a register a 64-bit subtraction has the same effect as the 32-bit subtraction.
Therefore the Sparc64 CPUs do not have different instructions for 64-bit and 32-bit addition, subtraction, multiplication, left shift etc.
These CPUs have different instructions for 32-bit and 64-bit operations when the upper 32 bits influence the lower 32 bits (e.g. right shift).
However the zero flag depends on the result of the subcc operation.
To solve this problem the Sparc64 CPUs have each of the integer flags (zero, overflow, carry, sign) twice:
The 32-bit zero flag will be set if the lower 32 bits of a register are zero; the 64-bit zero flag will be set if all 64 bits of a register are zero.
To be compatible with existing 32-bit programs the bne instruction will check the 32-bit zero flag, not the 64-bit zero flag.
is systematically equal to 1410065408
1e10 = 0x200000000 + 1410065408 so after 1410065408 steps the value 0x200000000 is reached which has the lower 32 bits set to 0 and bne will not jump any more.
However for 1e11 you should not get 1410065408 but 1215752192 as a result because 1e11 = 0x1700000000 + 1215752192.
bne
There is a new instruction named bpne which has up to 4 arguments!
In the simplest variant (with only two arguments) the instruction should (I have not used Sparc for 5 years now, so I'm not sure) work like this:
bpne %icc, loop # Like bne (based on the 32-bit result)
bpne %xcc, loop # Like bne, but based on the 64-bit result
EDIT
Error: Unknown opcode: 'bpne'
I just tried using GNU assembler:
GNU assembler names the new instruction bne - just like the old one:
bne loop # Old variant
bne %icc, loop # New variant based on the 32-bit result
bne %xcc, loop # (New variant) Based on the 64-bit result
subcc %g1, 1, %g1
bpne %icc, loop
bpne %xcc, loop
nop
The first bpne (or bne) makes no sense: Whenever the first line would do the jump the second line would also jump. And if you don't use .reorder (however this is the default) you would also need to add a nop between the two branch instructions...
The code should look like this (assuming your assembler also names bpne bne):
subcc %g1, 1, %g1
bne %xcc, loop
nop
Try "bne %xcc, loop" which should branch based on the 64 bit result.

gcc mips signed arithmetic on bytes

I have fed this simple code to gcc
volatile signed char x, y, z;
void test()
{
x = 0x31;
y = x + 3;
}
Volatile has been added just to avoid gcc optimization (set to -O0 anyway).
Resulting mips code was:
x:
y:
z:
test():
addiu $sp,$sp,-8
sw $fp,4($sp)
move $fp,$sp
lui $2,%hi(x)
li $3,49 # 0x31
sb $3,%lo(x)($2)
lui $2,%hi(x)
lbu $2,%lo(x)($2)
seb $2,$2
andi $2,$2,0x00ff
addiu $2,$2,3
andi $2,$2,0x00ff
seb $3,$2
lui $2,%hi(y)
sb $3,%lo(y)($2)
nop
move $sp,$fp
lw $fp,4($sp)
addiu $sp,$sp,8
j $31
nop
For (y = x + 3), gcc loads the byte as unsigned, then sign-extends it, and then ANDs it with 0xff?
why not simply load it using lb (which is supposed to sign extend it)?
GCC does the same for signed half words (using 0xffff of course).
I'm not particularly adept at reading MIPS assembly, but do note that you have compiled with -O0, which is supposed to generate unoptimized code. That more or less means code that implements the exact semantics of the C abstract machine. In particular,
0x31 is a constant of type int
the assignment x = 0x31 includes an implicit conversion (in this case) of the right-hand int operand to the type of the left-hand operand (signed char)
3 is also a constant of type int
evaluating x + 3 involves performing the integer promotions on the arguments, and in particular, converting the signed char value of x to (signed) int, then performing the addition
assigning the result to y involves another implicit conversion from int to signed char
In principle, all of those conversions and promotions that are implicit in the C code need to be performed explicitly by the unoptimized assembly program, and that appears roughly to be what you are seeing.
Overall, it's not very useful to ask why the assembly output of a non-optimizing compilation is not as efficient as you think it could be. If you want more efficient code, enable optimization.
gcc -march=native -O3 gives the following code on an Intel machine:
test:
.seh_endprologue
movb $49, x(%rip)
movzbl x(%rip), %eax
addl $3, %eax
movb %al, y(%rip)
ret
.seh_endproc
.comm z, 1, 0
.comm y, 1, 0
.comm x, 1, 0
.ident "GCC: (GNU) 6.4.0"
There's nothing wrong with your code. gcc/MIPS just isn't very good at code generation, which is not too surprising since it receives a lot less care and attention than gcc/Intel. In my experience clang usually generates better MIPS code than gcc does.

What's going on in Apple LLVM-gcc x86 assembly?

I'm interested in learning more x86/x86_64 assembly. Alas, I am on a Mac. No problem, right?
$ gcc --version
i686-apple-darwin11-llvm-gcc-4.2 (GCC) 4.2.1 (Based on Apple Inc. build
5658) (LLVM build 2336.11.00)
Copyright (C) 2007 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
I wrote a simple "Hello World" in C to get a base-line on what sort of code I'll have to write. I did a little x86 back in college, and have looked up numerous tutorials, but none of them look like the freakish output I'm seeing here:
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main:
Leh_func_begin1:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
subq $32, %rsp
Ltmp2:
movl %edi, %eax
movl %eax, -4(%rbp)
movq %rsi, -16(%rbp)
leaq L_.str(%rip), %rax
movq %rax, %rdi
callq _puts
movl $0, -24(%rbp)
movl -24(%rbp), %eax
movl %eax, -20(%rbp)
movl -20(%rbp), %eax
addq $32, %rsp
popq %rbp
ret
Leh_func_end1:
.section __TEXT,__cstring,cstring_literals
L_.str:
.asciz "Hello, World!"
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame0:
Lsection_eh_frame:
Leh_frame_common:
Lset0 = Leh_frame_common_end-Leh_frame_common_begin
.long Lset0
Leh_frame_common_begin:
.long 0
.byte 1
.asciz "zR"
.byte 1
.byte 120
.byte 16
.byte 1
.byte 16
.byte 12
.byte 7
.byte 8
.byte 144
.byte 1
.align 3
Leh_frame_common_end:
.globl _main.eh
_main.eh:
Lset1 = Leh_frame_end1-Leh_frame_begin1
.long Lset1
Leh_frame_begin1:
Lset2 = Leh_frame_begin1-Leh_frame_common
.long Lset2
Ltmp3:
.quad Leh_func_begin1-Ltmp3
Lset3 = Leh_func_end1-Leh_func_begin1
.quad Lset3
.byte 0
.byte 4
Lset4 = Ltmp0-Leh_func_begin1
.long Lset4
.byte 14
.byte 16
.byte 134
.byte 2
.byte 4
Lset5 = Ltmp1-Ltmp0
.long Lset5
.byte 13
.byte 6
.align 3
Leh_frame_end1:
.subsections_via_symbols
Now...maybe things have changed a bit, but this isn't exactly friendly, even for assembly code. I'm having a hard time wrapping my head around this...Would someone help break down what is going on in this code and why it is all needed?
Many, many thanks in advance.
Since the question is really about those odd labels and data and not really about the code itself, I'm only going to shed some light on them.
If an instruction of the program causes an execution error (such as division by 0, access to an inaccessible memory region, or an attempt to execute a privileged instruction), it results in an exception (not the C++ kind of exception, but rather the interrupt kind) and forces the CPU to execute the appropriate exception handler in the OS kernel. If we were to totally disallow these exceptions, the story would be very short: the OS would simply terminate the program.
However, there are advantages to letting programs handle their own exceptions, and so the primary exception handler in the OS reflects some of the exceptions back into the program for handling. For example, a program could attempt to recover from the exception or it could save a meaningful crash report before terminating.
In either case, it is useful to know the following:
the function, where the exception has occurred, not just the offending instruction in it
the function that called that function, the function that called that one and so on
and possibly (mainly for debugging):
the line of the source code file, from which this instruction was generated
the lines where these function calls were made
the function parameters
Why do we need to know the call tree?
Well, if the program registers its own exception handlers, it usually does it something like the C++ try and catch blocks:
fxn()
{
try
{
// do something potentially harmful
}
catch()
{
// catch and handle attempts to do something harmful
}
catch()
{
// catch and handle attempts to do something harmful
}
}
If neither of those catches catches the exception, it propagates to the caller of fxn and potentially to the caller of the caller of fxn, until there's a catch that catches the exception or until it reaches the default exception handler, which simply terminates the program.
So, you need to know the code regions that each try covers and you need to know how to get to the next closest try (in the caller of fxn, for example) if the immediate try/catch doesn't catch the exception and it has to bubble up.
The ranges for try and locations of catch blocks are easy to encode in a special section of the executable and they are easy to work with (just do a binary search for the offending instruction addresses in those ranges). But figuring out the next outer try block is harder because you may need to find out the return address from the function, where the exception occurred.
And you may not always rely on rbp+8 pointing to the return address on the stack, because the compiler may optimize the code in such a way that rbp is no longer involved in accessing function parameters and local variables. You can access them through rsp+something as well and save a register and a few instructions, but given the fact that different functions allocate different number of bytes on the stack for the locals and the parameters passed to other functions and adjust rsp differently, just the value of rsp isn't enough to find out the return address and the calling function. rsp can be an arbitrary number of bytes away from where the return address is on the stack.
For such scenarios the compiler includes additional information about functions and their stack usage in a dedicated section of the executable. The exception-handling code examines this information and properly unwinds the stack when exceptions have to propagate to the calling functions and their try/catch blocks.
So, the data following _main.eh contains that additional information. Note that it explicitly encodes the beginning and the size of main() by referring to Leh_func_begin1 and Leh_func_end1-Leh_func_begin1. This piece of info allows the exception-handling code to identify main()'s instructions as main()'s.
It also appears that main() isn't very unique and some of its stack/exception info is the same as in other functions and it makes sense to share it between them. And so there's a reference to Leh_frame_common.
I can't comment further on the structure of _main.eh and the exact meaning of those constants like 144 and 13 as I don't know the format of this data. But generally one doesn't need to know these details unless they are the compiler or the debugger developers.
I hope this gives you an idea of what those labels and constants are for.
OK, let's give it a try.
// First section of code, declaring the main function, which has to be aligned on a 32-bit boundary.
UPDATE:
My explanation of the .align directive may be wrong. See gas documentation below.
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main:
Store the previous base pointer and allocate stack space for local variables.
Leh_func_begin1:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
subq $32, %rsp
Ltmp2:
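Save main's incoming arguments (argc from %edi, argv from %rsi) into local stack slots, load the address of the string into %rdi as the first argument, and call puts()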
movl %edi, %eax
movl %eax, -4(%rbp)
movq %rsi, -16(%rbp)
leaq L_.str(%rip), %rax
movq %rax, %rdi
callq _puts
Put return value on stack, free local memory, restore base pointer and return.
movl $0, -24(%rbp)
movl -24(%rbp), %eax
movl %eax, -20(%rbp)
movl -20(%rbp), %eax
addq $32, %rsp
popq %rbp
ret
Leh_func_end1:
Next section, also in the __TEXT segment (but holding string literals rather than instructions), containing the string to print.
.section __TEXT,__cstring,cstring_literals
L_.str:
.asciz "Hello, World!"
The rest is unknown to me; it could be data used by the C startup code and/or debugging info.
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
...
UPDATE:
Documentation on the .align directive from:
http://sourceware.org/binutils/docs-2.23.1/as/Align.html#Align
"The way the required alignment is specified varies from system to system. For the arc, hppa, i386 using ELF, i860, iq2000, m68k, or32, s390, sparc, tic4x, tic80 and xtensa, the first expression is the alignment request in bytes. For example `.align 8' advances the location counter until it is a multiple of 8. If the location counter is already a multiple of 8, no change is needed. For the tic54x, the first expression is the alignment request in words.
For other systems, including ppc, i386 using a.out format, arm and strongarm, it is the number of low-order zero bits the location counter must have after advancement. For example `.align 3' advances the location counter until it a multiple of 8. If the location counter is already a multiple of 8, no change is needed.
This inconsistency is due to the different behaviors of the various native assemblers for these systems which GAS must emulate. GAS also provides .balign and .p2align directives, described later, which have a consistent behavior across all architectures (but are specific to GAS)."
//jk
You can find the answers for pretty much any questions you've got related to the directives here and here.
For example:
.section __TEXT,__text,regular,pure_instructions
Declares a section named __TEXT,__text with the default section type and specify that this section will contain only machine code (i.e. no data).
.globl _main
Makes the _main label (symbol) global, so that it will be visible to the linker.
.align 4, 0x90
Aligns the location counter to the next 2^4 (==16) byte boundary. The space in between will be filled with the value 0x90 (==NOP).
As for the code itself, it's obviously doing a lot of redundant intermediary loads and stores. Try compiling with optimizations enabled, as one of the commenters suggested, and you should find that the resulting code makes more sense.

Resources