How to add inline comments to generated assembly from arduino - c

I am trying to add inline comments to the generated assembly on my arduino. So for instance
/*
Testing
*/
#include <avr/io.h>
#include <iostream>
int ledPin = 13;
void setup()
{
asm volatile("\n# comment 1");
pinMode(ledPin, OUTPUT);
}
void loop()
{
asm volatile("\n# comment 2");
digitalWrite(ledPin, HIGH);
delay(1000);
digitalWrite(ledPin, LOW);
delay(1000);
}
when the assembly is generated for the code i want to see "comment 1" and "comment 2" as a marker at that particular line in the assembly code.
Here is the assembly generated code
C:\Users\****\AppData\Local\Temp\build1888832469367065438.tmp\sketch_jul17a.cpp.o: file format elf32-avr
Disassembly of section .text.loop:
00000000 <loop>:
loop():
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:19
0: 80 91 00 00 lds r24, 0x0000
4: 61 e0 ldi r22, 0x01 ; 1
6: 0e 94 00 00 call 0 ; 0x0 <loop>
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:20
a: 68 ee ldi r22, 0xE8 ; 232
c: 73 e0 ldi r23, 0x03 ; 3
e: 80 e0 ldi r24, 0x00 ; 0
10: 90 e0 ldi r25, 0x00 ; 0
12: 0e 94 00 00 call 0 ; 0x0 <loop>
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:21
16: 80 91 00 00 lds r24, 0x0000
1a: 60 e0 ldi r22, 0x00 ; 0
1c: 0e 94 00 00 call 0 ; 0x0 <loop>
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:22
20: 68 ee ldi r22, 0xE8 ; 232
22: 73 e0 ldi r23, 0x03 ; 3
24: 80 e0 ldi r24, 0x00 ; 0
26: 90 e0 ldi r25, 0x00 ; 0
28: 0e 94 00 00 call 0 ; 0x0 <loop>
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:23
2c: 08 95 ret
Disassembly of section .text.setup:
00000000 <setup>:
setup():
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:13
0: 80 91 00 00 lds r24, 0x0000
4: 61 e0 ldi r22, 0x01 ; 1
6: 0e 94 00 00 call 0 ; 0x0 <setup>
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:14
a: 08 95 ret
The comments are not included in the assembly code, how can I do this

First of all comments in "regular" (not cpp pre-processed) assembler code begin with a hash sign, not with two slashes. So you might want that a comment named "# onesectimer" is in the assembler code.
This may be achieved the following way:
asm("\n# onesectimer");
void OneSecTimer()
{
if(bags!=0){
asm("\n# for counter 1");
if(counter1 == 3)
...
--- Edit ---
Reading your comments and your edits I think you are mixing the words "assembly" and "disassembly":
When translating C code to binary code the C compiler generates "assembly" code. This code may contain comments:
# This is a comment
lds r24, 0
ldi r22, 1
call digitalWrite
The assembler then translates this "assembly" code to binary code. In binary code there is no information about comments any more but only the binary data to be written to the memory.
The "disassembly" translates the binary data back to assembly code but only information that is present in binary code may be disassembled - so you cannot have any comments in disassembly code!
What you may do is to insert a symbol into the object file at a point of interest:
digitalWrite(0,1);
asm volatile(".global Here_is_Delay\nHere_is_Delay:");
delay(1000);
The names of these symbols must be unique over the whole project and not be identical to any function or variable name used.
Depending on the disassembler (not sure about the AVR one) you'll see the symbols then:
loop():
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:19
0: 80 91 00 00 lds r24, 0x0000
4: 61 e0 ldi r22, 0x01 ; 1
6: 0e 94 00 00 call 0 ; 0x0 <loop>
Here_is_Delay():
C:\Program Files (x86)\Arduino/sketch_jul17a.ino:20
a: 68 ee ldi r22, 0xE8 ; 232
c: 73 e0 ldi r23, 0x03 ; 3
...

Related

How can I remove null bytes from my object code?

I want to use my own shellcode for a buffer overflow exploit so for that I have written a script in C language[shellcode script].
I have used the following commands.:
gcc -c file.c -o file.o
objdump -sS -D file.o
root#kali:~/shellcode# cat file.c
#include<stdio.h>
int main()
{
printf("Hi");
}
The above code is of 'file.c'.
I expect the output of the 'objdump -sS -D file.o' to be free from null-bytes, but actually this is my output after typing that command:
file.o: file format elf64-x86-64
Contents of section .text:
0000 554889e5 488d3d00 000000b8 00000000 UH..H.=.........
0010 e8000000 00b80000 00005dc3 ..........].
Contents of section .rodata:
0000 486900 Hi.
Contents of section .comment:
0000 00474343 3a202844 65626961 6e20382e .GCC: (Debian 8.
0010 332e302d 36292038 2e332e30 00 3.0-6) 8.3.0.
Contents of section .eh_frame:
0000 14000000 00000000 017a5200 01781001 .........zR..x..
0010 1b0c0708 90010000 1c000000 1c000000 ................
0020 00000000 1c000000 00410e10 8602430d .........A....C.
0030 06570c07 08000000 .W......
Disassembly of section .text:
0000000000000000 <main>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # b <main+0xb>
b: b8 00 00 00 00 mov $0x0,%eax
10: e8 00 00 00 00 callq 15 <main+0x15>
15: b8 00 00 00 00 mov $0x0,%eax
1a: 5d pop %rbp
1b: c3 retq
Disassembly of section .rodata:
0000000000000000 <.rodata>:
0: 48 rex.W
1: 69 .byte 0x69
...
Disassembly of section .comment:
0000000000000000 <.comment>:
0: 00 47 43 add %al,0x43(%rdi)
3: 43 3a 20 rex.XB cmp (%r8),%spl
6: 28 44 65 62 sub %al,0x62(%rbp,%riz,2)
a: 69 61 6e 20 38 2e 33 imul $0x332e3820,0x6e(%rcx),%esp
11: 2e 30 2d 36 29 20 38 xor %ch,%cs:0x38202936(%rip) # 3820294e <main+0x3820294e>
18: 2e 33 2e xor %cs:(%rsi),%ebp
1b: 30 00 xor %al,(%rax)
Disassembly of section .eh_frame:
0000000000000000 <.eh_frame>:
0: 14 00 adc $0x0,%al
2: 00 00 add %al,(%rax)
4: 00 00 add %al,(%rax)
6: 00 00 add %al,(%rax)
8: 01 7a 52 add %edi,0x52(%rdx)
b: 00 01 add %al,(%rcx)
d: 78 10 js 1f <.eh_frame+0x1f>
f: 01 1b add %ebx,(%rbx)
11: 0c 07 or $0x7,%al
13: 08 90 01 00 00 1c or %dl,0x1c000001(%rax)
19: 00 00 add %al,(%rax)
1b: 00 1c 00 add %bl,(%rax,%rax,1)
1e: 00 00 add %al,(%rax)
20: 00 00 add %al,(%rax)
22: 00 00 add %al,(%rax)
24: 1c 00 sbb $0x0,%al
26: 00 00 add %al,(%rax)
28: 00 41 0e add %al,0xe(%rcx)
2b: 10 86 02 43 0d 06 adc %al,0x60d4302(%rsi)
31: 57 push %rdi
32: 0c 07 or $0x7,%al
34: 08 00 or %al,(%rax)
...
Can somebody please explain me how I can remove null-bytes from this program, or if possible write the output in assembly so that I can learn what to change and how
P.S - I know mov $0x0, $rsp can be done by xor $rsp, $rsp but I don't know about movq, lea, add, sub, etc.
Thank you for your precious time.
Removing nullbytes (\x00) from shellcode is only necessary if you are using functions that depend on a trailing \x00, such as strcpy:
char * strcpy ( char * destination, const char * source );
which copies the C string pointed by source into the array pointed by destination, including the terminating null character (and stopping at that point).
However strncpy copies the first num characters of source to destination, padding it with zeros until num characters have been written to destination.
char * strncpy ( char * destination, const char * source, size_t num );
This means if you pass your shellcode size/length to the parameter num, it will copy all characters into the buffer, without the hassle of removing nullbytes as they aren't terminating copying from source to destination.
To get the length of the shellcode:
#include <stdio.h>
#include <string.h>
/*
 * Print the length of the shellcode string.
 * Note: strlen() stops at the first NUL byte, so this only measures
 * shellcode that contains no embedded \x00.
 */
int main(void)
{
    /* String literals must not be modified, so point at them through const. */
    const char *evil = "\x90\x83\xc8\xff\xf7\xd0\x50";
    /* strlen() returns size_t; %zu is the matching conversion (%d is undefined behavior). */
    printf("%zu", strlen(evil));
    return 0;
}
will return:
7

_delay_us(); timing explanation

I made a program with atmega16 and was trying to make my own delay_us() so I looked into the compiler avr-gcc library for _delay_us() function and this is its code:
static inline void _delay_us(double __us) __attribute__((always_inline));
/*
\ingroup util_delay
Perform a delay of \c __us microseconds, using _delay_loop_1().
The macro F_CPU is supposed to be defined to a
constant defining the CPU clock frequency (in Hertz).
The maximal possible delay is 768 us / F_CPU in MHz.
If the user requests a delay greater than the maximal possible one,
_delay_us() will automatically call _delay_ms() instead. The user
will not be informed about this case.
*/
void
_delay_us(double __us)
{
uint8_t __ticks;
// F_CPU / 3e6 = number of 3-cycle loop iterations per microsecond; multiplied by the requested delay in us
double __tmp = ((F_CPU) / 3e6) * __us;
if (__tmp < 1.0)
__ticks = 1;
else if (__tmp > 255)
{
// iteration count does not fit in the 8-bit counter: hand the delay over to _delay_ms()
_delay_ms(__us / 1000.0);
return;
}
else
__ticks = (uint8_t)__tmp;
_delay_loop_1(__ticks); // decrements the tick count until it reaches 0 (each iteration takes 3 cycles)
}
I got confused about how, if I use a 1 MHz clock, this function that contains floating point arithmetic will be able to make small delays (like _delay_us(10)), because executing all the setup code will definitely take more time than that. So I wrote this program:
#include <avr/io.h>
#include <avr/interrupt.h>
#include <util/delay.h>
#define F_CPU 1000000UL
int main()
{
_delay_ms(1000);
DDRB=0XFF;
PORTB=0XFF;
_delay_us(10);
PORTB=0;
for(;;){}
return 0;
}
I simulated it using Proteus and used an oscilloscope, connecting one of the PORTB pins to its input. Then I saw that the delay was exactly 10 us. How could the delay be that accurate considering this setup code statement that uses floating point arithmetic:
double __tmp = ((F_CPU) / 4e3) * __ms;
This should have taken a lot of cycles that makes _delay_us(10) exceed the 10 us period, but the time was exactly 10us!!
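All the floating point arithmetic is evaluated at compile time, not at run time: the delay functions are declared always_inline, so when the argument is a compile-time constant (and optimization is enabled) the compiler folds the whole calculation into a single integer. So at the point the MCU executes the code, all that's left is a loop that uses an integer to do the delay.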
typedef unsigned char uint8_t;
#define F_CPU 16000000
extern void _delay_loop_1( uint8_t );
/*
 * Stand-alone copy of the avr-libc delay logic, used to inspect what gcc generates.
 * Converts a delay in microseconds into a count for _delay_loop_1(); a count
 * larger than 255 is handed to _delay_ms() instead.
 * NOTE(review): no prototype for _delay_ms() is visible in this snippet - confirm it is declared elsewhere.
 */
static void _delay_us(double __us)
{
uint8_t __ticks;
/* F_CPU / 3e6 scales microseconds to loop counts; with a constant argument gcc can evaluate this at compile time */
double __tmp = ((F_CPU) / 3e6) * __us;
if (__tmp < 1.0)
{
/* never request less than one loop iteration */
__ticks = 1;
}
else
{
if (__tmp > 255)
{
/* count does not fit in uint8_t: fall back to the millisecond delay */
_delay_ms(__us / 1000.0);
return;
}
else
{
__ticks = (uint8_t)__tmp;
}
}
_delay_loop_1(__ticks);
}
void fun1 ( void )
{
_delay_us(10);
}
which with gcc can produce this:
00000000 <fun1>:
0: 85 e3 ldi r24, 0x35 ; 53
2: 00 c0 rjmp .+0 ; 0x4 <__zero_reg__+0x3>
the number to feed _delay_loop_1 is computed at compile time not runtime, all of that dead code goes away.
but add this:
void fun2 ( void )
{
uint8_t ra;
for(ra=1;ra<10;ra++) _delay_us(ra);
}
and things change dramatically.
00000000 <fun1>:
0: 85 e3 ldi r24, 0x35 ; 53
2: 00 c0 rjmp .+0 ; 0x4 <fun2>
00000004 <fun2>:
4: 8f 92 push r8
6: 9f 92 push r9
8: af 92 push r10
a: bf 92 push r11
c: cf 92 push r12
e: df 92 push r13
10: ef 92 push r14
12: ff 92 push r15
14: cf 93 push r28
16: c1 e0 ldi r28, 0x01 ; 1
18: 6c 2f mov r22, r28
1a: 70 e0 ldi r23, 0x00 ; 0
1c: 80 e0 ldi r24, 0x00 ; 0
1e: 90 e0 ldi r25, 0x00 ; 0
20: 00 d0 rcall .+0 ; 0x22 <fun2+0x1e>
22: 86 2e mov r8, r22
24: 97 2e mov r9, r23
26: a8 2e mov r10, r24
28: b9 2e mov r11, r25
2a: 2b ea ldi r18, 0xAB ; 171
2c: 3a ea ldi r19, 0xAA ; 170
2e: 4a ea ldi r20, 0xAA ; 170
30: 50 e4 ldi r21, 0x40 ; 64
32: 00 d0 rcall .+0 ; 0x34 <fun2+0x30>
34: c6 2e mov r12, r22
36: d7 2e mov r13, r23
38: e8 2e mov r14, r24
3a: f9 2e mov r15, r25
3c: 20 e0 ldi r18, 0x00 ; 0
3e: 30 e0 ldi r19, 0x00 ; 0
40: 40 e8 ldi r20, 0x80 ; 128
42: 5f e3 ldi r21, 0x3F ; 63
44: 00 d0 rcall .+0 ; 0x46 <fun2+0x42>
46: 87 fd sbrc r24, 7
48: 00 c0 rjmp .+0 ; 0x4a <fun2+0x46>
4a: 20 e0 ldi r18, 0x00 ; 0
4c: 30 e0 ldi r19, 0x00 ; 0
4e: 4f e7 ldi r20, 0x7F ; 127
50: 53 e4 ldi r21, 0x43 ; 67
52: 9f 2d mov r25, r15
54: 8e 2d mov r24, r14
56: 7d 2d mov r23, r13
58: 6c 2d mov r22, r12
5a: 00 d0 rcall .+0 ; 0x5c <fun2+0x58>
5c: 18 16 cp r1, r24
5e: 04 f0 brlt .+0 ; 0x60 <fun2+0x5c>
60: 9f 2d mov r25, r15
62: 8e 2d mov r24, r14
64: 7d 2d mov r23, r13
66: 6c 2d mov r22, r12
68: 00 d0 rcall .+0 ; 0x6a <fun2+0x66>
6a: 86 2f mov r24, r22
6c: 00 d0 rcall .+0 ; 0x6e <fun2+0x6a>
6e: cf 5f subi r28, 0xFF ; 255
70: ca 30 cpi r28, 0x0A ; 10
72: 01 f4 brne .+0 ; 0x74 <fun2+0x70>
74: cf 91 pop r28
76: ff 90 pop r15
78: ef 90 pop r14
7a: df 90 pop r13
7c: cf 90 pop r12
7e: bf 90 pop r11
80: af 90 pop r10
82: 9f 90 pop r9
84: 8f 90 pop r8
86: 08 95 ret
88: 20 e0 ldi r18, 0x00 ; 0
8a: 30 e0 ldi r19, 0x00 ; 0
8c: 4a e7 ldi r20, 0x7A ; 122
8e: 54 e4 ldi r21, 0x44 ; 68
90: 9b 2d mov r25, r11
92: 8a 2d mov r24, r10
94: 79 2d mov r23, r9
96: 68 2d mov r22, r8
98: 00 d0 rcall .+0 ; 0x9a <fun2+0x96>
9a: 00 d0 rcall .+0 ; 0x9c <fun2+0x98>
9c: 00 c0 rjmp .+0 ; 0x9e <fun2+0x9a>
9e: 81 e0 ldi r24, 0x01 ; 1
a0: 00 c0 rjmp .+0 ; 0xa2 <__SREG__+0x63>
hmm, how good is the optimizer?
void fun3 ( void )
{
uint8_t ra;
for(ra=20;ra<22;ra++) _delay_us(ra);
}
thought so
00000004 <fun3>:
4: 8a e6 ldi r24, 0x6A ; 106
6: 00 d0 rcall .+0 ; 0x8 <fun3+0x4>
8: 80 e7 ldi r24, 0x70 ; 112
a: 00 c0 rjmp .+0 ; 0xc <fun3+0x8>
figured the count to 10 would have done it.
A lot of times you will see delay loop functions like this get used with hardcoded values because the spec for whatever you are bit banging, etc has those values and it is easy when the thing says pop reset then wait 100us, you just call a delay with 100 in it. Now if there is one file with:
fun4(10);
and another file (another optimization domain) you have the above with this added:
void fun4 ( uint8_t x)
{
_delay_us(x);
}
You can then understand where this is headed: the computation happens at runtime. You don't even need to compile it to see that it will have the problem. Now, with some compilers like LLVM you can optimize across file domains, but they don't target the AVR, and their MSP430 support was a publicity stunt more than reality, as it doesn't work and is not supported. Their ARM support is obviously good, but they change their command line options pretty much every minor release; I have long since gotten tired of trying to use them, as I have to constantly change my makefiles to keep up, and their optimized code is sadly not as fast as gcc's, despite gcc's code getting worse every release and LLVM's getting a little better (worse/better are in the eyes of the beholder, of course).

How does a compiled "Hello World" C program store the String using machine language?

so I've started learning about machine language today. I wrote a basic "Hello World" program in C which prints "Hello, world!" ten times using a for loop. I then used the Gnu Debugger to disassemble main and look at the code in machine language (my computer has a x86 processor and I've set gdb up to use intel syntax):
user#PC:~/Path/To/Code$ gdb -q ./a.out
Reading symbols from ./a.out...done.
(gdb) list
1 #include <stdio.h>
2
3 int main()
4 {
5 int i;
6 for(i = 0; i < 10; i++) {
7 printf("Hello, world!\n");
8 }
9 return 0;
10 }
(gdb) disassemble main
Dump of assembler code for function main:
0x0804841d <+0>: push ebp
0x0804841e <+1>: mov ebp,esp
0x08048420 <+3>: and esp,0xfffffff0
0x08048423 <+6>: sub esp,0x20
0x08048426 <+9>: mov DWORD PTR [esp+0x1c],0x0
0x0804842e <+17>: jmp 0x8048441 <main+36>
0x08048430 <+19>: mov DWORD PTR [esp],0x80484e0
0x08048437 <+26>: call 0x80482f0 <puts#plt>
0x0804843c <+31>: add DWORD PTR [esp+0x1c],0x1
0x08048441 <+36>: cmp DWORD PTR [esp+0x1c],0x9
0x08048446 <+41>: jle 0x8048430 <main+19>
0x08048448 <+43>: mov eax,0x0
0x0804844d <+48>: leave
0x0804844e <+49>: ret
End of assembler dump.
(gdb) x/s 0x80484e0
0x80484e0: "Hello, world!"
I understand most of the machine code and what each of the commands do. If I understood it correctly, the address "0x80484e0" is loaded into the esp register so that can use the memory at this address. I examined the address, and to no surprise it contained the desired string. My question now is - how did that string get there in the first place? I can't find a part in the program that sets the string up at this location.
I also don't understand something else: When I first start the program, the eip points to <main+9>, where the variable i is initialized at [esp+0x1c]. However, the address that esp points to is changed later on in the program (to 0x80484e0), but [esp+0x1c] is still used for "i" after that change. Shouldn't the address [esp+0x1c] change when the address esp points to changes?
A binary or program is made up of both machine code and data. In this case, your string, which you put in the source code, was taken by the compiler as data, which is just bytes, and because of how it was used it was considered read-only data, so depending on the compiler that might land in .rodata or .text or some other name the compiler might use. Gcc would probably call it .rodata. The program itself is in .text. The linker comes along and, when it links things, finds a place for .text, .data, .bss, .rodata, and any other items you may have, and then connects the dots. In the case of your call to printf, the linker knows where it put the string (the array of bytes), and it was told what its name was (some internal temporary name, no doubt), and the printf call was told about that name too, so the linker patches up the instruction to grab the address of the format string before calling printf.
Disassembly of section .text:
0000000000400430 <main>:
400430: 53 push %rbx
400431: bb 0a 00 00 00 mov $0xa,%ebx
400436: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40043d: 00 00 00
400440: bf e4 05 40 00 mov $0x4005e4,%edi
400445: e8 b6 ff ff ff callq 400400 <puts#plt>
40044a: 83 eb 01 sub $0x1,%ebx
40044d: 75 f1 jne 400440 <main+0x10>
40044f: 31 c0 xor %eax,%eax
400451: 5b pop %rbx
400452: c3 retq
400453: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40045a: 00 00 00
40045d: 0f 1f 00 nopl (%rax)
Disassembly of section .rodata:
00000000004005e0 <_IO_stdin_used>:
4005e0: 01 00 add %eax,(%rax)
4005e2: 02 00 add (%rax),%al
4005e4: 48 rex.W
4005e5: 65 6c gs insb (%dx),%es:(%rdi)
4005e7: 6c insb (%dx),%es:(%rdi)
4005e8: 6f outsl %ds:(%rsi),(%dx)
4005e9: 2c 20 sub $0x20,%al
4005eb: 77 6f ja 40065c <__GNU_EH_FRAME_HDR+0x68>
4005ed: 72 6c jb 40065b <__GNU_EH_FRAME_HDR+0x67>
4005ef: 64 21 00 and %eax,%fs:(%rax)
the compiler will have encoded this instruction but left the address as zeros probably or some fill
400440: bf e4 05 40 00 mov $0x4005e4,%edi
so that the linker could fill it in later. The gnu disassembler attempts to disassemble the .rodata (and .data, etc.) blocks, which doesn't make sense, so ignore the instructions shown there: it is trying to interpret your string, which starts at address 0x4005e4, as code.
Before linking a disassembly of the object shows the two sections .text and .rodata
Disassembly of section .text.startup:
0000000000000000 <main>:
0: 53 push %rbx
1: bb 0a 00 00 00 mov $0xa,%ebx
6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
d: 00 00 00
10: bf 00 00 00 00 mov $0x0,%edi
15: e8 00 00 00 00 callq 1a <main+0x1a>
1a: 83 eb 01 sub $0x1,%ebx
1d: 75 f1 jne 10 <main+0x10>
1f: 31 c0 xor %eax,%eax
21: 5b pop %rbx
22: c3 retq
0000000000000000 <.rodata.str1.1>:
0: 48 rex.W
1: 65 6c gs insb (%dx),%es:(%rdi)
3: 6c insb (%dx),%es:(%rdi)
4: 6f outsl %ds:(%rsi),(%dx)
5: 2c 20 sub $0x20,%al
7: 77 6f ja 78 <main+0x78>
9: 72 6c jb 77 <main+0x77>
b: 64 21 00 and %eax,%fs:(%rax)
unlinked it has to just pad this address/offset for the linker to fill in later.
10: bf 00 00 00 00 mov $0x0,%edi
also note the object contains only the string in .rodata. linking with libraries and other items to make it a complete program clearly added more .rodata, but the linker manages all of that.
Perhaps easier to see with this example
/* External function; only its call site matters for this demonstration. */
void more_fun ( unsigned int, unsigned int, unsigned int );
/* Uninitialized global: ends up in .bss. */
unsigned int a;
/* Initialized global: ends up in .data. */
unsigned int b=5;
/* Const initialized global: ends up in .rodata; the compiler can hardcode its value at the use site. */
const unsigned int c=7;
/* Passes one variable of each kind so the disassembly shows how each one is accessed. */
void fun ( void )
{
more_fun(a,b,c);
}
disassembled as a object
Disassembly of section .text:
0000000000000000 <fun>:
0: 8b 35 00 00 00 00 mov 0x0(%rip),%esi # 6 <fun+0x6>
6: 8b 3d 00 00 00 00 mov 0x0(%rip),%edi # c <fun+0xc>
c: ba 07 00 00 00 mov $0x7,%edx
11: e9 00 00 00 00 jmpq 16 <fun+0x16>
Disassembly of section .data:
0000000000000000 <b>:
0: 05 .byte 0x5
1: 00 00 add %al,(%rax)
...
Disassembly of section .rodata:
0000000000000000 <c>:
0: 07 (bad)
1: 00 00 add %al,(%rax)
...
and for whatever reason you have to link it to see the .bss section. The point of the example is the machine code for the function is in .text, the uninitialized global is in .bss, the initialized global is .data and the const initialized global is .rodata. The compiler was smart enough to know that a const even if it is global wont change so it can just hardcode that value into the math and not need to read from ram, but the other two variables it has to read from ram so generates an instruction with the address zeros to be filled in by the linker at link time.
In your case your read only/const data was a collection of bytes and it wasnt a math operation so the bytes as defined in your source file were placed in memory so they could be pointed at as the first parameter to printf.
There is more to a binary than just machine code. And the compiler and linker can have things placed in memory for the machine code to get, the machine code itself does not have to write every value that will be used by the rest of the machine code.
The compiler 'hard wires' the string into the object code and the linker then 'hard wires' it into the machine code.
Note that the string is embedded into the code, and not stored in a data area, meaning that if you took a pointer to the string and attempted to change it you would get an exception.

Bitwise shift of an array of chars in AVR C

Initially I need to send and receive serially some data. The packet length is 48 bits.
For shorter packets (32 bits) I could do something like that:
unsigned long data=0x12345678;
for(i=0;i<32;i++){
if(data & 0x80000000)
setb_MOD;
else
clrb_MOD;
data <<= 1;
}
This code compilation is really pleasing me:
code<<=1;
ac: 88 0f add r24, r24
ae: 99 1f adc r25, r25
b0: aa 1f adc r26, r26
b2: bb 1f adc r27, r27
b4: 80 93 63 00 sts 0x0063, r24
b8: 90 93 64 00 sts 0x0064, r25
bc: a0 93 65 00 sts 0x0065, r26
c0: b0 93 66 00 sts 0x0066, r27
After I needed to extend the packet (to 48 bits) I faced with the need to shift an array:
unsigned char data[6]={0x12,0x34,0x56,0x78,0xAB,0xCD};
for(i=0;i<48;i++){
if(data[5] & 0x80)
setb_MOD;
else
clrb_MOD;
for(j=5;j>0;j--){
data[j]<<=1;
if(data[j-1] & 0x80)
data[j]+=1;
}
data[0] <<= 1;
}
The compiled code depends slightly on the optimization settings, but generally it does what I commanded in C:
for(j=5;j>0;j--){
code[j]<<=1;
a8: 82 91 ld r24, -Z
aa: 88 0f add r24, r24
ac: 80 83 st Z, r24
if(code[j-1]&0x80)
ae: 9e 91 ld r25, -X
b0: 97 fd sbrc r25, 7
b2: 13 c0 rjmp .+38 ; 0xda <__vector_2+0x74>
clrb_MOD;
}
else{
setb_MOD;
}
for(j=5;j>0;j--){
b4: 80 e0 ldi r24, 0x00 ; 0
b6: a3 36 cpi r26, 0x63 ; 99
b8: b8 07 cpc r27, r24
ba: b1 f7 brne .-20 ; 0xa8 <__vector_2+0x42>
code[j]<<=1;
if(code[j-1]&0x80)
code[j]+=1;
}
As you can see there is no obvious (for a human) solution to shift an array byte after byte.
I'd like to avoid inline assembler, as I haven't really mastered this technique and I don't really understand how to address C variables in asm. Are there any alternatives?
If you know your input is less than 64 bits, you can do something like (assuming stdint.h is available, otherwise convert to unsigned long long, etc):
/* Overlay one 64-bit value with narrower words/bytes so the whole packet can be shifted in one operation. */
union BitShifter
{
    uint64_t u64;
    uint32_t u32[2];
    uint16_t u16[4];
    uint8_t u8[8];
};
/* The declared name must match the name used below (the original differed in letter case). */
union BitShifter MyBitShifter;
/* Shift the entire 64-bit value left by one bit. */
MyBitShifter.u64 <<= 1;
The compiler should use the best instruction to accomplish that (probably two 32 bit shifts and some other logic to get the bit from one word to another). Of course, the backend might be lazy and do it as bytes...
Depending on the endianness of the AVR, you'll have to swizzle your bytes into the right order to get the outgoing bit order correct.

Creation and addressing arrays in AVR Assembly (Using the ATMega8535)

I am having trouble with the creation and addressing of an array created purely in assembly using the instruction set for the Atmel ATMega8535.
What I understand so far is as follows:
The array contains contiguous data that is equal in length.
The creation of the array involves defining the beginning and end locations of the array (much like you would the stack).
You would address an index in the array by adding an offset of the base address of the array.
What I am looking to do specifically is create a 1-D array of 8-bit integers with predefined values populating it during initialization it does not have to be written to, only addressed when needed. The problem ultimately lying in not being able to translate the logic into the assembly code.
I have tried with little progress to do so using support from the following books:
Some Assembly Required: Assembly Language Programming with the AVR Microcontroller by Timothy S Margush
Get Going with...AVR Microcontrollers by Peter Sharpe
Any help, advice or further resources would be greatly appreciated.
If your array is read-only, you do not need to copy it to RAM. You can
keep it in Flash and read it from there when needed. This will save you
precious RAM, at the cost of slower access (read from RAM is 2 cycles,
read from flash is 3 cycles).
You can declare your array like this:
.global my_array
.type my_array, #object
my_array:
.byte 12, 34, 56, 78
Then, to read a member of the array, you have to compute:
address of member = array base address + member index
If your members were more than one byte, you would have to also multiply
the index by the size, but this is not the case here. Then, you put the
address of the required member in the Z register and issue an lpm
instruction. Here is a function implementing this logic:
.global read_data
; input: r24 = array index, r1 = 0
; output: r24 = array value
; clobbers: r30, r31
read_data:
ldi r30, lo8(my_array) ; load Z = address of my_array
ldi r31, hi8(my_array) ; ...high byte also
add r30, r24 ; add the array index
adc r31, r1 ; ...and add 0 to propagate the carry
lpm r24, Z
ret
#scottt advised you to first write in C, then look at the generated
assembly. I consider this very good advice, let's follow it:
#include <stdint.h>
/* Read-only lookup table; __flash is the gcc named-address-space qualifier that keeps it in flash. */
__flash const uint8_t my_array[] = {12, 34, 56, 78};
/*
 * Return the element of my_array at the given index.
 * No bounds check is performed on index.
 */
uint8_t read_data(uint8_t index)
{
return my_array[index];
}
The __flash keyword identifying a “named address space” is an embedded
C extension supported by
gcc. The
generated assembly is slightly different from the previous one: instead
of computing base_address + index, gcc does index − (−base_address):
read_data:
mov r30, r24 ; load Z = array index
ldi r31, 0 ; ...high byte of index is 0
subi r30, lo8(-(my_array)) ; subtract -(address of my array)
sbci r31, hi8(-(my_array)) ; ...high byte also
lpm r24, Z
ret
This is just as efficient as the previous hand-rolled assembly, except
that it does not need the r1 register to be initialized to zero. But
keeping r1 to zero is part of the gcc ABI anyway, so it should make no
difference.
The role of the linker
This section is meant to answer the question in the comment: how can we
access the array if we do not know its address? The answer is: we access
it by its name, just like in the code snippets above. Choosing the final
address for the array, as well as replacing the name by the appropriate
address, is the linker’s job.
Assembling (with avr-gcc -c) and disassembling (with avr-objdump -d)
the first code snippet gives this:
my_array.o, section .text:
00000000 <my_array>:
0: 0c 22 38 4e ."8N
If we were compiling from C, gcc would have put the array in the
.progmem.data section instead of .text, but it makes little difference.
The numbers “0c 22 38 4e” are the array contents, in hex. The characters
to the right are the ASCII equivalents, ‘.’ being the placeholder for
non printing characters.
The object file also carries this symbol table, shown by avr-nm:
my_array.o:
00000000 T my_array
meaning the symbol “my_array” has been defined as referring to offset 0
into the .text section (implied by “T”) of this object.
Assembling and disassembling the second code snippet gives this:
read_data.o, section .text:
00000000 <read_data>:
0: e0 e0 ldi r30, 0x00
2: f0 e0 ldi r31, 0x00
4: e8 0f add r30, r24
6: f1 1d adc r31, r1
8: 84 91 lpm r24, Z
a: 08 95 ret
Comparing the disassembly with the actual source code, it can be seen
that the assembler replaced the address of my_array with 0x00, which is
almost guaranteed to be wrong. But it also left a note to the linker in
the form of “relocation records”, shown by avr-objdump -r:
read_data.o, RELOCATION RECORDS FOR [.text]:
OFFSET TYPE VALUE
00000000 R_AVR_LO8_LDI my_array
00000002 R_AVR_HI8_LDI my_array
This tells the linker that the ldi instructions at offsets 0x00 and
0x02 are intended to load the low byte and the high byte (respectively)
of the final address of my_array. The object file also carries this
symbol table:
read_data.o:
U my_array
00000000 T read_data
where the “U” line means the file makes use of an undefined symbol named
“my_array”.
Linking these pieces together, with a suitable main(), yields a binary
containing the C runtime from avr-libc, together with our code:
0000003c <my_array>:
3c: 0c 22 38 4e ."8N
00000040 <read_data>:
40: ec e3 ldi r30, 0x3C
42: f0 e0 ldi r31, 0x00
44: e8 0f add r30, r24
46: f1 1d adc r31, r1
48: 84 91 lpm r24, Z
4a: 08 95 ret
It should be noted that, not only has the linker moved the pieces around
to their final addresses, it has also fixed the arguments of the ldi
instructions so that they now point to the correct address of my_array.
The code should look something like this:
.section .text
.global main
main:
ldi r30,lo8(data)
ldi r31,hi8(data)
ldd r24,Z+3
sts output,r24
ld r24,Z
sts output,r24
ldi r24,0
ldi r25,0
ret
.global data
.data
data:
.byte 1, 2, 3, 4
.comm output,1,1
Explanation
For people who have programmed in assembler using the GNU toolchain before, there are lessons that are transferable even to unfamiliar instruction sets:
You reserve space for an array with the assembler directives .byte 1, 2, 3, 4, .word 1, 2 (.word is 16 bits for AVR) or .space 100.
When learning a new instruction set, write C programs and ask the C compiler to generate assembler output. Find a good assembler programming reference for the instruction set as you read the assembler code.
Applying this trick below.
byte-array.c
/* volatile, so the stores to output don't get optimized out even when compiler optimization is on */
volatile char output;
/* initialized, writable global array */
char data[] = { 1, 2, 3, 4 };
int main(void)
{
/* read the last element, then the first, storing each into output */
output = data[3];
output = data[0];
return 0;
}
Generate Assembler from C
avr-gcc -mmcu=atmega8 -Wall -Os -S byte-array.c
This will generate the assembler file byte-array.s.
byte-array.s
.file "byte-array.c"
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__SREG__ = 0x3f
__tmp_reg__ = 0
__zero_reg__ = 1
.section .text.startup,"ax",#progbits
.global main
.type main, #function
main:
/* prologue: function */
/* frame size = 0 */
/* stack size = 0 */
.L__stack_usage = 0
ldi r30,lo8(data)
ldi r31,hi8(data)
ldd r24,Z+3
sts output,r24
ld r24,Z
sts output,r24
ldi r24,0
ldi r25,0
ret
.size main, .-main
.global data
.data
.type data, #object
.size data, 4
data:
.byte 1
.byte 2
.byte 3
.byte 4
.comm output,1,1
.ident "GCC: (Fedora 4.9.2-1.fc21) 4.9.2"
.global __do_copy_data
.global __do_clear_bss
Read this explanation of Pointer Registers to see how the AVR instruction set uses the r30, r31 register pair as the pointer register Z. Read up on the ld, st, ldi, ldd, sts and std instructions.
Implementation Notes
If you link the program then disassemble it:
avr-gcc -mmcu=atmega8 -Os byte-array.c -o byte-array.elf
avr-objdump -d byte-array.elf
00000000 <__vectors>:
0: 12 c0 rjmp .+36 ; 0x26 <__ctors_end>
2: 2c c0 rjmp .+88 ; 0x5c <__bad_interrupt>
4: 2b c0 rjmp .+86 ; 0x5c <__bad_interrupt>
6: 2a c0 rjmp .+84 ; 0x5c <__bad_interrupt>
8: 29 c0 rjmp .+82 ; 0x5c <__bad_interrupt>
a: 28 c0 rjmp .+80 ; 0x5c <__bad_interrupt>
c: 27 c0 rjmp .+78 ; 0x5c <__bad_interrupt>
e: 26 c0 rjmp .+76 ; 0x5c <__bad_interrupt>
10: 25 c0 rjmp .+74 ; 0x5c <__bad_interrupt>
12: 24 c0 rjmp .+72 ; 0x5c <__bad_interrupt>
14: 23 c0 rjmp .+70 ; 0x5c <__bad_interrupt>
16: 22 c0 rjmp .+68 ; 0x5c <__bad_interrupt>
18: 21 c0 rjmp .+66 ; 0x5c <__bad_interrupt>
1a: 20 c0 rjmp .+64 ; 0x5c <__bad_interrupt>
1c: 1f c0 rjmp .+62 ; 0x5c <__bad_interrupt>
1e: 1e c0 rjmp .+60 ; 0x5c <__bad_interrupt>
20: 1d c0 rjmp .+58 ; 0x5c <__bad_interrupt>
22: 1c c0 rjmp .+56 ; 0x5c <__bad_interrupt>
24: 1b c0 rjmp .+54 ; 0x5c <__bad_interrupt>
00000026 <__ctors_end>:
26: 11 24 eor r1, r1
28: 1f be out 0x3f, r1 ; 63
2a: cf e5 ldi r28, 0x5F ; 95
2c: d4 e0 ldi r29, 0x04 ; 4
2e: de bf out 0x3e, r29 ; 62
30: cd bf out 0x3d, r28 ; 61
00000032 <__do_copy_data>:
32: 10 e0 ldi r17, 0x00 ; 0
34: a0 e6 ldi r26, 0x60 ; 96
36: b0 e0 ldi r27, 0x00 ; 0
38: e4 e8 ldi r30, 0x84 ; 132
3a: f0 e0 ldi r31, 0x00 ; 0
3c: 02 c0 rjmp .+4 ; 0x42 <__SREG__+0x3>
3e: 05 90 lpm r0, Z+
40: 0d 92 st X+, r0
42: ac 36 cpi r26, 0x6C ; 108
44: b1 07 cpc r27, r17
46: d9 f7 brne .-10 ; 0x3e <__SP_H__>
00000048 <__do_clear_bss>:
48: 10 e0 ldi r17, 0x00 ; 0
4a: ac e6 ldi r26, 0x6C ; 108
4c: b0 e0 ldi r27, 0x00 ; 0
4e: 01 c0 rjmp .+2 ; 0x52 <.do_clear_bss_start>
00000050 <.do_clear_bss_loop>:
50: 1d 92 st X+, r1
00000052 <.do_clear_bss_start>:
52: ad 36 cpi r26, 0x6D ; 109
54: b1 07 cpc r27, r17
56: e1 f7 brne .-8 ; 0x50 <.do_clear_bss_loop>
58: 02 d0 rcall .+4 ; 0x5e <main>
5a: 12 c0 rjmp .+36 ; 0x80 <_exit>
0000005c <__bad_interrupt>:
5c: d1 cf rjmp .-94 ; 0x0 <__vectors>
0000005e <main>: ...
00000080 <_exit>:
80: f8 94 cli
00000082 <__stop_program>:
82: ff cf rjmp .-2 ; 0x82 <__stop_program>
You can see avr-gcc automatically generates startup code, including:
the interrupt vector (__vectors), which uses rjmp to jump to the Interrupt Service Routines.
initialize the status register, SREG , and the stack pointer, SPL/SPH (__ctors_end)
copies the data segment content from FLASH to RAM for initialized, writable global variables (__do_copy_data)
clears the BSS segment for uninitialized writable global variables (__do_clear_bss etc)
calls our main() function
calls _exit() if main() ever returns
_exit() is just a cli to disable interrupts
and an infinite loop (__stop_program)

Resources