int main(int argc, char *argv[])
{
char string[100];
string = *argv[1];
}
Why doesn't this work? Do I actually need to use loops to iterate through each element and do everything the long way?
Why doesn't this work?
Because that's simply how it works in C. Trying with string = argv[1] (without *) would be a better guess, but you cannot copy arrays with simple assignments.
Do I actually need to use loops to iterate through each element and do everything the long way?
Unless you are prepared to use functions like strcpy, strncpy or strdup or something similar, then yes. Using strncpy in your code would look like this:
char string[100];
strncpy(string, argv[1], sizeof(string));
string[sizeof(string) - 1] = 0;
The last line is to make sure that string is terminated. Clunky? Yes, it is. There are better functions in some compilers like strlcpy, which is available on POSIX systems, but it's not a part of the C standard. If you use strlcpy instead of strncpy you can skip the last line.
If you're planing to do a lot of string copying and don't have a compiler supporting strlcpy, it might be a good idea to write your own implementation (good practice) or just copy an existing one. Here is one I found:
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
char *d = dst;
const char *s = src;
size_t n = siz;
/* Copy as many bytes as will fit */
if (n != 0) {
while (--n != 0) {
if ((*d++ = *s++) == '\0')
break;
}
}
/* Not enough room in dst, add NUL and traverse rest of src */
if (n == 0) {
if (siz != 0)
*d = '\0'; /* NUL-terminate dst */
while (*s++)
;
}
return(s - src - 1); /* count does not include NUL */
}
Source: https://android.googlesource.com/platform/system/core.git/+/brillo-m7-dev/libcutils/strlcpy.c
In the main function in C argv is a vector to strings which are arrays of characters themself. So argv is a pointer to a pointer (like **char).
Your code assigns a reference to one pointer (to first argument).
char* string = argv[1]; would do it. To copy the whole string (array of characters) use strcpy. To copy all arguments use memcpy.
But usually in a C program you do not copy arguments, just use references to them.
The short answer is: because it is. In the C language only structures and unions are copied by value with one exception:
Initialization of the array
void foo(void)
{
char x[] = "This string literal will be copied! Test it yourself";
char z[] = "This string literal will be copied as well But because it is much loger memcpy will be used! Test it yourself";
float y[] = {1.0,2.0, 3,0,4.0,5.0,1.0,2.0, 3,0,4.0,5.0,1.0,2.0, 3,0,4.0,5.0};
long long w[] = {1,2,3,4,5,6,7,8,9,0};
foo1(x,z); // this functions are only to prevent the variable removal
foo2(y,w);
}
and the compiled code:
foo:
push {r4, lr}
sub sp, sp, #320
mov ip, sp
ldr lr, .L4
ldr r4, .L4+4
ldmia lr!, {r0, r1, r2, r3}
stmia ip!, {r0, r1, r2, r3}
ldmia lr!, {r0, r1, r2, r3}
stmia ip!, {r0, r1, r2, r3}
ldmia lr!, {r0, r1, r2, r3}
stmia ip!, {r0, r1, r2, r3}
ldm lr, {r0, r1}
str r0, [ip], #4
strb r1, [ip]
add r0, sp, #208
mov r2, #110
ldr r1, .L4+8
bl memcpy
mov r1, r4
add r0, sp, #56
mov r2, #72
bl memcpy
mov r2, #80
add r1, r4, #72
add r0, sp, #128
bl memcpy
add r1, sp, #208
mov r0, sp
bl foo1
add r1, sp, #128
add r0, sp, #56
bl foo2
add sp, sp, #320
pop {r4, pc}
.L4:
.word .LC2
.word .LANCHOR0
.word .LC3
.LC2:
.ascii "This string literal will be copied! Test it yoursel"
.ascii "f\000"
.LC3:
.ascii "This string literal will be copied as well But beca"
.ascii "use it is much loger memcpy will be used! Test it y"
.ascii "ourself\000"
Structures and unions are copied by the value sothe assignment copies the whole structure to another.
typedef struct
{
char str[100];
}string;
string a = {.str = "This string literal will be copied before main starts"},b;
void foo3(string c)
{
string g = a;
b = a;
foo4(g);
}
and the code:
foo3:
sub sp, sp, #16
push {r4, r5, r6, lr}
mov r6, #100
sub sp, sp, #104
ldr r5, .L4
add ip, sp, #116
add r4, sp, #4
stmib ip, {r0, r1, r2, r3}
mov r2, r6
mov r1, r5
mov r0, r4
bl memcpy
mov r2, r6
mov r1, r5
ldr r0, .L4+4
bl memcpy
add r1, sp, #20
mov r2, #84
add r0, sp, #136
bl memcpy
ldm r4, {r0, r1, r2, r3}
add sp, sp, #104
pop {r4, r5, r6, lr}
add sp, sp, #16
b foo4
.L4:
.word .LANCHOR0
.word b
a:
.ascii "This string literal will be copied before main star"
.ascii "ts\000"
you can play with it yourself:
https://godbolt.org/z/lag4uL
In C, an array name is not an L-value expression. Hence you can not use it in an assignment statement. To make a copy of a character array, you can either use a for statement or strcpy function, which is declared in string.h header file.
Related
What is it that makes value in a variable of type _Bool 1 , even when we assign a value greater than 1 to it.
For ex:
_Bool tmp = 10;
printf("%x , %lu", tmp, sizeof(tmp));
This would print 1, 1. Trying to understand what is it that makes a variable of size Byte act as a single bit and when assigned a value > 1 which has LSB 0 still get converted to 1.
What is it that makes value in a variable of type _Bool 1 , even when we assign a value greater than 1 to it. The compiler does.
For example on ARM (arm-none-eabi-gcc):
#include "stdio.h"
#include "stdbool.h"
int main()
{
_Bool tmp = 10;
printf("%x , %lu", tmp, sizeof(tmp));
return 0;
}
compiles to:
.LC0:
.ascii "%x , %lu\000"
main:
stmfd sp!, {fp, lr}
add fp, sp, #4
sub sp, sp, #8
mov r3, #1
strb r3, [fp, #-5]
ldrb r3, [fp, #-5] # zero_extendqisi2
mov r2, #1
mov r1, r3
ldr r0, .L3
bl printf
mov r3, #0
mov r0, r3
sub sp, fp, #4
ldmfd sp!, {fp, lr}
bx lr
.L3:
.word .LC0
you can see in the instruction mov r3, #1 that the compiler directly converts the initialisation value 10 to 1 as specified by the standard .
According to this thread, initializing an array with a shorter string literal pads the array with zeros.
So, is there any reason why these two functions (test1 and test2) would produce different results when compiled for ARM Cortex M4?
extern void write(char * buff);
void test1(void)
{
char buff[8] = {'t', 'e', 's', 't', 0, 0, 0, 0 };
write(buff);
}
void test2(void)
{
char buff[8] = "test";
write(buff);
}
I am getting equivalent assemblies for x86-64, but on ARM gcc I get different output:
test1:
str lr, [sp, #-4]!
sub sp, sp, #12
mov r3, sp
ldr r2, .L4
ldm r2, {r0, r1}
stm r3, {r0, r1}
mov r0, r3
bl write
add sp, sp, #12
ldr pc, [sp], #4
.L4:
.word .LANCHOR0
test2:
mov r3, #0
str lr, [sp, #-4]!
ldr r2, .L8
sub sp, sp, #12
ldm r2, {r0, r1}
str r0, [sp]
mov r0, sp
strb r1, [sp, #4]
strb r3, [sp, #5]
strb r3, [sp, #6]
strb r3, [sp, #7]
bl write
add sp, sp, #12
ldr pc, [sp], #4
.L8:
.word .LANCHOR0+8
First of, the code is equivalent in the sense that the contents of memory constituting the objects named buf will be the same.
That said, the compiler clearly produces worse code for the second function. Consequently, since there is a way to produce more optimized code that has equivalent semantics, it would be reasonable to consider this an optimization failure in the compiler and file a bug for it.
If the compiler wanted to emit the same code, it would have to recognize that the string literal representation in memory can be zero-padded without changing the semantics of the program (though the string literal itself cannot be padded, since sizeof "test" cannot be equal to sizeof "test\0\0\0").
However, since this would only be advantageous if the string literal is used to initialize an explicit-length array (usually a bad idea) that is longer than the literal and where normal string semantics are not sufficient (bytes past the null-terminator are relevant), the value of better optimization for this case seems limited.
Addendum: If you set godbolt to not remove assembler directives, you can see how the literals are created:
.section .rodata
.align 2
.set .LANCHOR0,. + 0
.byte 116
.byte 101
.byte 115
.byte 116
.byte 0
.byte 0
.byte 0
.byte 0
.ascii "test\000"
.space 3
Interestingly, the compiler does not deduplicate, and it leaves the padding after the string literal to the assembler (.space 3), rather than explicitly zeroing it.
I'm currently learning ARM and I have to convert a particular C snippet into ARM code to test on the machine. However, I don't know how to declare array in ARM specifically and I didn't find very relevant resources that explained it..
i = 0
while (i < 0xB) {
x[i] = i*2;
}
Many thanks in advance
Since, there is not answer to this question, I'm just going to use GodBolt and input the OP's code there.
I'm taking the liberty of formatting the given code as so:
void assign(int num) {
int x[10];
int i = 0;
while(i < 0xB) {
x[i] = i*2;
}
}
The compiler used to translate this is ARM gcc 5.4.1
Translated Code:
assign(int):
str fp, [sp, #-4]!
add fp, sp, #0
sub sp, sp, #60
str r0, [fp, #-56]
mov r3, #0
str r3, [fp, #-8]
.L3:
ldr r3, [fp, #-8]
cmp r3, #10
bgt .L4
ldr r3, [fp, #-8]
mov r2, r3, asl #1
ldr r3, [fp, #-8]
mov r3, r3, asl #2
sub r1, fp, #4
add r3, r1, r3
str r2, [r3, #-44]
b .L3
.L4:
mov r0, r0 # nop
sub sp, fp, #0
ldr fp, [sp], #4
bx lr
Source: https://godbolt.org/g/5Tv3gk
Just write the code in C and let the compiler translate it to ARM assembly. Then inspect the generated code.
I'm writing embedded C/assembler code for the NXP LPC810 microcontroller (just a hobby project).
I have a function fn. I also have an exact copy of that function's machine code in an array of uint8_t. (I have checked the hex file.)
I create a function pointer fnptr, with the same type as fn and point it at the array, using a cast.
It all cross-compiles without warnings.
When the MCU executes fn it works correctly.
When the MCU executes fnptr it crashes (I can't see any debug, as there are only 8 pins, all in use).
The code is position independent.
The array has the correct 4 byte alignment.
fn is in the .text section of the elf file.
The array is forced into the .text section of the elf file (still in flash, not RAM).
I have assumed that there is no NX-like functionality on such a basic Coretex M0+ MCU. (Cortex M3 and M4 do have some form of read-only memory protection for code.)
Are there other reasons why the machine code in the array does not work?
Update:
Here is the code:
#include "stdio.h"
#include "serial.h"
extern "C" void SysTick_Handler() {
// generate an interrupt for delay
}
void delay(int millis) {
while (--millis >= 0) {
__WFI(); // wait for SysTick interrupt
}
}
extern "C" int fn(int a, int b) {
return a + b;
}
/* arm-none-eabi-objdump -d firmware.elf
00000162 <fn>:
162: 1840 adds r0, r0, r1
164: 4770 bx lr
166: 46c0 nop ; (mov r8, r8)
*/
extern "C" const uint8_t machine_code[6] __attribute__((aligned (4))) __attribute__((section (".text"))) = {
0x40,0x18,
0x70,0x47,
0xc0,0x46
};
int main() {
LPC_SWM->PINASSIGN0 = 0xFFFFFF04UL;
serial.init(LPC_USART0, 115200);
SysTick_Config(12000000/1000); // 1ms ticks
int(*fnptr)(int a, int b) = (int(*)(int, int))machine_code;
for (int a = 0; ; a++) {
int c = fnptr(a, 1000000);
printf("Hello world2 %d.\n", c);
delay(1000);
}
}
And here is the disassembled output from arm-none-eabi-objdump -D -Mforce-thumb firmware.elf:
00000162 <fn>:
162: 1840 adds r0, r0, r1
164: 4770 bx lr
166: 46c0 nop ; (mov r8, r8)
00000168 <machine_code>:
168: 1840 adds r0, r0, r1
16a: 4770 bx lr
16c: 46c0 nop ; (mov r8, r8)
16e: 46c0 nop ; (mov r8, r8)
00000170 <main>:
...
I amended the code to call the original fn though a function pointer too, in order to be able to generate working and non-working assembly code that was hopefully near-identical.
machine_code has become much longer, as I am now using no optimisation (-O0).
#include "stdio.h"
#include "serial.h"
extern "C" void SysTick_Handler() {
// generate an interrupt for delay
}
void delay(int millis) {
while (--millis >= 0) {
__WFI(); // wait for SysTick interrupt
}
}
extern "C" int fn(int a, int b) {
return a + b;
}
/*
000002bc <fn>:
2bc: b580 push {r7, lr}
2be: b082 sub sp, #8
2c0: af00 add r7, sp, #0
2c2: 6078 str r0, [r7, #4]
2c4: 6039 str r1, [r7, #0]
2c6: 687a ldr r2, [r7, #4]
2c8: 683b ldr r3, [r7, #0]
2ca: 18d3 adds r3, r2, r3
2cc: 1c18 adds r0, r3, #0
2ce: 46bd mov sp, r7
2d0: b002 add sp, #8
2d2: bd80 pop {r7, pc}
*/
extern "C" const uint8_t machine_code[24] __attribute__((aligned (4))) __attribute__((section (".text"))) = {
0x80,0xb5,
0x82,0xb0,
0x00,0xaf,
0x78,0x60,
0x39,0x60,
0x7a,0x68,
0x3b,0x68,
0xd3,0x18,
0x18,0x1c,
0xbd,0x46,
0x02,0xb0,
0x80,0xbd
};
int main() {
LPC_SWM->PINASSIGN0 = 0xFFFFFF04UL;
serial.init(LPC_USART0, 115200);
SysTick_Config(12000000/1000); // 1ms ticks
int(*fnptr)(int a, int b) = (int(*)(int, int))fn;
//int(*fnptr)(int a, int b) = (int(*)(int, int))machine_code;
for (int a = 0; ; a++) {
int c = fnptr(a, 1000000);
printf("Hello world2 %d.\n", c);
delay(1000);
}
}
I compiled the code above, generating firmware.fn.elf and firmware.machinecode.elf by uncommenting //int(*fnptr)(int a, int b) = (int(*)(int, int))machine_code; (and commenting-out the line above).
The first code (fn) worked, the second code (machine_code) crashed.
fn's text and the code at machine_code are identical:
000002bc <fn>:
2bc: b580 push {r7, lr}
2be: b082 sub sp, #8
2c0: af00 add r7, sp, #0
2c2: 6078 str r0, [r7, #4]
2c4: 6039 str r1, [r7, #0]
2c6: 687a ldr r2, [r7, #4]
2c8: 683b ldr r3, [r7, #0]
2ca: 18d3 adds r3, r2, r3
2cc: 1c18 adds r0, r3, #0
2ce: 46bd mov sp, r7
2d0: b002 add sp, #8
2d2: bd80 pop {r7, pc}
000002d4 <machine_code>:
2d4: b580 push {r7, lr}
2d6: b082 sub sp, #8
2d8: af00 add r7, sp, #0
2da: 6078 str r0, [r7, #4]
2dc: 6039 str r1, [r7, #0]
2de: 687a ldr r2, [r7, #4]
2e0: 683b ldr r3, [r7, #0]
2e2: 18d3 adds r3, r2, r3
2e4: 1c18 adds r0, r3, #0
2e6: 46bd mov sp, r7
2e8: b002 add sp, #8
2ea: bd80 pop {r7, pc}
000002ec <main>:
...
The only difference in the calling code is the location of the code called:
$ diff firmware.fn.bin.xxd firmware.machine_code.bin.xxd
54c54
< 0000350: 0040 0640 e02e 0000 bd02 0000 4042 0f00 .#.#........#B..
---
> 0000350: 0040 0640 e02e 0000 d402 0000 4042 0f00 .#.#........#B..
The second address d402 is the address of the machine_code array.
Curiously, the first address bd02 is a little-endian odd number (d is odd in hex).
The address of fn is 02bc (bc02 in big endian), so the pointer to fn is not the address of fn, but the address of fn plus one (or with the low bit set).
Changing the code to:
...
int main() {
LPC_SWM->PINASSIGN0 = 0xFFFFFF04UL;
serial.init(LPC_USART0, 115200);
SysTick_Config(12000000/1000); // 1ms ticks
//int(*fnptr)(int a, int b) = (int(*)(int, int))fn;
int machine_code_addr_low_bit_set = (int)machine_code | 1;
int(*fnptr)(int a, int b) = (int(*)(int, int))machine_code_addr_low_bit_set;
for (int a = 0; ; a++) {
int c = fnptr(a, 1000000);
printf("Hello world2 %d.\n", c);
delay(1000);
}
}
Makes it work.
Googling, I found:
The mechanism for switching makes use of the fact that all instructions must be (at least) halfword-aligned, which means that bit[0] of the branch target address is redundant. Therefore this bit can be re-used to indicate the target instruction set at that address. Bit[0] cleared to 0 means ARM and bit[0] set to 1 means Thumb.
on http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka12545.html
tl;dr
You need to set the low bit on function pointers when executing data as code on ARM Thumb.
I need to reduce the code bloat for the Cortex-M0 microprocessor.
At startup the ROM data has to be copied to the RAM data once. Therefore I have this piece of code:
void __startup( void ){
extern unsigned int __data_init_start;
extern unsigned int __data_start;
extern unsigned int __data_end;
// copy .data section from flash to ram
s = & __data_init_start;
d = & __data_start;
e = & __data_end;
while( d != e ){
*d++ = *s++;
}
}
The assembly code that is generated by the compiler looks like this:
ldr r1, .L10+8
ldr r2, .L10+12
sub r0, r1, r2
lsr r3, r0, #2
add r3, r3, #1
lsl r1, r3, #2
mov r3, #0
.L4:
add r3, r3, #4
cmp r3, r1
beq .L9
.L5:
ldr r4, .L10+16
add r0, r2, r3
add r4, r3, r4
sub r4, r4, #4
ldr r4, [r4]
sub r0, r0, #4
str r4, [r0]
b .L4
How can I optimize this code so the code size is at minimum?
The compiler (or you!) does not realize that the range to copy is end - start. There seems to be some unnecessarily shuffling of data going on -- the 2 add and the sub in the loop. Also, it seems to me the compiler makes sure that the number of copies to make is a multiple of 4. An obvious optimization, then, is to make sure it is in advance! Below I assume it is (if not, the bne will fail and happily keep on copying and trample all over your memory).
Using my decade-old ARM assembler knowlegde (yes, that is a major disclaimer), and post-incrementing, I think the following short snippet is what it can be condensed to. From 18 instructions down to 8, not too bad. If it works.
ldr r1, __data_init_start
ldr r2, __data_start
ldr r3, __data_end
sub r4, r3, r2
.L1:
ldr r3, [r1], #4 ; safe to re-use r3 here
str r3, [r2], #4
subs r4, r4, #4
bne L1
May be that platform guarantees that writing to an unsigned int * you may change an unsigned int * value (i.e. it doesn't take advantage of type mismatch aliasing rules).
Then the code is inefficient because e is a global variable and the generated code logic must take in account that writing to *d may change the value of e.
Making at least e a local should solve this problem (most compilers know that aliasing a local that never had its address taken is not possible from a C point of view).