Identical functions have different performance, why? - c
I made a program to test the performance of different functions that do the same thing in different ways. However, some of them generate almost exactly the same machine code, except for different addresses, and when I measure the execution time one of them is faster than the other. This happens consistently in almost every run and I have no idea why. It is also strange that when I remove some other test functions, the difference disappears. I wrote the code for another answer on StackOverflow and want to understand why.
I run on Debian 10 Buster, AMD64 and use GCC. I compile it with -O3.
Below is my code. The function lowerA() is faster than lower9() almost every time, despite compiling to the same code. If I remove the unused functions lower1()-lower8(), the difference goes away.
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
// Uncomment to run a tiny data set and dump every input/output pair.
//#define DEBUG
// N: number of test strings handed to each function under test.
// The replacement list is parenthesized so that N behaves as a single
// operand in any expression (e.g. x / N, x % N).
#ifdef DEBUG
#define N 10
#else
#define N (1000UL*100)
#endif
// M: size in bytes of each string buffer (including the terminating NUL).
// Must stay a bare integer literal: STR(M) is pasted into printf formats
// as a field width.
#define M 20
// STR(x) expands x first and then turns the result into a string literal.
#define STR_(x) #x
#define STR(x) STR_(x)
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: strlen(s) is evaluated in the loop condition, i.e.
 * before every iteration, and the case is changed by subtracting
 * ('A' - 'a').
 */
void lower1(char *s)
{
    size_t pos = 0;

    while (pos < strlen(s)) {
        const char c = s[pos];

        if (c >= 'A' && c <= 'Z') {
            s[pos] = (char)(c - ('A' - 'a'));
        }
        pos++;
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: the length is computed once up front and the case is
 * changed by subtracting ('A' - 'a').
 */
void lower2(char *s)
{
    const size_t count = strlen(s);

    for (size_t k = 0; k != count; ++k) {
        if (s[k] < 'A' || s[k] > 'Z') {
            continue;   // not an upper-case letter: leave untouched
        }
        s[k] = (char)(s[k] - ('A' - 'a'));
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: the length is computed once and the (negative) case
 * offset 'A' - 'a' is kept in a local int that is subtracted from each
 * upper-case character.
 */
void lower3(char *s)
{
    const int offset = 'A' - 'a';
    const size_t count = strlen(s);
    size_t k = 0;

    while (k < count) {
        if ('A' <= s[k] && s[k] <= 'Z') {
            s[k] = (char)(s[k] - offset);
        }
        k++;
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: the length is computed once and the case is changed
 * by adding the literal 0x20.
 */
void lower4(char *s)
{
    const size_t count = strlen(s);

    for (size_t k = 0; k < count; k++) {
        const char c = s[k];

        if (c >= 'A' && c <= 'Z') {
            s[k] = (char)(c + 0x20);
        }
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: strlen(s) is evaluated in the loop condition, i.e.
 * before every iteration, and the case is changed by adding ('a' - 'A').
 */
void lower5(char *s)
{
    for (size_t pos = 0; pos < strlen(s); ++pos) {
        if (s[pos] < 'A' || s[pos] > 'Z') {
            continue;   // not an upper-case letter: leave untouched
        }
        s[pos] = (char)(s[pos] + ('a' - 'A'));
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: strlen(s) is evaluated in the loop condition, i.e.
 * before every iteration, and the case is changed by setting bit 0x20.
 */
void lower6(char *s)
{
    size_t pos = 0;

    while (pos < strlen(s)) {
        const char c = s[pos];

        if ('A' <= c && c <= 'Z') {
            s[pos] = (char)(c | 0x20);
        }
        ++pos;
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: the length is computed once, the string is walked by
 * index, and the case is changed by setting bit 0x20.
 */
void lower7(char *s)
{
    const size_t count = strlen(s);

    for (size_t k = 0; k != count; ++k) {
        if (s[k] < 'A' || s[k] > 'Z') {
            continue;   // not an upper-case letter: leave untouched
        }
        s[k] = (char)(s[k] | 0x20);
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: the length is computed once and counted down to zero
 * while the pointer s itself is advanced; the case is changed by setting
 * bit 0x20.
 */
void lower8(char *s)
{
    for (size_t left = strlen(s); left != 0; left--) {
        const char c = *s;

        if ('A' <= c && c <= 'Z') {
            *s = (char)(c | 0x20);
        }
        ++s;
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: no strlen() call; an unconditional loop advances the
 * pointer and leaves via an explicit break when the terminator is reached.
 * The case is changed by setting bit 0x20.
 */
void lower9(char *s)
{
    for (;;) {
        if (*s == '\0') {
            break;
        }
        if ('A' <= *s && *s <= 'Z') {
            *s = (char)(*s | 0x20);
        }
        ++s;
    }
}
/*
 * Converts the ASCII upper-case letters of the NUL-terminated string s to
 * lower case, in place.
 *
 * Benchmark variant: no strlen() call; the terminator test is the loop
 * condition itself and the pointer is advanced once per iteration. The
 * case is changed by setting bit 0x20.
 */
void lowerA(char *s)
{
    for (; *s; s++) {
        const char c = *s;

        if (c < 'A' || c > 'Z') {
            continue;   // not an upper-case letter: leave untouched
        }
        *s = (char)(c | 0x20);
    }
}
/*
 * Prints msg together with the textual description of the current errno
 * to stderr and terminates the process with exit status 1.
 *
 * Never returns; the uint64_t return type is only nominal.
 */
uint64_t die(const char *msg)
{
    const char *reason = strerror(errno);

    fprintf(stderr, "die: %s : %s\n", msg, reason);
    exit(1);
}
/*
 * Returns the current CLOCK_BOOTTIME reading as a single nanosecond count
 * (seconds * 1e9 + nanoseconds).
 *
 * Calls die() if clock_gettime() reports an error.
 */
uint64_t getTime(void)
{
    struct timespec now;

    if (clock_gettime(CLOCK_BOOTTIME, &now) < 0) {
        die("cant get time");
    }

    uint64_t ns = now.tv_sec * 1000000000ULL;   // whole seconds -> ns
    ns += now.tv_nsec;
    return ns;
}
/*
 * Times one function under test.
 *
 * fp   - function to benchmark; called once per string, modifies it in place
 * s    - array of N input strings, each in an M-byte buffer (not modified)
 * name - label printed next to the measured time
 *
 * The inputs are copied into a freshly allocated buffer so every run sees
 * the same unmodified data. Only the loop that calls fp is timed; the
 * elapsed time (difference of two getTime() readings) is printed to stdout.
 * With DEBUG defined, every original/converted pair is printed as well.
 * Calls die() if the working copy cannot be allocated.
 */
void test(void (*fp)(char *),char (*s)[M],const char *name)
{
    static char (*copy)[M];
    const size_t bytes = N*M;

    copy = malloc(bytes);
    if (copy == NULL) {
        die("can't alloc memory");
    }
    memcpy(copy, s, bytes);

    const uint64_t t0 = getTime();
    size_t row = 0;
    while (row < N) {
        fp(copy[row]);
        row++;
    }
    const uint64_t t1 = getTime();

    printf("time %13"PRIu64" %s\n", t1 - t0, name);
#ifdef DEBUG
    for (size_t k = 0; k < N; k++) {
        printf("%3zu %"STR(M)"s %"STR(M)"s\n", k, s[k], copy[k]);
    }
#endif
    free(copy);
}
/*
 * Builds a fresh set of N random strings and benchmarks lower9 and lowerA
 * on it, in that order.
 *
 * The random generator is re-seeded from getTime() on every call. Each
 * string gets a random length in [0, M-1] and characters in the range
 * '/' .. 'y' (rand() % ('z' - '/') never yields 'z').
 */
void runTest(void)
{
    static char string[N][M];

    srand(getTime());

    for (size_t row = 0; row < N; row++) {
        char *dst = string[row];
        const size_t len = rand() % M;
        size_t col = 0;

        while (col < len) {
            dst[col] = rand() % ('z' - '/') + '/';
            col++;
        }
        dst[len] = '\0';
    }

    // TEST(f) benchmarks f on the strings above, labelled with f's own name.
#define TEST(s) test(s,string,STR(s))
    TEST(lower9);
    TEST(lowerA);
}
/*
 * Runs the complete benchmark (runTest) 8 times so that several timings
 * per function are printed.
 *
 * Returns 0: the original returned 1, which reports failure to the
 * calling environment even though the run completed normally.
 */
int main(void)
{
    for (unsigned i = 0; i < 8; i++) {
        runTest();
    }
    return 0;
}
This is the disassembly of the function lower9() and lowerA():
Dump of assembler code for function lower9:
0x00000000000017b0 <+0>: movzbl (%rdi),%eax
0x00000000000017b3 <+3>: test %al,%al
0x00000000000017b5 <+5>: je 0x17eb <lower9+59>
0x00000000000017b7 <+7>: nopw 0x0(%rax,%rax,1)
0x00000000000017c0 <+16>: lea -0x41(%rax),%edx
0x00000000000017c3 <+19>: cmp $0x19,%dl
0x00000000000017c6 <+22>: ja 0x17e0 <lower9+48>
0x00000000000017c8 <+24>: or $0x20,%eax
0x00000000000017cb <+27>: add $0x1,%rdi
0x00000000000017cf <+31>: mov %al,-0x1(%rdi)
0x00000000000017d2 <+34>: movzbl (%rdi),%eax
0x00000000000017d5 <+37>: test %al,%al
0x00000000000017d7 <+39>: jne 0x17c0 <lower9+16>
0x00000000000017d9 <+41>: retq
0x00000000000017da <+42>: nopw 0x0(%rax,%rax,1)
0x00000000000017e0 <+48>: add $0x1,%rdi
0x00000000000017e4 <+52>: movzbl (%rdi),%eax
0x00000000000017e7 <+55>: test %al,%al
0x00000000000017e9 <+57>: jne 0x17c0 <lower9+16>
0x00000000000017eb <+59>: retq
End of assembler dump.
Dump of assembler code for function lowerA:
0x00000000000017f0 <+0>: movzbl (%rdi),%eax
0x00000000000017f3 <+3>: test %al,%al
0x00000000000017f5 <+5>: je 0x182b <lowerA+59>
0x00000000000017f7 <+7>: nopw 0x0(%rax,%rax,1)
0x0000000000001800 <+16>: lea -0x41(%rax),%edx
0x0000000000001803 <+19>: cmp $0x19,%dl
0x0000000000001806 <+22>: ja 0x1820 <lowerA+48>
0x0000000000001808 <+24>: or $0x20,%eax
0x000000000000180b <+27>: add $0x1,%rdi
0x000000000000180f <+31>: mov %al,-0x1(%rdi)
0x0000000000001812 <+34>: movzbl (%rdi),%eax
0x0000000000001815 <+37>: test %al,%al
0x0000000000001817 <+39>: jne 0x1800 <lowerA+16>
0x0000000000001819 <+41>: retq
0x000000000000181a <+42>: nopw 0x0(%rax,%rax,1)
0x0000000000001820 <+48>: add $0x1,%rdi
0x0000000000001824 <+52>: movzbl (%rdi),%eax
0x0000000000001827 <+55>: test %al,%al
0x0000000000001829 <+57>: jne 0x1800 <lowerA+16>
0x000000000000182b <+59>: retq
End of assembler dump.
One example result, sorted:
time 4145299 lowerA
time 4153573 lowerA
time 4155081 lowerA
time 4158537 lowerA
time 4173954 lowerA
time 4190982 lowerA
time 4196201 lowerA
time 4202252 lower9
time 4209932 lower9
time 4214722 lowerA
time 4215861 lower9
time 4216471 lower9
time 4243532 lower9
time 4315601 lower9
time 4323003 lower9
time 4331462 lower9
Why is lowerA() a tiny bit faster?
Edit
with clang i do not get the same behavior.
Edit2
Swapping the calls for lower9() and lowerA() makes lower9() faster.
(Editor's note: as described in Idiomatic way of performance evaluation?, that can mean insufficient warm-up, or in this case more likely that it comes down to some fiddly difference in code alignment after inlining. Especially given that clang showed no difference.)
Edit3
I use an Intel Xeon CPU W3570 @ 3.20GHz, Bloomfield microarchitecture, aka Nehalem server.
Edit4
gcc --version gives gcc (Debian 8.3.0-6) 8.3.0
Peter Cordes said the code could be inlined; I think he is correct. This is the disassembly of runTest():
Dump of assembler code for function runTest:
0x00000000000019d0 <+0>: push %r15
0x00000000000019d2 <+2>: mov $0x7,%edi
0x00000000000019d7 <+7>: push %r14
0x00000000000019d9 <+9>: push %r13
0x00000000000019db <+11>: push %r12
0x00000000000019dd <+13>: push %rbp
0x00000000000019de <+14>: push %rbx
0x00000000000019df <+15>: sub $0x58,%rsp
0x00000000000019e3 <+19>: mov %rsp,%rsi
0x00000000000019e6 <+22>: callq 0x1050 <clock_gettime#plt>
0x00000000000019eb <+27>: test %eax,%eax
0x00000000000019ed <+29>: js 0x1cd1 <runTest+769>
0x00000000000019f3 <+35>: imul $0x3b9aca00,(%rsp),%rdi
0x00000000000019fb <+43>: add 0x8(%rsp),%rdi
0x0000000000001a00 <+48>: xor %r14d,%r14d
0x0000000000001a03 <+51>: xor %r13d,%r13d
0x0000000000001a06 <+54>: callq 0x1080 <srand#plt>
0x0000000000001a0b <+59>: nopl 0x0(%rax,%rax,1)
0x0000000000001a10 <+64>: callq 0x10e0 <rand#plt>
0x0000000000001a15 <+69>: mov %eax,%ebx
0x0000000000001a17 <+71>: mov $0x66666667,%eax
0x0000000000001a1c <+76>: imul %ebx
0x0000000000001a1e <+78>: mov %ebx,%eax
0x0000000000001a20 <+80>: sar $0x1f,%eax
0x0000000000001a23 <+83>: sar $0x3,%edx
0x0000000000001a26 <+86>: sub %eax,%edx
0x0000000000001a28 <+88>: lea (%rdx,%rdx,4),%eax
0x0000000000001a2b <+91>: shl $0x2,%eax
0x0000000000001a2e <+94>: sub %eax,%ebx
0x0000000000001a30 <+96>: movslq %ebx,%rbx
0x0000000000001a33 <+99>: test %rbx,%rbx
0x0000000000001a36 <+102>: je 0x1a78 <runTest+168>
0x0000000000001a38 <+104>: lea 0x2681(%rip),%rax # 0x40c0 <string.3410>
0x0000000000001a3f <+111>: mov $0x1b4e81b5,%ebp
0x0000000000001a44 <+116>: lea (%rax,%r14,1),%r15
0x0000000000001a48 <+120>: lea (%rbx,%r15,1),%r12
0x0000000000001a4c <+124>: nopl 0x0(%rax)
0x0000000000001a50 <+128>: callq 0x10e0 <rand#plt>
0x0000000000001a55 <+133>: add $0x1,%r15
0x0000000000001a59 <+137>: mov %eax,%ecx
0x0000000000001a5b <+139>: imul %ebp
0x0000000000001a5d <+141>: mov %ecx,%eax
0x0000000000001a5f <+143>: sar $0x1f,%eax
0x0000000000001a62 <+146>: sar $0x3,%edx
0x0000000000001a65 <+149>: sub %eax,%edx
0x0000000000001a67 <+151>: imul $0x4b,%edx,%edx
0x0000000000001a6a <+154>: sub %edx,%ecx
0x0000000000001a6c <+156>: add $0x2f,%ecx
0x0000000000001a6f <+159>: mov %cl,-0x1(%r15)
0x0000000000001a73 <+163>: cmp %r12,%r15
0x0000000000001a76 <+166>: jne 0x1a50 <runTest+128>
0x0000000000001a78 <+168>: lea 0x0(%r13,%r13,4),%rax
0x0000000000001a7d <+173>: lea 0x263c(%rip),%rdi # 0x40c0 <string.3410>
0x0000000000001a84 <+180>: add $0x1,%r13
0x0000000000001a88 <+184>: add $0x14,%r14
0x0000000000001a8c <+188>: lea (%rdi,%rax,4),%rax
0x0000000000001a90 <+192>: movb $0x0,(%rax,%rbx,1)
0x0000000000001a94 <+196>: cmp $0x186a0,%r13
0x0000000000001a9b <+203>: jne 0x1a10 <runTest+64>
0x0000000000001aa1 <+209>: mov $0x1e8480,%edi
0x0000000000001aa6 <+214>: callq 0x10b0 <malloc#plt>
0x0000000000001aab <+219>: mov %rax,%r12
0x0000000000001aae <+222>: mov %rax,0x1eaa8b(%rip) # 0x1ec540 <copy.3400>
0x0000000000001ab5 <+229>: test %rax,%rax
0x0000000000001ab8 <+232>: je 0x1cdd <runTest+781>
0x0000000000001abe <+238>: lea 0x25fb(%rip),%rsi # 0x40c0 <string.3410>
0x0000000000001ac5 <+245>: mov %rax,%rdi
0x0000000000001ac8 <+248>: mov $0x1e8480,%edx
0x0000000000001acd <+253>: callq 0x10a0 <memcpy#plt>
0x0000000000001ad2 <+258>: lea 0x10(%rsp),%rsi
0x0000000000001ad7 <+263>: mov $0x7,%edi
0x0000000000001adc <+268>: callq 0x1050 <clock_gettime#plt>
0x0000000000001ae1 <+273>: test %eax,%eax
0x0000000000001ae3 <+275>: js 0x1cd1 <runTest+769>
0x0000000000001ae9 <+281>: mov 0x18(%rsp),%rbp
0x0000000000001aee <+286>: mov %r12,%rsi
0x0000000000001af1 <+289>: imul $0x3b9aca00,0x10(%rsp),%rbx
0x0000000000001afa <+298>: lea 0x1e8480(%r12),%rdi
0x0000000000001b02 <+306>: nopw 0x0(%rax,%rax,1)
0x0000000000001b08 <+312>: movzbl (%rsi),%eax
0x0000000000001b0b <+315>: mov %rsi,%rdx
0x0000000000001b0e <+318>: test %al,%al
0x0000000000001b10 <+320>: je 0x1b35 <runTest+357>
0x0000000000001b12 <+322>: nopw 0x0(%rax,%rax,1)
0x0000000000001b18 <+328>: lea -0x41(%rax),%ecx
0x0000000000001b1b <+331>: cmp $0x19,%cl
0x0000000000001b1e <+334>: ja 0x1c88 <runTest+696>
0x0000000000001b24 <+340>: or $0x20,%eax
0x0000000000001b27 <+343>: add $0x1,%rdx
0x0000000000001b2b <+347>: mov %al,-0x1(%rdx)
0x0000000000001b2e <+350>: movzbl (%rdx),%eax
0x0000000000001b31 <+353>: test %al,%al
0x0000000000001b33 <+355>: jne 0x1b18 <runTest+328>
0x0000000000001b35 <+357>: add $0x14,%rsi
0x0000000000001b39 <+361>: cmp %rdi,%rsi
0x0000000000001b3c <+364>: jne 0x1b08 <runTest+312>
0x0000000000001b3e <+366>: lea 0x20(%rsp),%rsi
0x0000000000001b43 <+371>: mov $0x7,%edi
0x0000000000001b48 <+376>: callq 0x1050 <clock_gettime#plt>
0x0000000000001b4d <+381>: test %eax,%eax
0x0000000000001b4f <+383>: js 0x1cd1 <runTest+769>
0x0000000000001b55 <+389>: lea 0x4e6(%rip),%rdx # 0x2042
0x0000000000001b5c <+396>: lea 0x4d0(%rip),%rdi # 0x2033
0x0000000000001b63 <+403>: xor %eax,%eax
0x0000000000001b65 <+405>: imul $0x3b9aca00,0x20(%rsp),%rsi
0x0000000000001b6e <+414>: sub %rbp,%rsi
0x0000000000001b71 <+417>: add 0x28(%rsp),%rsi
0x0000000000001b76 <+422>: sub %rbx,%rsi
0x0000000000001b79 <+425>: callq 0x1070 <printf#plt>
0x0000000000001b7e <+430>: mov 0x1ea9bb(%rip),%rdi # 0x1ec540 <copy.3400>
0x0000000000001b85 <+437>: callq 0x1030 <free#plt>
0x0000000000001b8a <+442>: mov $0x1e8480,%edi
0x0000000000001b8f <+447>: callq 0x10b0 <malloc#plt>
0x0000000000001b94 <+452>: mov %rax,%r12
0x0000000000001b97 <+455>: mov %rax,0x1ea9a2(%rip) # 0x1ec540 <copy.3400>
0x0000000000001b9e <+462>: test %rax,%rax
0x0000000000001ba1 <+465>: je 0x1cdd <runTest+781>
0x0000000000001ba7 <+471>: lea 0x2512(%rip),%rsi # 0x40c0 <string.3410>
0x0000000000001bae <+478>: mov %rax,%rdi
0x0000000000001bb1 <+481>: mov $0x1e8480,%edx
0x0000000000001bb6 <+486>: callq 0x10a0 <memcpy#plt>
0x0000000000001bbb <+491>: lea 0x30(%rsp),%rsi
0x0000000000001bc0 <+496>: mov $0x7,%edi
0x0000000000001bc5 <+501>: callq 0x1050 <clock_gettime#plt>
0x0000000000001bca <+506>: test %eax,%eax
0x0000000000001bcc <+508>: js 0x1cd1 <runTest+769>
0x0000000000001bd2 <+514>: mov 0x38(%rsp),%rbp
0x0000000000001bd7 <+519>: mov %r12,%rsi
0x0000000000001bda <+522>: imul $0x3b9aca00,0x30(%rsp),%rbx
0x0000000000001be3 <+531>: lea 0x1e8480(%r12),%rdi
0x0000000000001beb <+539>: nopl 0x0(%rax,%rax,1)
0x0000000000001bf0 <+544>: movzbl (%rsi),%eax
0x0000000000001bf3 <+547>: mov %rsi,%rdx
0x0000000000001bf6 <+550>: test %al,%al
0x0000000000001bf8 <+552>: je 0x1c1d <runTest+589>
0x0000000000001bfa <+554>: nopw 0x0(%rax,%rax,1)
0x0000000000001c00 <+560>: lea -0x41(%rax),%ecx
0x0000000000001c03 <+563>: cmp $0x19,%cl
0x0000000000001c06 <+566>: ja 0x1cb0 <runTest+736>
0x0000000000001c0c <+572>: or $0x20,%eax
0x0000000000001c0f <+575>: add $0x1,%rdx
0x0000000000001c13 <+579>: mov %al,-0x1(%rdx)
0x0000000000001c16 <+582>: movzbl (%rdx),%eax
0x0000000000001c19 <+585>: test %al,%al
0x0000000000001c1b <+587>: jne 0x1c00 <runTest+560>
0x0000000000001c1d <+589>: add $0x14,%rsi
0x0000000000001c21 <+593>: cmp %rsi,%rdi
0x0000000000001c24 <+596>: jne 0x1bf0 <runTest+544>
0x0000000000001c26 <+598>: lea 0x40(%rsp),%rsi
0x0000000000001c2b <+603>: mov $0x7,%edi
0x0000000000001c30 <+608>: callq 0x1050 <clock_gettime#plt>
0x0000000000001c35 <+613>: test %eax,%eax
0x0000000000001c37 <+615>: js 0x1cd1 <runTest+769>
0x0000000000001c3d <+621>: lea 0x405(%rip),%rdx # 0x2049
0x0000000000001c44 <+628>: lea 0x3e8(%rip),%rdi # 0x2033
0x0000000000001c4b <+635>: xor %eax,%eax
0x0000000000001c4d <+637>: imul $0x3b9aca00,0x40(%rsp),%rsi
0x0000000000001c56 <+646>: sub %rbp,%rsi
0x0000000000001c59 <+649>: add 0x48(%rsp),%rsi
0x0000000000001c5e <+654>: sub %rbx,%rsi
0x0000000000001c61 <+657>: callq 0x1070 <printf#plt>
0x0000000000001c66 <+662>: mov 0x1ea8d3(%rip),%rdi # 0x1ec540 <copy.3400>
0x0000000000001c6d <+669>: callq 0x1030 <free#plt>
0x0000000000001c72 <+674>: add $0x58,%rsp
0x0000000000001c76 <+678>: pop %rbx
0x0000000000001c77 <+679>: pop %rbp
0x0000000000001c78 <+680>: pop %r12
0x0000000000001c7a <+682>: pop %r13
0x0000000000001c7c <+684>: pop %r14
0x0000000000001c7e <+686>: pop %r15
0x0000000000001c80 <+688>: retq
0x0000000000001c81 <+689>: nopl 0x0(%rax)
0x0000000000001c88 <+696>: add $0x1,%rdx
0x0000000000001c8c <+700>: movzbl (%rdx),%eax
0x0000000000001c8f <+703>: test %al,%al
0x0000000000001c91 <+705>: jne 0x1b18 <runTest+328>
0x0000000000001c97 <+711>: add $0x14,%rsi
0x0000000000001c9b <+715>: cmp %rdi,%rsi
0x0000000000001c9e <+718>: jne 0x1b08 <runTest+312>
0x0000000000001ca4 <+724>: jmpq 0x1b3e <runTest+366>
0x0000000000001ca9 <+729>: nopl 0x0(%rax)
0x0000000000001cb0 <+736>: add $0x1,%rdx
0x0000000000001cb4 <+740>: movzbl (%rdx),%eax
0x0000000000001cb7 <+743>: test %al,%al
0x0000000000001cb9 <+745>: jne 0x1c00 <runTest+560>
0x0000000000001cbf <+751>: add $0x14,%rsi
0x0000000000001cc3 <+755>: cmp %rsi,%rdi
0x0000000000001cc6 <+758>: jne 0x1bf0 <runTest+544>
0x0000000000001ccc <+764>: jmpq 0x1c26 <runTest+598>
0x0000000000001cd1 <+769>: lea 0x33a(%rip),%rdi # 0x2012
0x0000000000001cd8 <+776>: callq 0x1850 <die>
0x0000000000001cdd <+781>: lea 0x33c(%rip),%rdi # 0x2020
0x0000000000001ce4 <+788>: callq 0x1850 <die>
End of assembler dump.
Edit5 The inlined parts are also identical, as far as I understand it.
You gave us one key observation
If i remove the unused functions lower1()-lower8() the difference goes
away.
That shows that it has nothing to do with the different lowerX() implementations, but with the location in memory, probably via affecting the cache hit/miss ratio. And it's not only the lowerX() function's location in memory, but more the combination with its calling function's location that influences the caches and thus the results.
That means that the effect depends on the overall memory layout of your final executable.
The memory layout in turn depends on the compiler toolchain and its settings, the order in which your functions are found in the source files, and probably a lot of additional influences.
Even for a given memory layout, the performance effect will vary a lot for different CPU / cache versions. So a different machine might just give you the opposite results.
Anyway, as long as the performance difference is so tiny (2 or 3 percent), I can hardly imagine an application where that matters, especially in a function like lowercasing that rarely dominates real-world applications.
Related
Understanding and translating assembly code
So a little background. I am a beginner with c and assembly code, we have an "bomb" assignment (written in c)which calls methods that require certain passwords, but the code is not visible and I need to determine the correct password by looking at the assembly code. The code indicates the password for this method is 6 numbers, which is passed as "input" to method phase 2 (I am trying to avoid triggering ). I am having trouble understanding what is going on here so if anyone can help me translate this into C code, or if i need to look in any particular registers/locations it would help greatly. There are 4 more phases which are each supposed to be more complex so I want to get a good understanding in how to approach reading these. Also if anyone has a good resource (like a printable table) with assembly code keywords that would be helpful too, and also if there are any differences between 32-bit and 64-bit registers i need to worry about other than the register names.. (gdb) disas Dump of assembler code for function phase_2: 0x0000000000400f49 <+0>: push %rbp 0x0000000000400f4a <+1>: push %rbx 0x0000000000400f4b <+2>: sub $0x28,%rsp 0x0000000000400f4f <+6>: mov %fs:0x28,%rax 0x0000000000400f58 <+15>: mov %rax,0x18(%rsp) 0x0000000000400f5d <+20>: xor %eax,%eax 0x0000000000400f5f <+22>: mov %rsp,%rsi 0x0000000000400f62 <+25>: callq 0x401708 <read_six_numbers> 0x0000000000400f67 <+30>: cmpl $0x0,(%rsp) 0x0000000000400f6b <+34>: jne 0x400f74 <phase_2+43> 0x0000000000400f6d <+36>: cmpl $0x1,0x4(%rsp) 0x0000000000400f72 <+41>: je 0x400f79 <phase_2+48> 0x0000000000400f74 <+43>: callq 0x4016d2 <explode_bomb> 0x0000000000400f79 <+48>: mov %rsp,%rbx 0x0000000000400f7c <+51>: lea 0x10(%rsp),%rbp 0x0000000000400f81 <+56>: mov 0x4(%rbx),%eax 0x0000000000400f84 <+59>: add (%rbx),%eax 0x0000000000400f86 <+61>: cmp %eax,0x8(%rbx) 0x0000000000400f89 <+64>: je 0x400f90 <phase_2+71> => 0x0000000000400f8b <+66>: callq 0x4016d2 <explode_bomb> 0x0000000000400f90 <+71>: add $0x4,%rbx 
0x0000000000400f94 <+75>: cmp %rbp,%rbx 0x0000000000400f97 <+78>: jne 0x400f81 <phase_2+56> 0x0000000000400f99 <+80>: mov 0x18(%rsp),%rax 0x0000000000400f9e <+85>: xor %fs:0x28,%rax 0x0000000000400fa7 <+94>: je 0x400fae <phase_2+101> 0x0000000000400fa9 <+96>: callq 0x400b90 <__stack_chk_fail#plt> 0x0000000000400fae <+101>: add $0x28,%rsp 0x0000000000400fb2 <+105>: pop %rbx 0x0000000000400fb3 <+106>: pop %rbp 0x0000000000400fb4 <+107>: retq End of assembler dump.
Your assembly is equivalent to this, see phase_2 function #include <stdio.h> __attribute__((noinline)) void read_six_numbers(void *xxx, int *num) { num[0] = 0; num[1] = 1; num[2] = 1; num[3] = 2; num[4] = 3; num[5] = 5; } __attribute__((noinline)) void explode_bomb() { printf("explode_bomb.\n"); } void phase_2(void *xxx) { int num[6]; int i; read_six_numbers(xxx, num); if (num[0] != 0 || num[1] != 1) explode_bomb(); for (i = 0; i < 4; i++) { if (num[i] + num[i + 1] == num[i + 2]) continue; explode_bomb(); } } int main() { phase_2(NULL); return 0; }
how to defuse this binary bomb phase 4
I am having trouble this piece of code in assembly language. Essentially I have to input 2 numbers that matches 2 numbers the code is comparing with. On line 0x08048c47 in phase_4, it compares the first input with 2, so I know the first input has to be 2. It then moves 4 spaces from the first input to next input, which then gets 2 subtracted from it. Now the (input-2) is compared with 2. It will continue the instruction if the inputs are below than or equal to 2. I've tested this with numbers 2,3,4 which pass the comparison. Other numbers greater than 4 and less than 2 do not pass the comparison and will cause the bomb to explode. I'm stuck on this part because the value being returned from func4 is not the same was the value represented at 0x08048c6e in phase_4, which is 8(%esp). On my computer when I debug it, it shows that it is 8, and the answers to my inputs 2,3,4 are 40, 60, 80 respectively. disas func4 0x08048bda <+0>: push %edi 0x08048bdb <+1>: push %esi 0x08048bdc <+2>: push %ebx 0x08048bdd <+3>: mov 0x10(%esp),%ebx 0x08048be1 <+7>: mov 0x14(%esp),%edi 0x08048be5 <+11>: test %ebx,%ebx 0x08048be7 <+13>: jle 0x8048c14 <func4+58> 0x08048be9 <+15>: mov %edi,%eax 0x08048beb <+17>: cmp $0x1,%ebx 0x08048bee <+20>: je 0x8048c19 <func4+63> 0x08048bf0 <+22>: sub $0x8,%esp 0x08048bf3 <+25>: push %edi 0x08048bf4 <+26>: lea -0x1(%ebx),%eax 0x08048bf7 <+29>: push %eax 0x08048bf8 <+30>: call 0x8048bda <func4> 0x08048bfd <+35>: add $0x8,%esp 0x08048c00 <+38>: lea (%edi,%eax,1),%esi 0x08048c03 <+41>: push %edi 0x08048c04 <+42>: sub $0x2,%ebx 0x08048c07 <+45>: push %ebx 0x08048c08 <+46>: call 0x8048bda <func4> 0x08048c0d <+51>: add $0x10,%esp 0x08048c10 <+54>: add %esi,%eax 0x08048c12 <+56>: jmp 0x8048c19 <func4+63> 0x08048c14 <+58>: mov $0x0,%eax 0x08048c19 <+63>: pop %ebx 0x08048c1a <+64>: pop %esi 0x08048c1b <+65>: pop %edi 0x08048c1c <+66>: ret disas phase_4 0x08048c1d <+0>: sub $0x1c,%esp 0x08048c20 <+3>: mov %gs:0x14,%eax 0x08048c26 <+9>: mov %eax,0xc(%esp) 0x08048c2a 
<+13>: xor %eax,%eax 0x08048c2c <+15>: lea 0x4(%esp),%eax 0x08048c30 <+19>: push %eax 0x08048c31 <+20>: lea 0xc(%esp),%eax 0x08048c35 <+24>: push %eax 0x08048c36 <+25>: push $0x804a25f 0x08048c3b <+30>: pushl 0x2c(%esp) 0x08048c3f <+34>: call 0x8048810 <__isoc99_sscanf#plt> 0x08048c44 <+39>: add $0x10,%esp 0x08048c47 <+42>: cmp $0x2,%eax 0x08048c4a <+45>: jne 0x8048c58 <phase_4+59> 0x08048c4c <+47>: mov 0x4(%esp),%eax 0x08048c50 <+51>: sub $0x2,%eax 0x08048c53 <+54>: cmp $0x2,%eax 0x08048c56 <+57>: jbe 0x8048c5d <phase_4+64> 0x08048c58 <+59>: call 0x8049123 <explode_bomb> 0x08048c5d <+64>: sub $0x8,%esp 0x08048c60 <+67>: pushl 0xc(%esp) 0x08048c64 <+71>: push $0x6 0x08048c66 <+73>: call 0x8048bda <func4> 0x08048c6b <+78>: add $0x10,%esp 0x08048c6e <+81>: cmp 0x8(%esp),%eax 0x08048c72 <+85>: je 0x8048c79 <phase_4+92> 0x08048c74 <+87>: call 0x8049123 <explode_bomb> 0x08048c79 <+92>: mov 0xc(%esp),%eax 0x08048c7d <+96>: xor %gs:0x14,%eax 0x08048c84 <+103>: je 0x8048c8b <phase_4+110> 0x08048c86 <+105>: call 0x8048790 <__stack_chk_fail#plt> 0x08048c8b <+110>: add $0x1c,%esp 0x08048c8e <+113>: ret
8(%esp) is the first number, under the framework of x86. enter 40 2 or 60 3 or 80 4 should work. Equivalent to the following logic #include <stdio.h> #include <stdlib.h> void explode_bomb() { printf("explode bomb.\n"); exit(1); } unsigned func4(int val, unsigned num) { int ret; if (val <= 0) return 0; if (num == 1) return 1; ret = func4(val - 1, num); ret += num; val -= 2; ret += func4(val, num); return ret; } void phase_4(const char *input) { unsigned num1, num2; if (sscanf(input, "%u %u", &num1, &num2) != 2) explode_bomb(); if (num2 - 2 > 2) explode_bomb(); if (func4(6, num2) != num1) explode_bomb(); } int main() { phase_4("40 2"); phase_4("60 3"); phase_4("80 4"); printf("success.\n"); return 0; }
Understanding how some assembly code is translated
Hi i'm currently doing a binary bomb and am wondering if I am understanding some stuff correctly. I have this; 0x00000000004011d4 <+0>: sub $0x8,%rsp 0x00000000004011d8 <+4>: cmpb $0x59,(%rdi) 0x00000000004011db <+7>: jne 0x4011fd <phase_1+41> 0x00000000004011dd <+9>: cmpb $0x46,0x2(%rdi) 0x00000000004011e1 <+13>: jne 0x4011fd <phase_1+41> 0x00000000004011e3 <+15>: cmpb $0x68,0x1(%rdi) 0x00000000004011e7 <+19>: je 0x40120b <phase_1+55> 0x00000000004011e9 <+21>: movsbl 0x10(%rdi),%ecx 0x00000000004011ed <+25>: movsbl 0x5(%rdi),%edx 0x00000000004011f1 <+29>: add $0xb,%edx 0x00000000004011f4 <+32>: mov $0x1,%eax 0x00000000004011f9 <+37>: cmp %edx,%ecx 0x00000000004011fb <+39>: je 0x401210 <phase_1+60> 0x00000000004011fd <+41>: callq 0x401b20 <bomb_activation> 0x0000000000401202 <+46>: mov $0xffffffffffffffff,%rax 0x0000000000401209 <+53>: jmp 0x401210 <phase_1+60> 0x000000000040120b <+55>: mov $0x0,%eax 0x0000000000401210 <+60>: add $0x8,%rsp 0x0000000000401214 <+64>: retq and so far I have translated it to this; if(arr[0] != 'Y'){ bomb_activation(); } if(arr[2] != 'F'){ bomb_activation(); } if(arr[1] == 'h'){ bomb_activation(); } int a = arr[10]; int b = arr[5]; b += 11; status = 1; if(t1 != t2){ bomb_activation(); } return status; } As you can probably tell i'm really confused on how exactly to read these lines, I see it as moving the 10th element of the array into the ecx registry and filling the rest of the registry with 0s and the same logic to edx, however i'm not too sure how to determine what the value of arr[5] or arr[10] is just from this. 0x00000000004011e9 <+21>: movsbl 0x10(%rdi),%ecx 0x00000000004011ed <+25>: movsbl 0x5(%rdi),%edx 0x00000000004011f1 <+29>: add $0xb,%edx 0x00000000004011f4 <+32>: mov $0x1,%eax 0x00000000004011f9 <+37>: cmp %edx,%ecx and more specifically how I am meant to determine the size of the array, maybe I am not understanding it at all though, any help would be great thanks.
Binary Bomb (Phase 4) %d %d
I have binary Phase that is not returning required result i.e 12. Any suggestions? Phase 4 Dump of assembler code for function phase_4: 0x000000000040100b <+0>: sub $0x18,%rsp 0x000000000040100f <+4>: lea 0x8(%rsp),%rcx 0x0000000000401014 <+9>: lea 0xc(%rsp),%rdx 0x0000000000401019 <+14>: mov $0x40278d,%esi 0x000000000040101e <+19>: mov $0x0,%eax 0x0000000000401023 <+24>: callq 0x400b90 <__isoc99_sscanf#plt> 0x0000000000401028 <+29>: cmp $0x2,%eax => 0x000000000040102b <+32>: je 0x401054 <phase_4+73> 0x000000000040102d <+34>: callq 0x401538 <explode_bomb> 0x0000000000401032 <+39>: mov $0xe,%edx 0x0000000000401037 <+44>: mov $0x0,%esi 0x000000000040103c <+49>: mov 0xc(%rsp),%edi 0x0000000000401040 <+53>: callq 0x400fd7 <func4> 0x0000000000401045 <+58>: cmp $0x12,%eax 0x0000000000401048 <+61>: je 0x40105d <phase_4+82> 0x000000000040104a <+63>: callq 0x401538 <explode_bomb> 0x000000000040104f <+68>: add $0x18,%rsp 0x0000000000401053 <+72>: retq 0x0000000000401054 <+73>: cmpl $0xe,0xc(%rsp) 0x0000000000401059 <+78>: jbe 0x401032 <phase_4+39> 0x000000000040105b <+80>: jmp 0x40102d <phase_4+34> 0x000000000040105d <+82>: cmpl $0x12,0x8(%rsp) 0x0000000000401062 <+87>: jne 0x40104a <phase_4+63> 0x0000000000401064 <+89>: jmp 0x40104f <phase_4+68> End of assembler dump. 
func4 is as follows: Dump of assembler code for function func4: => 0x0000000000400fd7 <+0>: push %rbx 0x0000000000400fd8 <+1>: mov %edx,%eax 0x0000000000400fda <+3>: sub %esi,%eax 0x0000000000400fdc <+5>: mov %eax,%ebx 0x0000000000400fde <+7>: shr $0x1f,%ebx 0x0000000000400fe1 <+10>: add %eax,%ebx 0x0000000000400fe3 <+12>: sar %ebx 0x0000000000400fe5 <+14>: add %esi,%ebx 0x0000000000400fe7 <+16>: cmp %edi,%ebx 0x0000000000400fe9 <+18>: jg 0x400ff3 <func4+28> 0x0000000000400feb <+20>: cmp %edi,%ebx 0x0000000000400fed <+22>: jl 0x400fff <func4+40> 0x0000000000400fef <+24>: mov %ebx,%eax 0x0000000000400ff1 <+26>: pop %rbx 0x0000000000400ff2 <+27>: retq 0x0000000000400ff3 <+28>: lea -0x1(%rbx),%edx 0x0000000000400ff6 <+31>: callq 0x400fd7 <func4> 0x0000000000400ffb <+36>: add %eax,%ebx 0x0000000000400ffd <+38>: jmp 0x400fef <func4+24> 0x0000000000400fff <+40>: lea 0x1(%rbx),%esi 0x0000000000401002 <+43>: callq 0x400fd7 <func4> 0x0000000000401007 <+48>: add %eax,%ebx 0x0000000000401009 <+50>: jmp 0x400fef <func4+24> End of assembler dump. I have written a test C program that I believe equivalent to above assembly code for func4. 
#include <stdio.h> int main() { int i=0; for(int i=0;i<15;i++) { int z=func4(i,0,14); printf("in main program: For input %d -> %d\n",i,z); } return 0; } int func4(int x, int low, int high) { int mid = (low + high) / 2; if (x == mid) { return (mid); } else if (x < mid) { int w=mid+func4(x, low, mid - 1); return w; } else { int p=mid+func4(x, mid + 1, high); return p; } } This program returns OUTPUT as follows: in main program: For input 0 -> 11 in main program: For input 1 -> 11 in main program: For input 2 -> 13 in main program: For input 3 -> 10 in main program: For input 4 -> 19 in main program: For input 5 -> 15 in main program: For input 6 -> 21 in main program: For input 7 -> 7 in main program: For input 8 -> 35 in main program: For input 9 -> 27 in main program: For input 10 -> 37 in main program: For input 11 -> 18 in main program: For input 12 -> 43 in main program: For input 13 -> 31 in main program: For input 14 -> 45 I figured out that function will take two arguments and the second argument should be 12. But I am not to get value 12 returned from func4. Any suggestions??
The assembly code is actually expecting a value of 0x12 (18 rather than 12) from func4: 0x0000000000401040 <+53>: callq 0x400fd7 <func4> 0x0000000000401045 <+58>: cmp $0x12,%eax <- Compare result to 0x12 0x0000000000401048 <+61>: je 0x40105d <phase_4+82> 0x000000000040104a <+63>: callq 0x401538 <explode_bomb> I haven't looked at your C translation of func4 for correctness, but assuming it's correct it looks like func4(11, 0, 14) gives you what you want.
GDB breakpoint in main() cannot access memory
My question is why when i set a breakpoint in main() with GDB i receive the error <0xffffffffffffe550: Cannot access memory at address 0xffffffffffffe550> I wanted to set a breakpoint in main() so i could examine the memory in the stack. My disassembled code is this: 0x00000000004008e8 <+0>: push %rbp 0x00000000004008e9 <+1>: mov %rsp,%rbp 0x00000000004008ec <+4>: add $0xffffffffffffff80,%rsp 0x00000000004008f0 <+8>: mov %edi,-0x74(%rbp) 0x00000000004008f3 <+11>: mov %rsi,-0x80(%rbp) => 0x00000000004008f7 <+15>: movl $0x1,-0x4(%rbp) 0x00000000004008fe <+22>: cmpl $0x1,-0x74(%rbp) 0x0000000000400902 <+26>: jle 0x400920 <main+56> 0x0000000000400904 <+28>: mov -0x80(%rbp),%rax 0x0000000000400908 <+32>: add $0x8,%rax 0x000000000040090c <+36>: mov (%rax),%rdx 0x000000000040090f <+39>: lea -0x70(%rbp),%rax 0x0000000000400913 <+43>: mov %rdx,%rsi 0x0000000000400916 <+46>: mov %rax,%rdi 0x0000000000400919 <+49>: callq 0x400670 <strcpy#plt> 0x000000000040091e <+54>: jmp 0x400924 <main+60> 0x0000000000400920 <+56>: movb $0x0,-0x70(%rbp) 0x0000000000400924 <+60>: callq 0x4006a0 <getuid#plt> 0x0000000000400929 <+65>: mov %eax,-0x8(%rbp) 0x000000000040092c <+68>: mov $0x0,%esi 0x0000000000400931 <+73>: mov $0x400c6e,%edi 0x0000000000400936 <+78>: mov $0x0,%eax 0x000000000040093b <+83>: callq 0x400720 <open#plt> 0x0000000000400940 <+88>: mov %eax,-0xc(%rbp) 0x0000000000400943 <+91>: cmpl $0xffffffff,-0xc(%rbp) 0x0000000000400947 <+95>: jne 0x40096b <main+131> 0x0000000000400949 <+97>: mov $0x400c80,%edi 0x000000000040094e <+102>: callq 0x400856 <fatal> 0x0000000000400953 <+107>: jmp 0x40096b <main+131> 0x0000000000400955 <+109>: lea -0x70(%rbp),%rdx 0x0000000000400959 <+113>: mov -0x8(%rbp),%ecx 0x000000000040095c <+116>: mov -0xc(%rbp),%eax 0x000000000040095f <+119>: mov %ecx,%esi 0x0000000000400961 <+121>: mov %eax,%edi 0x0000000000400963 <+123>: callq 0x40098c <print_notes> 0x0000000000400968 <+128>: mov %eax,-0x4(%rbp) 0x000000000040096b <+131>: cmpl $0x0,-0x4(%rbp) 
0x000000000040096f <+135>: jne 0x400955 <main+109> 0x0000000000400971 <+137>: mov $0x400cb0,%edi 0x0000000000400976 <+142>: callq 0x400680 <puts#plt> 0x000000000040097b <+147>: mov -0xc(%rbp),%eax 0x000000000040097e <+150>: mov %eax,%edi 0x0000000000400980 <+152>: callq 0x4006e0 <close#plt> 0x0000000000400985 <+157>: mov $0x0,%eax 0x000000000040098a <+162>: leaveq 0x000000000040098b <+163>: retq And the code until my main is this: #include <stdio.h> #include <fcntl.h> #include <string.h> #include <stdlib.h> #include <unistd.h> #include "usefullfuncs.h" #define FILENAME "/var/notes" int print_notes(int, int, char *); int find_user_note(int,int); int search_note(char *, char *); int main(int argc, char *argv[]){ int userid,printing=1,fd; char searchstring[100]; if(argc>1) strcpy(searchstring,argv[1]); else searchstring[0] = 0; userid = getuid(); fd = open(FILENAME,O_RDONLY); if(fd == -1) fatal("in main() while opening file for reading"); while(printing) printing = print_notes(fd,userid,searchstring); printf("-------[ end of note data ]-------\n"); close(fd); } int print_notes(int fd,int uid,char *searchstring){ int note_lenght; char note_buffer[100]; note_lenght = find_user_note(fd,uid); if(note_lenght == -1) return 0; read(fd,note_buffer,note_lenght); note_buffer[note_lenght] = 0; if(search_note(note_buffer,searchstring)) printf(note_buffer); return 1; } int find_user_note(int fd,int user_uid){ int note_uid=-1; unsigned char byte; int lenght; while(note_uid != user_uid){ if(read(fd,&note_uid,4)!=4) return -1; if(read(fd,&byte,1)!=1) return -1; byte = lenght = 0; while(byte != '\n'){ if(read(fd,&byte,1)!=1) return -1; lenght++; } } lseek(fd,lenght*-1,SEEK_CUR); printf("[DEBUG] found a %d byte note for user id %d\n",lenght,note_uid); return lenght; } int search_note(char *note, char *keyword){ int i,keyword_lenght,match=0; keyword_lenght = strlen(keyword); if(keyword_lenght == 0) return 1; for(i=0;i < strlen(note);i++){ if(note[i] == keyword[match]) match++; else{ 
if(note[i] == keyword[0]) match = 1; else match = 0; } if(match == keyword_lenght) return 1; } return 0; } Thanks in advance.
(gdb) x/24s $esp 0xffffffffffffe550: <error: Cannot access memory at address 0xffffffffffffe550> On an x86-64 target, $rsp should be used. Using $esp will lead to incorrect results. $esp is taken from the bottom 32 bits of the 64-bit $rsp register, and gdb treats it as type int32_t. $rsp in your example was probably 0x7fffffffe550. Gdb's x command, which wants to use a 64-bit address, will take the bottom 32 bits of $rsp, 0xffffe550, and sign-extend that to 0xffffffffffffe550. That's almost certainly an invalid address; typical user-space addresses on Linux don't go above 0x7ffffffff000 or so. Try x/24s $rsp. If you're trying to follow exercises out of an old book, you may be able to duplicate their 32-bit examples by giving gcc the -m32 option, if it supports it. Then you can use $esp.