A lot of related questions <How is x86 instruction cache synchronized? > mention x86 should properly handle i-cache synchronization in self modifying code. I wrote the following piece of code which toggles a function call on and off from different threads interleaved with its execution. I am using compare and swap operation as an additional guard so that the modification is atomic. But I am getting intermittent crashes (SIGSEGV, SIGILL) and analyzing the core dump makes me suspicious if the processor is trying to execute partially updated instructions. The code and the analysis given below. May be I am missing something here. Let me know if that's the case.
toggle.c
#include <stdio.h>
#include <inttypes.h>
#include <time.h>
#include <pthread.h>
#include <sys/mman.h>
#include <errno.h>
#include <unistd.h>
int active = 1; // Whether the function is toggled on or off
uint8_t* funcAddr = 0; // Address where function call happens which we need to toggle on/off
uint64_t activeSequence = 0; // Byte sequence for toggling on the function CALL
uint64_t deactiveSequence = 0; // NOP byte sequence for toggling off the function CALL
inline int modify_page_permissions(uint8_t* addr) {
long page_size = sysconf(_SC_PAGESIZE);
int code = mprotect((void*)(addr - (((uint64_t)addr)%page_size)), page_size,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (code) {
fprintf(stderr, "mprotect was not successfull! code %d\n", code);
fprintf(stderr, "errno value is : %d\n", errno);
return 0;
}
// If the 8 bytes we need to modify straddles a page boundary make the next page writable too
if (page_size - ((uint64_t)addr)%page_size < 8) {
code = mprotect((void*)(addr-((uint64_t)addr)%page_size+ page_size) , page_size,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (code) {
fprintf(stderr, "mprotect was not successfull! code %d\n", code);
fprintf(stderr, "errno value is : %d\n", errno);
return 0;;
}
}
return 1;
}
void* add_call(void* param) {
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 50000;
while (1) {
if (!active) {
if (activeSequence != 0) {
int status = modify_page_permissions(funcAddr);
if (!status) {
return 0;
}
uint8_t* start_addr = funcAddr - 8;
fprintf(stderr, "Activating foo..\n");
uint64_t res = __sync_val_compare_and_swap((uint64_t*) start_addr,
*((uint64_t*)start_addr), activeSequence);
active = 1;
} else {
fprintf(stderr, "Active sequence not initialized..\n");
}
}
nanosleep(&ts, NULL);
}
}
int remove_call(uint8_t* addr) {
if (active) {
// Remove gets called first before add so we initialize active and deactive state byte sequences during the first call the remove
if (deactiveSequence == 0) {
uint64_t sequence = *((uint64_t*)(addr-8));
uint64_t mask = 0x0000000000FFFFFF;
uint64_t deactive = (uint64_t) (sequence & mask);
mask = 0x9090909090000000; // We NOP 5 bytes of CALL instruction and leave rest of the 3 bytes as it is
activeSequence = sequence;
deactiveSequence = deactive | mask;
funcAddr = addr;
}
int status = modify_page_permissions(addr);
if (!status) {
return -1;
}
uint8_t* start_addr = addr - 8;
fprintf(stderr, "Deactivating foo..\n");
uint64_t res = __sync_val_compare_and_swap((uint64_t*)start_addr,
*((uint64_t*)start_addr), deactiveSequence);
active = 0;
// fprintf(stderr, "Result : %p\n", res);
}
}
int counter = 0;
void foo(int i) {
// Use the return address to determine where we need to patch foo CALL instruction (5 bytes)
uint64_t* addr = (uint64_t*)__builtin_extract_return_addr(__builtin_return_address(0));
fprintf(stderr, "Foo counter : %d\n", counter++);
remove_call((uint8_t*)addr);
}
// This thread periodically checks if the method is inactive and if so reactivates it
void spawn_add_call_thread() {
pthread_t tid;
pthread_create(&tid, NULL, add_call, (void*)NULL);
}
int main() {
spawn_add_call_thread();
int i=0;
for (i=0; i<1000000; i++) {
// fprintf(stderr, "i : %d..\n", i);
foo(i);
}
fprintf(stderr, "Final count : %d..\n\n\n", counter);
}
Core dump analysis
Program terminated with signal 4, Illegal instruction.
#0 0x0000000000400a28 in main () at toggle.c:123
(gdb) info frame
Stack level 0, frame at 0x7fff7c8ee360:
rip = 0x400a28 in main (toggle.c:123); saved rip 0x310521ed5d
source language c.
Arglist at 0x7fff7c8ee350, args:
Locals at 0x7fff7c8ee350, Previous frame's sp is 0x7fff7c8ee360
Saved registers:
rbp at 0x7fff7c8ee350, rip at 0x7fff7c8ee358
(gdb) disas /r 0x400a28,+30
Dump of assembler code from 0x400a28 to 0x400a46:
=> 0x0000000000400a28 <main+64>: ff (bad)
0x0000000000400a29 <main+65>: ff (bad)
0x0000000000400a2a <main+66>: ff eb ljmpq *<internal disassembler error>
0x0000000000400a2c <main+68>: e7 48 out %eax,$0x48
(gdb) disas /r main
Dump of assembler code for function main:
0x00000000004009e8 <+0>: 55 push %rbp
...
0x0000000000400a24 <+60>: 89 c7 mov %eax,%edi
0x0000000000400a26 <+62>: e8 11 ff ff ff callq 0x40093c <foo>
0x0000000000400a2b <+67>: eb e7 jmp 0x400a14 <main+44>
So as can be seen the instruction pointer seems to positioned within an address inside the CALL instruction and processor is apparently trying to execute that misaligned instruction causing an illegal instruction fault.
I think your problem is that you replaced a 5-byte CALL instruction with 5 1-byte NOPs. Consider what happens when your thread has executed 3 of the NOPs, and then your master thread decides to swap the CALL instruction back in. Your thread's PC will be three bytes in the middle of the CALL instruction and will therefore execute an unexpected and likely illegal instruction.
What you need to do is swap the 5-byte CALL instruction with a 5-byte NOP. You just need to find a multibyte instruction that does nothing (such as or'ing a register against itself) and if you need some extra bytes, prepend some prefix bytes such as a gs override prefix and an address-size override prefix (both of which will do nothing). By using a 5-byte NOP, your thread will be guaranteed to either be at the CALL instruction or past the CALL instruction, but never inside of it.
On 80x86 most calls use a relative displacement, not an absolute address. Essentially its "call the code at here + < displacement >" and not "call the code at < address >".
For 64-bit code, the displacement may be 8 bits or 32-bits. It's never 64-bits.
For example, for a 2-byte "call with 8-bit displacement" instruction, you'd be trashing 6 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).
For another example, for a 5-byte "call with 32-bit displacement" instruction, you'd be trashing 3 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).
However...
These aren't the only way to call. For example, you can call using a function pointer, where the address of the code being called is not in the instruction at all (but may be in a register or be a variable in memory). There's also an optimisation called "tail call optimisation" where a call followed by a ret is replaced with a jmp (likely with some additional stack diddling for passing parameters, cleaning up the caller's local variables, etc).
Essentially; your code is severely broken, you can't cover all the possible corner cases, you shouldn't be doing this to begin with, and you probably should be using a function pointer instead of self modifying code (which would be faster and easier and portable too).
Related
I'am doing an exercice for an Operational Systems class and getting an SegFault error when calling printf with arguments.
The objective of the exercice is to simulate the initialization of a thread and print a counter, not very difficult. I have a table of 4 entries each with size 4096 bytes, each entry must represent the thread's stack represented as
#define STACK_SIZE 4096
char table[4][STACK_SIZE];
I defined a type called coroutine that will get only a stack address
typedef void* coroutine_t;
The i have a initialization code. This code must take the end of the routine stack, append the address of the coroutine and the initialization of the registers and return the pointer that will be the stack pointer for the coroutine.
coroutine_t init_coroutine(void *stack_begin, unsigned int stack_size,
void (*initial_pc)(void)) {
char *stack_end = ((char *)stack_begin) + stack_size;
void **ptr = (void**) stack_end;
ptr--;
*ptr = initial_pc;
ptr--;
*ptr = stack_end; /* Frame pointer */
ptr--;
*ptr = 0; /* RBX*/
ptr--;
*ptr = 0; /* R12 */
ptr--;
*ptr = 0; /* R13 */
ptr--;
*ptr = 0; /* R14 */
ptr--;
*ptr = 0; /* R15 */
return ptr;
}
Then i have this code in x86 assembly to enter the coroutine that just pop the register previously pushed
.global enter_coroutine /* Makes enter_coroutine visible to the linker*/
enter_coroutine:
mov %rdi,%rsp /* RDI contains the argument to enter_coroutine. */
/* And is copied to RSP. */
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret /* Pop the program counter */
The rest of my code is this
coroutine_t cr;
void test_function() {
int counter = 0;
while(1) {
printf("counter1: %d\n", counter);
counter++;
}
}
int main() {
cr = init_coroutine(table[0], STACK_SIZE, &test_function);
enter_coroutine(cr);
return 0;
}
So for the error
If i run as it is i will get a segfault when the program call printf the output from gdb is
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff7dfcfdd in __vfprintf_internal (s=0x7ffff7f9d760 <_IO_2_1_stdout_>, format=0x555555556004 "counter1: %d\n", ap=ap#entry=0x555555558f48 <table+3848>, mode_flags=mode_flags#entry=0) at vfprintf-internal.c:1385
I assume it has some thing happening with the stack for two causes:
If i just print a string without parameters i get no error
If i remove the first ptr-- statement from the init_coroutine function it will also work, but will alocate things in the end of the stack and hence in the other thread's stack
I'am running this in a Intel(R) Core(TM) i5-5200U CPU with ubuntu 21.10 and ggc version 11.2.0
Could you give me some light here ?
I wasn't able to reproduce the problem on my x86_64 Linux box, but I was on compiler explorer, and the problem seems to be simple stack overflow (i.e., 4096 is too small a stack for printf).
Increasing the stack size (or choosing table[1], table[2], or table[3] instead table[0], which is effectively the same as increasing stack size) appears to make it work: https://gcc.godbolt.org/z/rnfMThbjo
So I have been learning about the concept of hooking and using trampolines in order to bypass/execute data in a WinAPI hook function (In a different executable file, using DLL injection). So far I know how to make it (the trampoline and hook) using a mixture of assembly and C, but I can't seem to do it with just using C, as I seem to be missing something. I'd appreciate if someone could tell me what I'm doing wrong and how to fix it up.
Right now my code:
#include <Windows.h>
unsigned char* address = 0;
__declspec(naked) int __stdcall MessageBoxAHookTrampoline(HWND Window, char* Message, char* Title, int Type) {
__asm
{
push ebp
mov ebp, esp
mov eax, address
add eax, 5
jmp eax
}
}
int __stdcall MessageBoxAHook(HWND Window, char* Message, char* Title, int Type) {
wchar_t* WMessage = L"Hooked!";
wchar_t* WTitle = L"Success!";
MessageBoxW(0, WMessage, WTitle, 0);
return MessageBoxAHookTrampoline(Window, Message, Title, Type);
}
unsigned long __stdcall Thread(void* Context) {
address = (unsigned char*)GetProcAddress(LoadLibraryA("user32"), "MessageBoxA");
ULONG OP = 0;
if (VirtualProtect(address, 1, PAGE_EXECUTE_READWRITE, &OP)) {
memset(address, 0x90, 5);
*address = 0xE9;
*(unsigned long*)(address + 1) = (unsigned long)MessageBoxAHook - (unsigned long)address - 5;
}
else {
MessageBoxA(0, "Failed to change protection", "RIP", 0);
}
return 1;
}
// Entry point.
BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
if (fdwReason == DLL_PROCESS_ATTACH) {
CreateThread(0, 0, Thread, 0, 0, 0);
}
else if (fdwReason == DLL_PROCESS_DETACH) {
}
return true;
}
So question is: How would I make a function say InstallHook that will install the hook and return a trampoline so I can use it easily?
Function prototype probably would be: void* InstallHook(void* originalFunc, void* targetFunc, int jumpsize), or so I've understood reading online, but unsure what jumpsize would be used for.
So far I know that the first 5 bytes must be preserved and restored, and then there's a jump to the address of the original hooked function. So I'd have to use malloc to allocate memory, memcpy to copy bytes over, the 0xE9 is the value of a jump instruction and such, but I just don't know how to implement it using just pure C. I figure it would be something similar to the code in this question. So how can I write a hook function that returns a trampoline using pure C for WinAPI functions?
If I understood the question correctly, you want to avoid "hard-coding" the trampoline function in assembly, presumably so you could have multiple trampolines in use at the same time without duplicating the code. You can achieve this using VirtualAlloc (malloc won't work since the returned memory won't be executable).
I wrote this from memory without access to a compiler so it might have some minor bugs, but the general idea is here. Normally you would also use VirtualProtect to change the page permissions to r-x instead of rwx once you're done modifying it, but I've left that out for the sake of simplicity:
void *CreateTrampoline(void *originalFunc)
{
/* Allocate the trampoline function */
uint8_t *trampoline = VirtualAlloc(
NULL,
5 + 5, /* 5 for the prologue, 5 for the JMP */
MEM_COMMIT | MEM_RESERVE,
PAGE_EXECUTE_READWRITE); /* Make trampoline executable */
/* Copy the original function's prologue */
memcpy(trampoline, originalFunc, 5);
/* JMP rel/32 opcode */
trampoline[5] = 0xE9;
/* JMP rel/32 operand */
uint32_t jmpDest = (uint32_t)originalFunc + 5; /* Skip original prologue */
uint32_t jmpSrc = (uint32_t)trampoline + 10; /* Starting after the JMP */
uint32_t delta = jmpDest - jmpSrc;
memcpy(trampoline + 6, &delta, 4);
return trampoline;
}
Your InstallHook function would then just call CreateTrampoline to create a trampoline, then patch the first 5 bytes of the original function with a JMP rel/32 to your hook.
Be warned, this only works on WinAPI functions, because Microsoft requires that they have a 5-byte prologue to enable hot-patching (which is what you're doing here). Normal functions do not have this requirement -- usually they only start with push ebp; mov ebp, esp which is only 3 bytes (and sometimes not even that, if the compiler decides to optimize it out).
Edit: here's how the math works:
_______________delta______________
| |
trampoline | originalFunc |
| | | |
v | v v
[prologue][jmp delta] [prologue][rest of func]
|________||_________| |________|
5 + 5 5
I'm trying to hook the Windows API function FindWindowA(). I successfully did it with the code below without "hotpatching" it: I've overwritten the bytes at the beginning of the function. myHook() is called and a message box shows up when FindWindowA() is called.
user32.dll has hotpatching enabled and I'd like to overwrite the NOPs before the actual function instead of overwriting the function itself. However, the code below won't work when I set hotpatching to TRUE. It does nothing when FindWindowA() gets executed.
#include <stdio.h>
#include <windows.h>
void myHook()
{
MessageBoxA(NULL, "Hooked", "Hook", MB_ICONINFORMATION);
}
int main(int argc, char *argv[])
{
BOOLEAN hotpatching = FALSE;
LPVOID fwAddress = GetProcAddress(GetModuleHandleA("user32.dll"), "FindWindowA");
LPVOID fwHotpatchingAddress = (LPVOID)((DWORD)fwAddress - 5);
LPVOID myHookAddress = &myHook;
DWORD jmpOffset = (DWORD)&myHook - (DWORD)(!hotpatching ? fwAddress : fwHotpatchingAddress) - 5; // -5 because "JMP offset" = 5 bytes (1 + 4)
printf("fwAddress: %X\n", fwAddress);
printf("fwHotpatchingAddress: %X\n", fwHotpatchingAddress);
printf("myHookAddress: %X\n", myHookAddress);
printf("jmpOffset: %X\n", jmpOffset);
printf("Ready?\n\n");
getchar();
char JMP[1] = {0xE9};
char RETN[1] = {0xC3};
LPVOID offset0 = NULL;
LPVOID offset1 = NULL;
LPVOID offset2 = NULL;
if (!hotpatching)
offset0 = fwAddress;
else
offset0 = fwHotpatchingAddress;
offset1 = (LPVOID)((DWORD)offset0 + 1);
offset2 = (LPVOID)((DWORD)offset1 + 4);
DWORD oldProtect = 0;
VirtualProtect(offset0, 6, PAGE_EXECUTE_READWRITE, &oldProtect);
memcpy(fwAddress, JMP, 1);
memcpy(offset1, &jmpOffset, 4);
memcpy(offset2, RETN, 1);
VirtualProtect(offset0, 6, oldProtect, &oldProtect);
printf("FindWindowA() Patched");
getchar();
FindWindowA(NULL, "Test");
getchar();
return 0;
}
Could you tell me what's wrong?
Thank you.
Hotpatching enabled executable images are prepared by the compiler and linker to allow replacing the image while in use. The following two changes are applied (x86):
The function entry point is set to a 2-byte no-op mov edi, edi (/hotpatch).
Five consecutive nop's are prepended to each function entry point (/FUNCTIONPADMIN).
To illustrate this, here is a typical disassembly listing of a hotpaching enabled function:
(2) 768C8D66 90 nop
768C8D67 90 nop
768C8D68 90 nop
768C8D69 90 nop
768C8D6A 90 nop
(1) 768C8D6B 8B FF mov edi,edi
(3) 768C8D6D 55 push ebp
768C8D6E 8B EC mov ebp,esp
(1) designates the function entry point with the 2-byte no-op. (2) is the padding provided by the linker, and (3) is where the non-trivial function implementation starts.
To hook into a function you have to overwrite (2) with a jump to your hook function jmp myHook, and make this code reachable by replacing (1) with a relative jump jmp $-5.
The hook function must leave the stack in a consistent state. It should be declared as __declspec(naked) to prevent the compiler from generating function prolog and epilog code. The final instruction must either perform stack cleanup in line with the calling convention of the hooked function, or jump back to the hooked function at the address designated by (3).
I would like to generate a function at runtime in C. And by this I mean I would essentially like to allocate some memory, point at it and execute it via function pointer. I realize this is a very complex topic and my question is naïve. I also realize there are some very robust libraries out there that do this (e.g. nanojit).
But I would like to learn the technique, starting with the basics. Could someone knowledgeable give me a very simple example in C?
EDIT: The answer below is great but here is the same example for Windows:
#include <Windows.h>
#define MEMSIZE 100*1024*1024
typedef void (*func_t)(void);
int main() {
HANDLE proc = GetCurrentProcess();
LPVOID p = VirtualAlloc(
NULL,
MEMSIZE,
MEM_RESERVE|MEM_COMMIT,
PAGE_EXECUTE_READWRITE);
func_t func = (func_t)p;
PDWORD code = (PDWORD)p;
code[0] = 0xC3; // ret
if(FlushInstructionCache(
proc,
NULL,
0))
{
func();
}
CloseHandle(proc);
VirtualFree(p, 0, MEM_RELEASE);
return 0;
}
As said previously by other posters, you'll need to know your platform pretty well.
Ignoring the issue of casting a object pointer to a function pointer being, technically, UB, here's an example that works for x86/x64 OS X (and possibly Linux too). All the generated code does is return to the caller.
#include <unistd.h>
#include <sys/mman.h>
typedef void (*func_t)(void);
int main() {
/*
* Get a RWX bit of memory.
* We can't just use malloc because the memory it returns might not
* be executable.
*/
unsigned char *code = mmap(NULL, getpagesize(),
PROT_READ|PROT_EXEC|PROT_WRITE,
MAP_SHARED|MAP_ANON, 0, 0);
/* Technically undefined behaviour */
func_t func = (func_t) code;
code[0] = 0xC3; /* x86 'ret' instruction */
func();
return 0;
}
Obviously, this will be different across different platforms but it outlines the basics needed: get executable section of memory, write instructions, execute instructions.
This requires you to know your platform. For instance, what is the C calling convention on your platform? Where are parameters stored? What register holds the return value? What registers must be saved and restored? Once you know that, you can essentially write some C code that assembles code into a block of memory, then cast that memory into a function pointer (though this is technically forbidden in ANSI C, and will not work depending if your platform marks some pages of memory as non-executable aka NX bit).
The simple way to go about this is simply to write some code, compile it, then disassemble it and look at what bytes correspond to which instructions. You can write some C code that fills allocated memory with that collection of bytes and then casts it to a function pointer of the appropriate type and executes.
It's probably best to start by reading the calling conventions for your architecture and compiler. Then learn to write assembly that can be called from C (i.e., follows the calling convention).
If you have tools, they can help you get some things right easier. For example, instead of trying to design the right function prologue/epilogue, I can just code this in C:
int foo(void* Data)
{
return (Data != 0);
}
Then (MicrosoftC under Windows) feed it to "cl /Fa /c foo.c". Then I can look at "foo.asm":
_Data$ = 8
; Line 2
push ebp
mov ebp, esp
; Line 3
xor eax, eax
cmp DWORD PTR _Data$[ebp], 0
setne al
; Line 4
pop ebp
ret 0
I could also use "dumpbin /all foo.obj" to see that the exact bytes of the function were:
00000000: 55 8B EC 33 C0 83 7D 08 00 0F 95 C0 5D C3
Just saves me some time getting the bytes exactly right...
I'm attempting to write a simple buffer overflow using C on Mac OS X 10.6 64-bit. Here's the concept:
void function() {
char buffer[64];
buffer[offset] += 7; // i'm not sure how large offset needs to be, or if
// 7 is correct.
}
int main() {
int x = 0;
function();
x += 1;
printf("%d\n", x); // the idea is to modify the return address so that
// the x += 1 expression is not executed and 0 gets
// printed
return 0;
}
Here's part of main's assembler dump:
...
0x0000000100000ebe <main+30>: callq 0x100000e30 <function>
0x0000000100000ec3 <main+35>: movl $0x1,-0x8(%rbp)
0x0000000100000eca <main+42>: mov -0x8(%rbp),%esi
0x0000000100000ecd <main+45>: xor %al,%al
0x0000000100000ecf <main+47>: lea 0x56(%rip),%rdi # 0x100000f2c
0x0000000100000ed6 <main+54>: callq 0x100000ef4 <dyld_stub_printf>
...
I want to jump over the movl instruction, which would mean I'd need to increment the return address by 42 - 35 = 7 (correct?). Now I need to know where the return address is stored so I can calculate the correct offset.
I have tried searching for the correct value manually, but either 1 gets printed or I get abort trap – is there maybe some kind of buffer overflow protection going on?
Using an offset of 88 works on my machine. I used Nemo's approach of finding out the return address.
This 32-bit example illustrates how you can figure it out, see below for 64-bit:
#include <stdio.h>
void function() {
char buffer[64];
char *p;
asm("lea 4(%%ebp),%0" : "=r" (p)); // loads address of return address
printf("%d\n", p - buffer); // computes offset
buffer[p - buffer] += 9; // 9 from disassembling main
}
int main() {
volatile int x = 7;
function();
x++;
printf("x = %d\n", x); // prints 7, not 8
}
On my system the offset is 76. That's the 64 bytes of the buffer (remember, the stack grows down, so the start of the buffer is far from the return address) plus whatever other detritus is in between.
Obviously if you are attacking an existing program you can't expect it to compute the answer for you, but I think this illustrates the principle.
(Also, we are lucky that +9 does not carry out into another byte. Otherwise the single byte increment would not set the return address how we expected. This example may break if you get unlucky with the return address within main)
I overlooked the 64-bitness of the original question somehow. The equivalent for x86-64 is 8(%rbp) because pointers are 8 bytes long. In that case my test build happens to produce an offset of 104. In the code above substitute 8(%%rbp) using the double %% to get a single % in the output assembly. This is described in this ABI document. Search for 8(%rbp).
There is a complaint in the comments that 4(%ebp) is just as magic as 76 or any other arbitrary number. In fact the meaning of the register %ebp (also called the "frame pointer") and its relationship to the location of the return address on the stack is standardized. One illustration I quickly Googled is here. That article uses the terminology "base pointer". If you wanted to exploit buffer overflows on other architectures it would require similarly detailed knowledge of the calling conventions of that CPU.
Roddy is right that you need to operate on pointer-sized values.
I would start by reading values in your exploit function (and printing them) rather than writing them. As you crawl past the end of your array, you should start to see values from the stack. Before long you should find the return address and be able to line it up with your disassembler dump.
Disassemble function() and see what it looks like.
Offset needs to be negative positive, maybe 64+8, as it's a 64-bit address. Also, you should do the '+7' on a pointer-sized object, not on a char. Otherwise if the two addresses cross a 256-byte boundary you will have exploited your exploit....
You might try running your code in a debugger, stepping each assembly line at a time, and examining the stack's memory space as well as registers.
I always like to operate on nice data types, like this one:
struct stackframe {
char *sf_bp;
char *sf_return_address;
};
void function() {
/* the following code is dirty. */
char *dummy;
dummy = (char *)&dummy;
struct stackframe *stackframe = dummy + 24; /* try multiples of 4 here. */
/* here starts the beautiful code. */
stackframe->sf_return_address += 7;
}
Using this code, you can easily check with the debugger whether the value in stackframe->sf_return_address matches your expectations.