I've tried to run the following C code from two different computers.
#include <string.h>
int a[100000];
int main(){
for(int sz = 100000; sz > 1; sz --){
memmove(a, a+1, 4*(sz - 1));
}
}
Computer A uses 800ms, while computer B uses 6200ms. The running time of B is always far more than A.
Environment
compile command
Shell is bash.
And the -O gcc command doesn't influence runtime.
gcc myfile.c -o mybin
time ./mybin
Computer A
gcc 9.3.0
glibc: ldd (Ubuntu GLIBC 2.31-0ubuntu9) 2.31
uname_result(system='Linux', release='5.4.0-100-generic', machine='x86_64')
CPU: Intel(R) Xeon(R) Gold 6140 CPU # 2.30GHz
Computer B
gcc 9.3.0
glibc: ldd (Ubuntu GLIBC 2.31-0ubuntu9.2) 2.31
uname_result(system='Linux', release='4.4.0-210-generic', machine='x86_64')
CPU: Intel(R) Xeon(R) Platinum 8369B CPU # 2.70GHz
Question
Then I run the following file on same virtual machines, with different kernal version(4.4.210-0404210-generic x86_64 and 5.4.0-113-generic x86_64) with gcc 9.3.0. Two test cost less than 500ms.
#include <string.h>
#include <time.h>
#include <stdio.h>
#define TICK(X) clock_t X = clock()
#define TOCK(X) printf("time %s: %g sec.\n", (#X), (double)(clock() - (X)) / CLOCKS_PER_SEC)
int a[100000];
int main(){
TICK(timer);
for(int sz = 100000; sz > 100; sz --){
memmove(a, a+1, 4*(sz - 1));
}
TOCK(timer);
}
How can I find the cause?
I am developing a Hobby operating system, for that I want to know the mechanism of memory allocation in Linux, to understand that, I created a simple C program that defines a unsigned char of some hex numbers and then runs in a empty infinite loop, I did this to keep the process alive. Then I used pmap to get page-mapping information. Now I know the location of stack segment, also I have created a program that uses process_vm_readv syscall to read the contents of that address, all I see a stream of 00 when I read the contents of stack segment and some random numbers at last, How can I be able to figure out how the array is stored in the stack segment?
If that is possible, how can I analyze the hex stream to extract meaningful information ?
Here I am adding a demonstration for accessing address space of a remote process, There are two programs local.c which will read and write a variable in another program named remote.c (These program assumes sizeof(int)==4 )
local.c
#define _GNU_SOURCE
#include <sys/uio.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/syscall.h>
int main()
{
char buf[4];
struct iovec local[1];
struct iovec remote[1];
int pid;
void *addr;
printf("Enter remote pid\n");
scanf("%d",&pid);
printf("Enter remote address\n");
scanf("%p", &addr);
local[0].iov_base = buf;
local[0].iov_len = 4;
remote[0].iov_base = addr;
remote[0].iov_len = 4;
if(syscall(SYS_process_vm_readv,pid,local,1,remote,1,0) == -1) {
perror("");
return -1;
}
printf("read : %d\n",*(int*)buf);
*(int*)buf = 4321;
if(syscall(SYS_process_vm_writev,pid,local,1,remote,1,0) == -1) {
perror("");
return -1;
}
return 0;
}
remote.c
#define _GNU_SOURCE
#include <sys/uio.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/syscall.h>
int main()
{
int a = 1234;
printf("%d %p\n",getpid(),&a);
while(a == 1234);
printf ("'a' changed to %d\n",a);
return 0;
}
And if you run this on a Linux machine,
[ajith#localhost Desktop]$ gcc remote.c -o remote -Wall
[ajith#localhost Desktop]$ ./remote
4574 0x7fffc4f4eb6c
'a' changed to 4321
[ajith#localhost Desktop]$
[ajith#localhost Desktop]$ gcc local.c -o local -Wall
[ajith#localhost Desktop]$ ./local
Enter remote pid
4574
Enter remote address
0x7fffc4f4eb6c
read : 1234
[ajith#localhost Desktop]$
Using the similar way you can read stack frame to the io-vectors, But you need to know the stack frame structure format to parse the values of local variables from stack frame. stack frame contains function parameters, return address, local variables, etc
I thought I`d first share this here to have your opinions before doing anything else. I found out while designing an algorithm that the gcc compiled code performance for some simple code was catastrophic compared to clang's.
How to reproduce
Create a test.c file containing this code :
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
int main(int argc, char *argv[]) {
const uint64_t size = 1000000000;
const size_t alloc_mem = size * sizeof(uint8_t);
uint8_t *mem = (uint8_t*)malloc(alloc_mem);
for (uint_fast64_t i = 0; i < size; i++)
mem[i] = (uint8_t) (i >> 7);
uint8_t block = 0;
uint_fast64_t counter = 0;
uint64_t total = 0x123456789abcdefllu;
uint64_t receiver = 0;
for(block = 1; block <= 8; block ++) {
printf("%u ...\n", block);
counter = 0;
while (counter < size - 8) {
__builtin_memcpy(&receiver, &mem[counter], block);
receiver &= (0xffffffffffffffffllu >> (64 - ((block) << 3)));
total += ((receiver * 0x321654987cbafedllu) >> 48);
counter += block;
}
}
printf("=> %llu\n", total);
return EXIT_SUCCESS;
}
gcc
Compile and run :
gcc-7 -O3 test.c
time ./a.out
1 ...
2 ...
3 ...
4 ...
5 ...
6 ...
7 ...
8 ...
=> 82075168519762377
real 0m23.367s
user 0m22.634s
sys 0m0.495s
info :
gcc-7 -v
Using built-in specs.
COLLECT_GCC=gcc-7
COLLECT_LTO_WRAPPER=/usr/local/Cellar/gcc/7.3.0/libexec/gcc/x86_64-apple-darwin17.4.0/7.3.0/lto-wrapper
Target: x86_64-apple-darwin17.4.0
Configured with: ../configure --build=x86_64-apple-darwin17.4.0 --prefix=/usr/local/Cellar/gcc/7.3.0 --libdir=/usr/local/Cellar/gcc/7.3.0/lib/gcc/7 --enable-languages=c,c++,objc,obj-c++,fortran --program-suffix=-7 --with-gmp=/usr/local/opt/gmp --with-mpfr=/usr/local/opt/mpfr --with-mpc=/usr/local/opt/libmpc --with-isl=/usr/local/opt/isl --with-system-zlib --enable-checking=release --with-pkgversion='Homebrew GCC 7.3.0' --with-bugurl=https://github.com/Homebrew/homebrew-core/issues --disable-nls
Thread model: posix
gcc version 7.3.0 (Homebrew GCC 7.3.0)
So we get about 23s of user time. Now let's do the same with cc (clang on macOS) :
clang
cc -O3 test.c
time ./a.out
1 ...
2 ...
3 ...
4 ...
5 ...
6 ...
7 ...
8 ...
=> 82075168519762377
real 0m9.832s
user 0m9.310s
sys 0m0.442s
info :
Apple LLVM version 9.0.0 (clang-900.0.39.2)
Target: x86_64-apple-darwin17.4.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin
That's more than 2.5x faster !! Any thoughts ?
I replaced the __builtin_memcpy function by memcpy to test things out and this time the compiled code runs in about 34s on both sides - consistent and slower as expected.
It would appear that the combination of __builtin_memcpy and bitmasking is interpreted very differently by both compilers.
I had a look at the assembly code, but couldn't see anything standing out that would explain such a drop in performance as I'm not an asm expert.
Edit 03-05-2018 :
Posted this bug : https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84719.
I find it suspicious that you get different code for memcpy vs __builtin_memcpy. I don't think that's supposed to happen, and indeed I cannot reproduce it on my (linux) system.
If you add #pragma GCC unroll 16 (implemented in gcc-8+) before the for loop, gcc gets the same perf as clang (making block a constant is essential to optimize the code), so essentially llvm's unrolling is more aggressive than gcc's, which can be good or bad depending on cases. Still, feel free to report it to gcc, maybe they'll tweak the unrolling heuristics some day and an extra testcase could help.
Once unrolling is taken care of, gcc does ok for some values (block equals 4 or 8 in particular), but much worse for some others, in particular 3. But that's better analyzed with a smaller testcase without the loop on block. Gcc seems to have trouble with memcpy(,,3), it works much better if you always read 8 bytes (the next line already takes care of the extra bytes IIUC). Another thing that could be reported to gcc.
Is it possible to get the memory size ( i.e., RAM) via C code? For instance, we can generate a directory using system("mkdir -p /path/") where mkdir is actually a unix command. Can I do something similar to get the memory size assuming I am on linux system?
Thank you
For Linux systems there is sysinfo():
#include <sys/sysinfo.h>
#include <stdio.h>
int
main(int argc, char **argv)
{
struct sysinfo info;
sysinfo(&info);
// use info.totalram, info.freeram, etc
printf("%lu bytes\n", info.totalram);
return 0;
}
Check you man page on sysinfo(2) to know all the details of the struct.
Use free
$ free
total used free shared buff/cache available
Mem: 8093896 3657260 2163392 68360 2273244 4109636
Swap: 27733868 1045556 26688312
In a program:
/* mem.c */
int main()
{
system("free");
}
And as a proof of concept:
$ gcc mem.c
$ ./a.out
total used free shared buff/cache available
Mem: 8093896 3692808 2122384 68360 2278704 4074036
Swap: 27733868 1045544 26688324
I am trying to make the .text section writable for a C program. I looked through the options provided in this SO question and zeroed on modifying the linker script to achieve this.
For this I created a writable memory region using
MEMORY { rwx (wx) : ORIGIN = 0x400000, LENGTH = 256K}
and at the section .text added:
.text :
{
*(.text.unlikely .text.*_unlikely)
*(.text.exit .text.exit.*)
*(.text.startup .text.startup.*)
*(.text.hot .text.hot.*)
*(.text .stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf32.em. */
*(.gnu.warning)
} >rwx
On compiling the code with gcc flag -T and giving my linker file as an argument I am getting an error:
error: no memory region specified for loadable section '.interp'
I am only trying to change the memory permissions for the .text region. Working on Ubuntu x86_64 architecture.
Is there a better way to do this?
Any help is highly appreciated.
Thanks
The Linker Script
Linker Script on pastie.org
In Linux, you can use mprotect() to enable/disable text section write protection from the runtime code; see the Notes section in man 2 mprotect.
Here is a real-world example. First, however, a caveat:
I consider this just a proof of concept implementation, and not something I'd ever use in a real world application. It may look enticing for use in a high-performance library of some sort, but in my experience, changing the API (or the paradigm/approach) of the library usually yields much better results -- and fewer hard-to-debug bugs.
Consider the following six files:
foo1.c:
int foo1(const int a, const int b) { return a*a - 2*a*b + b*b; }
foo2.c:
int foo2(const int a, const int b) { return a*a + b*b; }
foo.h.header:
#ifndef FOO_H
#define FOO_H
extern int foo1(const int a, const int b);
extern int foo2(const int a, const int b);
foo.h.footer:
#endif /* FOO_H */
main.c:
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include "foo.h"
int text_copy(const void *const target,
const void *const source,
const size_t length)
{
const long page = sysconf(_SC_PAGESIZE);
void *start = (char *)target - ((long)target % page);
size_t bytes = length + (size_t)((long)target % page);
/* Verify sane page size. */
if (page < 1L)
return errno = ENOTSUP;
/* Although length should not need to be a multiple of page size,
* adjust it up if need be. */
if (bytes % (size_t)page)
bytes = bytes + (size_t)page - (bytes % (size_t)page);
/* Disable write protect on target pages. */
if (mprotect(start, bytes, PROT_READ | PROT_WRITE | PROT_EXEC))
return errno;
/* Copy code.
* Note: if the target code is being executed, we're in trouble;
* this offers no atomicity guarantees, so other threads may
* end up executing some combination of old/new code.
*/
memcpy((void *)target, (const void *)source, length);
/* Re-enable write protect on target pages. */
if (mprotect(start, bytes, PROT_READ | PROT_EXEC))
return errno;
/* Success. */
return 0;
}
int main(void)
{
printf("foo1(): %d bytes at %p\n", foo1_SIZE, foo1_ADDR);
printf("foo2(): %d bytes at %p\n", foo2_SIZE, foo2_ADDR);
printf("foo1(3, 5): %d\n", foo1(3, 5));
printf("foo2(3, 5): %d\n", foo2(3, 5));
if (foo2_SIZE < foo1_SIZE) {
printf("Replacing foo1() with foo2(): ");
if (text_copy(foo1_ADDR, foo2_ADDR, foo2_SIZE)) {
printf("%s.\n", strerror(errno));
return 1;
}
printf("Done.\n");
} else {
printf("Replacing foo2() with foo1(): ");
if (text_copy(foo2_ADDR, foo1_ADDR, foo1_SIZE)) {
printf("%s.\n", strerror(errno));
return 1;
}
printf("Done.\n");
}
printf("foo1(3, 5): %d\n", foo1(3, 5));
printf("foo2(3, 5): %d\n", foo2(3, 5));
return 0;
}
function-info.bash:
#!/bin/bash
addr_prefix=""
addr_suffix="_ADDR"
size_prefix=""
size_suffix="_SIZE"
export LANG=C
export LC_ALL=C
nm -S "$#" | while read addr size kind name dummy ; do
[ -n "$addr" ] || continue
[ -n "$size" ] || continue
[ -z "$dummy" ] || continue
[ "$kind" = "T" ] || continue
[ "$name" != "${name#[A-Za-z]}" ] || continue
printf '#define %s ((void *)0x%sL)\n' "$addr_prefix$name$addr_suffix" "$addr"
printf '#define %s %d\n' "$size_prefix$name$size_suffix" "0x$size"
done || exit $?
Remember to make it executable using chmod u+x ./function-info.bash
First, compile the sources using valid sizes but invalid addresses:
gcc -W -Wall -O3 -c foo1.c
gcc -W -Wall -O3 -c foo2.c
( cat foo.h.header ; ./function-info.bash foo1.o foo2.o ; cat foo.h.footer) > foo.h
gcc -W -Wall -O3 -c main.c
The sizes are correct but the addresses are not, because the code is yet to be linked. Relative to the final binary, the object file contents are usually relocated at link time. So, link the sources to get example executable, example:
gcc -W -Wall -O3 main.o foo1.o foo2.o -o example
Extract the correct (sizes and) addresses:
( cat foo.h.header ; ./function-info.bash example ; cat foo.h.footer) > foo.h
Recompile and link,
gcc -W -Wall -O3 -c main.c
gcc -W -Wall -O3 foo1.o foo2.o main.o -o example
and verify that the constants now do match:
mv -f foo.h foo.h.used
( cat foo.h.header ; ./function-info.bash example ; cat foo.h.footer) > foo.h
cmp -s foo.h foo.h.used && echo "Done." || echo "Recompile and relink."
Due to high optimization (-O3) the code that utilizes the constants may change size, requiring a yet another recompile-relink. If the last line outputs "Recompile and relink", just repeat the last two steps, i.e. five lines.
(Note that since foo1.c and foo2.c do not use the constants in foo.h, they obviously do not need to be recompiled.)
On x86_64 (GCC-4.6.3-1ubuntu5), running ./example outputs
foo1(): 21 bytes at 0x400820
foo2(): 10 bytes at 0x400840
foo1(3, 5): 4
foo2(3, 5): 34
Replacing foo1() with foo2(): Done.
foo1(3, 5): 34
foo2(3, 5): 34
which shows that the foo1() function indeed was replaced. Note that the longer function is always replaced with the shorter one, because we must not overwrite any code outside the two functions.
You can modify the two functions to verify this; just remember to repeat the entire procedure (so that you use the correct _SIZE and _ADDR constants in main()).
Just for giggles, here is the generated foo.h for the above:
#ifndef FOO_H
#define FOO_H
extern int foo1(const int a, const int b);
extern int foo2(const int a, const int b);
#define foo1_ADDR ((void *)0x0000000000400820L)
#define foo1_SIZE 21
#define foo2_ADDR ((void *)0x0000000000400840L)
#define foo2_SIZE 10
#define main_ADDR ((void *)0x0000000000400610L)
#define main_SIZE 291
#define text_copy_ADDR ((void *)0x0000000000400850L)
#define text_copy_SIZE 226
#endif /* FOO_H */
You might wish to use a smarter scriptlet, say an awk one that uses nm -S to obtain all function names, addresses, and sizes, and in the header file replaces only the values of existing definitions, to generate your header file. I'd use a Makefile and some helper scripts.
Further notes:
The function code is copied as-is, no relocation etc. is done. (This means that if the machine code of the replacement function contains absolute jumps, the execution continues in the original code. These example functions were chosen, because they're unlikely to have absolute jumps in them. Run objdump -d foo1.o foo2.o to verify from the assembly.)
That is irrelevant if you use the example just to investigate how to modify executable code within the running process. However, if you build runtime-function-replacing schemes on top of this example, you may need to use position independent code for the replaced code (see the GCC manual for relevant options for your architecture) or do your own relocation.
If another thread or signal handler executes the code being modified, you're in serious trouble. You get undefined results. Unfortunately, some libraries start extra threads, which may not block all possible signals, so be extra careful when modifying code that might be run by a signal handler.
Do not assume the compiler compiles the code in a specific way or uses a specific organization. My example uses separate compilation units, to avoid the cases where the compiler might share code between similar functions.
Also, it examines the final executable binary directly, to obtain the sizes and addresses to be modified to modify an entire function implementation. All verifications should be done on the object files or final executable, and disassembly, instead of just looking at the C code.
Putting any code that relies on the address and size constants into a separate compilation unit makes it easier and faster to recompile and relink the binary. (You only need to recompile the code that uses the constants directly, and you can even use less optimization for that code, to eliminate extra recompile-relink cycles, without impacting the overall code quality.)
In my main.c, both the address and length supplied to mprotect() are page-aligned (based on the user parameters). The documents say only the address has to be. Since protections are page-granular, making sure the length is a multiple of the page size does not hurt.
You can read and parse /proc/self/maps (which is a kernel-generated pseudofile; see man 5 proc, /proc/[pid]/maps section, for further info) to obtain the existing mappings and their protections for the current process.
In any case, if you have any questions, I'd be happy to try and clarify the above.
Addendum:
It turns out that using the GNU extension dl_iterate_phdr() you can enable/disable write protection on all text sections trivially:
#define _GNU_SOURCE
#include <unistd.h>
#include <dlfcn.h>
#include <sys/mman.h>
#include <link.h>
static int do_write_protect_text(struct dl_phdr_info *info, size_t size, void *data)
{
const int protect = (data) ? PROT_READ | PROT_EXEC : PROT_READ | PROT_WRITE | PROT_EXEC;
size_t page;
size_t i;
page = sysconf(_SC_PAGESIZE);
if (size < sizeof (struct dl_phdr_info))
return ENOTSUP;
/* Ignore libraries. */
if (info->dlpi_name && info->dlpi_name[0] != '\0')
return 0;
/* Loop over each header. */
for (i = 0; i < (size_t)info->dlpi_phnum; i++)
if ((info->dlpi_phdr[i].p_flags & PF_X)) {
size_t ptr = (size_t)info->dlpi_phdr[i].p_vaddr;
size_t len = (size_t)info->dlpi_phdr[i].p_memsz;
/* Start at the beginning of the relevant page, */
if (ptr % page) {
len += ptr % page;
ptr -= ptr % page;
}
/* and use full pages. */
if (len % page)
len += page - (len % page);
/* Change protections. Ignore unmapped sections. */
if (mprotect((void *)ptr, len, protect))
if (errno != ENOMEM)
return errno;
}
return 0;
}
int write_protect_text(int protect)
{
int result;
result = dl_iterate_phdr(do_write_protect_text, (void *)(long)protect);
if (result)
errno = result;
return result;
}
Here is an example program you can use to test the above write_protect_text() function:
#define _POSIX_C_SOURCE 200809L
int dump_smaps(void)
{
FILE *in;
char *line = NULL;
size_t size = 0;
in = fopen("/proc/self/smaps", "r");
if (!in)
return errno;
while (getline(&line, &size, in) > (ssize_t)0)
if ((line[0] >= '0' && line[0] <= '9') ||
(line[0] >= 'a' && line[0] <= 'f'))
fputs(line, stdout);
free(line);
if (!feof(in) || ferror(in)) {
fclose(in);
return errno = EIO;
}
if (fclose(in))
return errno = EIO;
return 0;
}
int main(void)
{
printf("Initial mappings:\n");
dump_smaps();
if (write_protect_text(0)) {
fprintf(stderr, "Cannot disable write protection on text sections: %s.\n", strerror(errno));
return EXIT_FAILURE;
}
printf("\nMappings with write protect disabled:\n");
dump_smaps();
if (write_protect_text(1)) {
fprintf(stderr, "Cannot enable write protection on text sections: %s.\n", strerror(errno));
return EXIT_FAILURE;
}
printf("\nMappings with write protect enabled:\n");
dump_smaps();
return EXIT_SUCCESS;
}
The example program dumps /proc/self/smaps before and after changing the text section write protection, showing that it indeed does enable/disable write protectio on all text sections (program code). It does not try to alter write protect on dynamically-loaded libraries. This was tested to work on x86-64 using Ubuntu 3.8.0-35-generic kernel.
If you just want to have one executable with a writable .text, you can just link with -N
At least for me, binutils 2.22 , ld -N objectfile.o
will produce a binary that i can happily write around in.
Reading gcc pages, you can pass the linker option from gcc by : gcc -XN source