From APUE, I learned about the guardsize and stackaddr attributes of a stack. If the rsp register is about to drop below stackaddr, it enters the guard area of the stack, and a signal is raised to notify the program.
I am wondering if it is possible to implement a dynamically growing stack using this feature. Can you show how?
The stack can be grown "dynamically"
Set up an alternate signal stack.
Set up signal handler for SIGSEGV with SA_ONSTACK to use the alternate stack.
When the program runs out of stack, it will be issued a SIGSEGV signal.
The signal handler can use getrlimit/setrlimit with RLIMIT_STACK to change the stack size.
getrlimit and setrlimit are not explicitly mentioned in man signal-safety but I don't see why they would be a problem in a signal handler.
Here is a sample test program. It is annotated. Invoke with -d to dynamically increase the stack size from within the signal handler:
// stacktest.c -- test dynamically growing stack
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <assert.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/resource.h>
volatile int opt_t = 0;
volatile int opt_d = 0;
volatile int opt_i = 0;
void *top; // initial top of stack
// signal handler
volatile int may_grow = 1; // grow stack within signal handler
volatile int err1 = 0; // error on get
volatile int err2 = 0; // error on set
volatile int hitno; // number of signals
jmp_buf jbuf;
// alternate signal stack
unsigned char altstk[64 * 4096] __attribute__((aligned(4096)));
// xprtstr -- print string in signal handler
void
xprtstr(const char *str)
{
size_t len = strlen(str);
write(1,str,len);
}
// xprtnum -- print number in signal handler
void
xprtnum(unsigned long val,const char *sym)
{
static const char *hex = "0123456789ABCDEF";
xprtstr(" ");
xprtstr(sym);
xprtstr("=");
char buf[100];
char *bp = &buf[50];
*bp-- = 0;
for (int idx = 0; idx < 16; ++idx, --bp) {
*bp = hex[val & 0x0F];
val >>= 4;
}
++bp;
xprtstr(bp);
}
// sigfault -- SIGSEGV handler
void
sigfault(int signo,siginfo_t *info,void *vp)
{
++hitno;
xprtstr("ISR hit");
xprtnum(signo,"signo");
xprtnum(hitno,"hitno");
xprtstr("\n");
struct rlimit rlim;
do {
if (opt_t) {
xprtstr("ISR test\n");
siglongjmp(jbuf,1);
break;
}
// dynamically grow the stack
if (may_grow) {
xprtstr("ISR grow\n");
may_grow = 0;
err1 = getrlimit(RLIMIT_STACK,&rlim);
xprtnum(rlim.rlim_cur,"rlim_cur");
xprtnum(rlim.rlim_cur / 1024,"rlim_cur");
xprtstr("\n");
rlim.rlim_cur += 8 * 1024 * 1024;
err2 = setrlimit(RLIMIT_STACK,&rlim);
getrlimit(RLIMIT_STACK,&rlim);
xprtnum(rlim.rlim_cur,"rlim_cur");
xprtnum(rlim.rlim_cur / 1024,"rlim_cur");
xprtstr("\n");
break;
}
// stop the program
xprtstr("ISR stop\n");
siglongjmp(jbuf,2);
} while (0);
}
// loop -- recursive function to overflow stack
void
loop(unsigned char *old)
{
// get some space on the stack
unsigned char cur[4096];
// get current stack frame address
void *frame = __builtin_frame_address(0);
// get amount of space used on stack
size_t dif = top - frame;
// show where we are
printf("loop: top=%p frame=%p dif=%8.8zX/%zu may_grow=%d\n",
top,frame,dif,dif / 1024,may_grow);
// keep consuming more stack
// NOTE: we don't actually use cur/old but ensure that the compiler won't
// optimize it away
loop(cur);
}
#define SHOWFLAGS(_msk) \
if (flags & (_msk)) \
printf(" " #_msk)
void
showsa(const struct sigaction *sa,const char *who)
{
int flags = sa->sa_flags;
// show the signal flags
printf("showflags: %s flags=%8.8X",who,flags);
SHOWFLAGS(SA_NOCLDSTOP);
SHOWFLAGS(SA_NOCLDWAIT);
SHOWFLAGS(SA_NODEFER);
SHOWFLAGS(SA_ONSTACK);
SHOWFLAGS(SA_RESETHAND);
SHOWFLAGS(SA_RESTART);
SHOWFLAGS(SA_SIGINFO);
// show the signal mask
for (int signo = 1; signo < 32; ++signo) {
if (sigismember(&sa->sa_mask,signo))
printf(" S%d",signo);
}
printf("\n");
}
// setup_signal -- set up signal handler
void
setup_signal(int signo)
{
struct sigaction sa;
assert(sigaction(signo,NULL,&sa) == 0);
sa.sa_sigaction = sigfault;
showsa(&sa,"BEF");
sa.sa_flags |= SA_SIGINFO;
sa.sa_flags |= SA_ONSTACK;
sa.sa_flags &= ~SA_RESETHAND;
assert(sigaction(signo,&sa,NULL) == 0);
assert(sigaction(signo,NULL,&sa) == 0);
showsa(&sa,"AFT");
}
// setup_altstack -- set up alternate signal stack
void
setup_altstack(void)
{
stack_t ss;
sigaltstack(NULL,&ss);
printf("setup_altstack: ss_sp=%p ss_flags=%8.8X ss_size=%zu\n",
ss.ss_sp,ss.ss_flags,ss.ss_size);
void *sp = altstk;
sp += 16 * 4096;
printf("altstk=%p sp=%p\n",altstk,sp);
ss.ss_sp = sp;
ss.ss_size = sizeof(altstk) / 2;
ss.ss_flags = SS_ONSTACK;
assert(sigaltstack(&ss,NULL) == 0);
sigaltstack(NULL,&ss);
printf("setup_altstack: ss_sp=%p ss_flags=%8.8X ss_size=%zu\n",
ss.ss_sp,ss.ss_flags,ss.ss_size);
if (ss.ss_sp != sp) {
printf("setup_altstack: failed\n");
exit(99);
}
}
#define SHOWOPT(_opt,_reason) \
if (_opt) \
printf(#_opt " -- " _reason)
int
main(int argc,char **argv)
{
setlinebuf(stdout);
--argc;
++argv;
for (; argc > 0; --argc, ++argv) {
char *cp = *argv;
if (*cp != '-')
break;
cp += 2;
switch (cp[-1]) {
case 'd':
opt_d = ! opt_d;
break;
case 'i':
opt_i = ! opt_i;
break;
case 't':
opt_t = ! opt_t;
break;
}
}
// show the options
SHOWOPT(opt_d,"dynamically grow stack within signal handler");
SHOWOPT(opt_i,"grow stack initially");
SHOWOPT(opt_t,"test sigsetjmp/siglongjmp");
// set up alternate signal stack and signal handler
if (opt_d || opt_t) {
setup_altstack();
setup_signal(SIGSEGV);
}
// test our sigsetjmp/siglongjmp
if (opt_t) {
for (int try = 1; try <= 2; ++try) {
if (! sigsetjmp(jbuf,1)) {
printf("main: ptr try=%d\n",try);
unsigned int *ptr = NULL;
*ptr = 23;
}
else
printf("main: resume try=%d\n",try);
}
opt_t = 0;
}
// set up large stack outside of signal handler
if (opt_i) {
struct rlimit rlim;
getrlimit(RLIMIT_STACK,&rlim);
printf("rlim_cur=%lu/%lu\n",rlim.rlim_cur,rlim.rlim_cur / 1024);
rlim.rlim_cur = 32 * 1024 * 1024;
setrlimit(RLIMIT_STACK,&rlim);
getrlimit(RLIMIT_STACK,&rlim);
printf("rlim_cur=%lu/%lu\n",rlim.rlim_cur,rlim.rlim_cur / 1024);
}
printf("main: loop\n");
// dynamically grow the stack
may_grow = opt_d;
// top of stack
top = __builtin_frame_address(0);
if (! sigsetjmp(jbuf,1))
loop(NULL);
else
printf("main: stop\n");
return 0;
}
UPDATE:
This describes how to make "the" stack grow dynamically (very interesting!), but I take the question to be about the stacks of threads other than the initial one, or at least inclusive of those stacks. I don't think this answer addresses them. –
John Bollinger
Although the question was tagged with pthreads, I'm not sure if OP actually was talking about subthreads doing this.
I'm not sure that this can be done for subthreads (created via pthread_create). At least not by using a default pthread_create call.
Without special pthread_attr_t values, the default for pthread_create is to allocate a stack [of a default size]. With attributes, the caller might set a larger stack size. And/or the caller will [usually] do an explicit malloc and pass the address along (with the size).
AFAICT, from reading glibc source, pthread_create will not set up a guard area if the user provides a stack pointer. Setting up the guard area [if there is to be one] is the responsibility of the caller.
But, in either case, if the stack overflows (with a guard area), a signal will be generated (SIGSEGV ?).
But, what can one do at that point???
The [pthread internal] function that calls the user's start_routine, the start_routine itself, and any functions that the start routine has called, already have pointers to things on that "old" stack [we must assume this].
So, the per-thread stack can not be moved (i.e. no realloc).
The only way to [possibly] do this is for the caller to provide an explicit stack pointer [and size]. As mentioned, the caller must set up the guard pages (via mprotect, I assume).
Although userfaultfd et. al. might be usable/preferable, I'm going to assume that the caller must use an explicit mmap call (vs. using the heap malloc/realloc).
The main thread's stack [generally] grows downward from the top of the virtual memory space. It can grow until all physical memory is used up and the paging disk is full.
However, for a per-thread (subthread) stack, the caller must decide on the maximum per-thread stack size before creating the thread. More on this below.
It can set up a stack (via mmap) of a smaller initial size. Once the thread is created, the base/top stack address must remain constant.
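For illustration, here is a minimal sketch of a caller-provided stack with an mprotect guard page, handed to pthread_create via pthread_attr_setstack. The sizes and names are invented, and error handling is abbreviated:

#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>

#define PAGE       4096UL
#define STACK_MAX  (8UL * 1024 * 1024)   // assumed per-thread maximum

static void *worker(void *arg) { return arg; }

int main(void)
{
    // Reserve the whole region for this thread's stack.
    char *base = mmap(NULL, STACK_MAX, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }

    // The lowest page becomes the guard page: touching it raises SIGSEGV.
    if (mprotect(base, PAGE, PROT_NONE) != 0) { perror("mprotect"); return 1; }

    pthread_attr_t attr;
    pthread_attr_init(&attr);
    // Hand the usable part (above the guard page) to pthread_create.
    pthread_attr_setstack(&attr, base + PAGE, STACK_MAX - PAGE);

    pthread_t tid;
    if (pthread_create(&tid, &attr, worker, NULL) != 0) {
        fprintf(stderr, "pthread_create failed\n");
        return 1;
    }
    pthread_join(tid, NULL);
    pthread_attr_destroy(&attr);
    munmap(base, STACK_MAX);
    return 0;
}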
Edit: The following is modified by additional thoughts below.
If a fault occurs, the signal handler could try an extension of the stack. There may be a few ways to do this. My best guess for this is:
It may have to copy/save the existing stack data [somewhere ;-)] if the remap below does not copy the existing data the way realloc does.
Temporarily undo any guard pages.
unmap/remap the stack at the same address but with a larger size (via mremap and/or mmap using MREMAP_FIXED/MAP_FIXED).
If necessary, copy back the stack data onto the "new" stack.
Set up new guard pages.
return from the signal handler [and hope ;-)].
As I said, subthread stacks can't grow "infinitely" as the main thread stack can (via setrlimit).
The sizes/addresses in the example below are not "to scale" ...
Consider two threads (e.g. tA and tB) that start with a small size:
tA's stack at xxxx1000 with size 1000
tB's stack at xxxx2000 with size 1000
If tB hits its guard page and faults, what happens? It has no room to extend its stack downward without colliding with the top of tA's stack.
So, we must map the stacks with sufficient space to grow to the "maximum" and we must know that beforehand.
We need to "space out" the stack addresses so they have room to grow [even if not all pages are mapped initially].
Let's assume that the "maximum" size is 10000. A mapping that would work is:
tA's stack at xxxx1000 with size 1000
tB's stack at xxx11000 with size 1000
Now, if tB hits its guard page, it can extend its stack up to the maximum of 10000
Additional thoughts ...
We probably must initially map the entire maximum region. Otherwise, other unrelated mmap calls may grab space in the proposed extension area. Either mmap from the heap manager, shared memory mappings, or even mmap calls done when setting up other threads.
So, we don't need to use mremap to ever increase the size of the region. Rather, we should use madvise with MADV_DONTNEED on the area that we're not currently using.
If we do this, we don't need a signal handler or guard pages to invoke it to extend the area. Just doing MADV_DONTNEED will keep resource usage low(er).
The thread function can (after popping the stack a bit) release the "popped" area with MADV_DONTNEED.
Other MADV_* options might be better.
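As a rough sketch of that scheme, under the assumption that the whole maximum region is mapped once up front (the sizes are arbitrary):

#include <stdio.h>
#include <sys/mman.h>

#define STACK_MAX  (16UL * 1024 * 1024)   // assumed per-thread maximum

int main(void)
{
    // Map the full maximum once so no unrelated mmap can land inside it.
    char *base = mmap(NULL, STACK_MAX, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }

    // ... the thread runs, pushes deep into the stack, then unwinds ...

    // Release the pages no longer in use. The address range stays reserved;
    // only the physical pages behind it are given back to the kernel.
    size_t unused = 8UL * 1024 * 1024;    // assumed amount that has been "popped"
    if (madvise(base, unused, MADV_DONTNEED) != 0)
        perror("madvise");

    munmap(base, STACK_MAX);
    return 0;
}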
All of the above strikes me as craziness! The only use would be a [hugely] multithreaded app that is doing deeply recursive functions.
But, if that's the case, it would be better to convert the recursive functions into ones that don't use actual program stack based recursion. Rather, the function manages an array of structs as a software controlled [pseudo] "stack". Each struct has all the variables that were function/automatic scope in the program stack frame.
The size of this array can be controlled with realloc. Or, we can implement the "stack" as a linked list of these structs with a "slab" allocation scheme. With the slab scheme, all threads could share the same slab allocator. This has the advantage that the amount of memory required can be smaller than if each thread had its own/private slab allocator.
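As a toy illustration of that conversion (the frame struct, function names, and the divide-and-conquer example are all invented), a "recursive" range sum can be driven by a realloc-grown array of frames instead of the program stack:

#include <stdio.h>
#include <stdlib.h>

struct frame {                 // one struct per "call": holds what were the locals
    long lo, hi;
};

static long sum_range(long lo, long hi)
{
    size_t cap = 64, top = 0;
    struct frame *stk = malloc(cap * sizeof(*stk));
    long total = 0;

    if (stk == NULL)
        return -1;
    stk[top++] = (struct frame){ lo, hi };       // the initial "call"
    while (top > 0) {
        struct frame f = stk[--top];             // "returning" pops a frame
        if (f.hi - f.lo <= 1) {                  // base case
            total += f.lo;
            continue;
        }
        if (top + 2 > cap) {                     // grow the software stack
            cap *= 2;
            stk = realloc(stk, cap * sizeof(*stk));  // error check omitted for brevity
        }
        long mid = f.lo + (f.hi - f.lo) / 2;     // "recursing" pushes two frames
        stk[top++] = (struct frame){ f.lo, mid };
        stk[top++] = (struct frame){ mid, f.hi };
    }
    free(stk);
    return total;
}

int main(void)
{
    printf("%ld\n", sum_range(0, 10));           // prints 45 (0 + 1 + ... + 9)
    return 0;
}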
So, IMO, don't do this on the real, per-thread program stack!
Okay, so that's what I've come up with. I've not written code for it or tested it. That's an exercise I think I'll leave to the reader ;-)
malloc/calloc apparently use swap space to satisfy a request that exceeds available free memory, and that pretty much hangs the system: the disk-activity light stays on constantly. After it happened to me, and I wasn't immediately sure why, I wrote the following little test program to check that this is indeed why the system was hanging,
/* --- test how many bytes can be malloc'ed successfully --- */
#include <stdio.h>
#include <stdlib.h>
int main ( int argc, char *argv[] ) {
unsigned int nmalloc = (argc>1? atoi(argv[1]) : 10000000 ),
size = (argc>2? atoi(argv[2]) : (0) );
unsigned char *pmalloc = (size>0? calloc(nmalloc,size):malloc(nmalloc));
fprintf( stdout," %s malloc'ed %d elements of %d bytes each.\n",
(pmalloc==NULL? "UNsuccessfully" : "Successfully"),
nmalloc, (size>0?size:1) );
if ( pmalloc != NULL ) free(pmalloc);
} /* --- end-of-function main() --- */
And that indeed hangs the system if the product of your two command-line args exceeds physical memory. The easiest solution would be some way whereby malloc/calloc automatically just fail. Harder and less portable would be to write a little wrapper that popen()s a free command, parses the output, and only calls malloc/calloc if the request can be satisfied by the available "free" memory, maybe with a little safety factor built in.
Is there any easier and more portable way to accomplish that? (Apparently similar to this question can calloc or malloc be used to allocate ONLY physical memory in OSX?, but I'm hoping for some kind of "yes" answer.)
Edit
Decided to follow Tom's /proc/meminfo suggestion. That is, rather than popen()ing "free", just directly parse the existing and easily parsed /proc/meminfo file. And then, a one-line macro of the form
#define noswapmalloc(n) ( (n) < 1000l*memfree(NULL)/2? malloc(n) : NULL )
finishes the job. memfree(), shown below, isn't as portable as I'd like, but can easily and transparently be replaced by a better solution if/when the need arises, which isn't now.
#define _GNU_SOURCE /* must precede all #includes so <string.h> declares strcasestr() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* ==========================================================================
* Function: memfree ( memtype )
* Purpose: return number of Kbytes of available memory
* (as reported in /proc/meminfo)
* --------------------------------------------------------------------------
* Arguments: memtype (I) (char *) to null-terminated, case-insensitive
* (sub)string matching first field in
* /proc/meminfo (NULL uses MemFree)
* --------------------------------------------------------------------------
* Returns: ( int ) #Kbytes of memory, or -1 for any error
* --------------------------------------------------------------------------
* Notes: o
* ======================================================================= */
/* --- entry point --- */
int memfree ( char *memtype ) {
/* ---
* allocations and declarations
* ------------------------------- */
static char memfile[99] = "/proc/meminfo"; /* linux standard */
static char deftype[99] = "MemFree"; /* default if caller passes null */
FILE *fp = fopen(memfile,"r"); /* open memfile for read */
char memline[999]; /* read memfile line-by-line */
int nkbytes = (-1); /* #Kbytes, init for error */
/* ---
* read memfile until line with desired memtype found
* ----------------------------------------------------- */
if ( memtype == NULL ) memtype = deftype; /* caller wants default */
if ( fp == NULL ) goto end_of_job; /* but we can't get it */
while ( fgets(memline,512,fp) /* read next line */
!= NULL ) { /* quit at eof (or error) */
if ( strcasestr(memline,memtype) /* look for memtype in line */
!= NULL ) { /* found line with memtype */
char *delim = strchr(memline,':'); /* colon following MemType */
if ( delim != NULL ) /* NULL if file format error? */
nkbytes = atoi(delim+1); /* num after colon is #Kbytes */
break; } /* no need to read further */
} /* --- end-of-while(fgets()!=NULL) --- */
end_of_job: /* back to caller with nkbytes */
if ( fp != NULL ) fclose(fp); /* close /proc/meminfo file */
return ( nkbytes ); /* and return nkbytes to caller */
} /* --- end-of-function memfree() --- */
#if defined(MEMFREETEST)
int main ( int argc, char *argv[] ) {
char *memtype = ( argc>1? argv[1] : NULL );
int memfree();
printf ( " memfree(\"%s\") = %d Kbytes\n Have a nice day.\n",
(memtype==NULL?" ":memtype), memfree(memtype) );
} /* --- end-of-function main() --- */
#endif
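A call site for noswapmalloc() then looks like an ordinary allocation (the 100 MB request is just an example):

char *buf = noswapmalloc(100 * 1024 * 1024); /* refused if it wouldn't fit in half of free RAM */
if ( buf == NULL )
    fprintf(stderr, "request too large for available physical memory\n");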
malloc/calloc apparently use swap space to satisfy a request that exceeds available free memory.
Well, no.
Malloc/calloc use virtual memory. The "virtual" means that it's not real - it's an artificially constructed illusion made out of fakery and lies. Your entire process is built on these artificially constructed illusions - a thread is a virtual CPU, a socket is a virtual network connection, the C language is really a specification for a "C abstract machine", a process is a virtual computer (that implements the language's abstract machine).
You're not supposed to look behind the magic curtain. You're not supposed to know that physical memory exists. The system doesn't hang - the illusion is just slower, but that's fine because the C abstract machine says nothing about how long anything is supposed to take and does not provide any performance guarantees.
More importantly, because of the illusion, software works. It doesn't crash just because there's not enough physical memory. Failure would mean the software takes an infinite amount of time to complete successfully, and "an infinite amount of time" is many orders of magnitude worse than "slower because of swap space".
How to get malloc/calloc to fail if request exceeds free physical memory (i.e., don't use swap)
If you are going to look behind the magic curtain, you need to define your goals carefully.
For one example, imagine if your process has 123 MiB of code and there's currently 1000 MiB of free physical RAM; but (because the code is in virtual memory) only a tiny piece of the code is using real RAM (and the rest of the code is on disk because the OS/executable loader used memory mapped files to avoid wasting real RAM until it's actually necessary). You decide to allocate 1000 MiB of memory (and because the OS creating the illusion isn't very good, unfortunately this causes 1000 MiB of real RAM to be allocated). Next, you execute some more code, but the code you execute isn't in real memory yet, so the OS has to fetch the code from the file on the disk into physical RAM, but you consumed all of the physical RAM so the OS has to send some of the data to swap space.
For another example, imagine if your process has 1 MiB of code and 1234 MiB of data that was carefully allocated to make sure that everything fits in physical memory. Then a completely different process is started and it allocates 6789 MiB of memory for its code and data; so the OS sends all of your process' data to swap space to satisfy the other process that you have no control over.
EDIT
The problem here is that the OS providing the illusion is not very good. When you allocate a large amount of virtual memory with malloc() or calloc(), the OS should be able to use a tiny piece of real memory to lie to you and avoid consuming a large amount of real memory. Specifically (for most modern operating systems running on normal hardware), the OS should be able to fill a huge area of virtual memory with a single page full of zeros that is mapped many times (at many virtual addresses) as "read only", so that allocating a huge amount of virtual memory costs almost no physical RAM at all (until you write to the virtual memory, causing the OS to allocate the least physical memory needed to satisfy the modifications). Of course if you eventually do write to all of the allocated virtual memory, then you'll end up exhausting physical memory and using some swap space; but this will probably happen gradually and not all at once - many tiny delays scattered over a large period of time are far less likely to be noticed than a single huge delay.
With this in mind, I'd be tempted to try using mmap(..., MAP_ANONYMOUS, ...) instead of the (poorly implemented) malloc() or calloc(). This might mean that you have to deal with the possibility that the allocated virtual memory isn't guaranteed to be initialized to zeros, but (depending on what you're using the memory for) that's likely to be easy to work around.
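For example, a minimal sketch of that mmap approach (the 1 GiB figure is arbitrary):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t nbytes = 1UL << 30;   // 1 GiB of virtual address space

    // Anonymous private mapping: no physical RAM is committed until the
    // pages are actually written to.
    unsigned char *p = mmap(NULL, nbytes, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    p[0] = 1;   // touching a page is what finally consumes real memory
    printf("mapped %zu bytes at %p\n", nbytes, (void *)p);

    munmap(p, nbytes);
    return 0;
}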
Expanding on a comment I made to the original question:
If you want to disable swapping, use the swapoff command (sudo swapoff -a). I usually run my machine that way, to avoid it freezing when firefox does something it shouldn't. You can use setrlimit() (or the ulimit command) to set a maximum VM size, but that won't properly compensate for some other process suddenly deciding to be a memory hog (see above).
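For instance, a small sketch of the setrlimit() option, assuming a 64-bit Linux system and an arbitrary 2 GiB cap:

#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;
    if (getrlimit(RLIMIT_AS, &rl) != 0) { perror("getrlimit"); return 1; }

    rl.rlim_cur = 2UL * 1024 * 1024 * 1024;   // 2 GiB soft cap on total address space
    if (setrlimit(RLIMIT_AS, &rl) != 0) { perror("setrlimit"); return 1; }

    // Requests that would push the process past the cap now fail cleanly
    // instead of being satisfied out of swap.
    void *p = malloc(4UL * 1024 * 1024 * 1024);   // 4 GiB request
    printf("4 GiB malloc %s\n", p != NULL ? "succeeded" : "failed");
    free(p);
    return 0;
}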
Even if you choose one of the above options, you should read the rest of this answer to see how to avoid unnecessary initialisation on the first call to calloc().
As for your precise test harness, it turns out that you are triggering an unfortunate exception to GNU calloc()'s optimisation.
Here's a comment (now deleted) I made to another answer, which turns out to not be strictly speaking accurate:
I checked the glibc source for the default gnu/linux malloc library, and verified that calloc() does not normally manually clear memory which has just been mmap'd. And malloc() doesn't touch the memory at all.
It turns out that I missed one exception to the calloc optimisation. Because of the way the GNU malloc implementation initialises the malloc system, the first call to calloc always uses memset() to set the newly-allocated storage to 0. Every other call to calloc() passes through the entire calloc logic, which avoids calling memset() on storage which has been freshly mmap'd.
So the following modification to the test program shows radically different behaviour:
#include <stdio.h>
#include <stdlib.h>
int main ( int argc, char *argv[] ) {
/* These three lines were added */
void* tmp = calloc(1000, 1); /* force initialization */
printf("Allocated 1000 bytes at %p\n", tmp);
free(tmp);
/* The rest is unchanged */
unsigned int nmalloc = (argc>1? atoi(argv[1]) : 10000000 ),
size = (argc>2? atoi(argv[2]) : (0) );
unsigned char *pmalloc = (size>0? calloc(nmalloc,size):malloc(nmalloc));
fprintf( stdout," %s malloc'ed %d elements of %d bytes each.\n",
(pmalloc==NULL? "UNsuccessfully" : "Successfully"),
nmalloc, (size>0?size:1) );
if ( pmalloc != NULL ) free(pmalloc);
}
Note that if you set MALLOC_PERTURB_ to a non-zero value, then it is used to initialise malloc()'d blocks, and forces calloc()'d blocks to be initialised to 0. That's used in the test below.
In the following, I used /usr/bin/time to show the number of page faults during execution. Pay attention to the number of minor faults, which are the result of the operating system zero-initialising a previously unreferenced page in an anonymous mmap'd region (and some other occurrences, like mapping a page already present in Linux's page cache). Also look at the resident set size and, of course, the execution time.
$ gcc -Og -ggdb -Wall -o mall mall.c
$ # A simple malloc completes instantly without page faults
$ /usr/bin/time ./mall 4000000000
Allocated 1000 bytes at 0x55b94ff56260
Successfully malloc'ed -294967296 elements of 1 bytes each.
0.00user 0.00system 0:00.00elapsed 100%CPU (0avgtext+0avgdata 1600maxresident)k
0inputs+0outputs (0major+61minor)pagefaults 0swaps
$ # Unless we tell malloc to initialise memory
$ MALLOC_PERTURB_=35 /usr/bin/time ./mall 4000000000
Allocated 1000 bytes at 0x5648c2436260
Successfully malloc'ed -294967296 elements of 1 bytes each.
0.19user 1.23system 0:01.43elapsed 99%CPU (0avgtext+0avgdata 3907584maxresident)k
0inputs+0outputs (0major+976623minor)pagefaults 0swaps
# Same, with calloc. No page faults, instant completion.
$ /usr/bin/time ./mall 1000000000 4
Allocated 1000 bytes at 0x55e8257bb260
Successfully malloc'ed 1000000000 elements of 4 bytes each.
0.00user 0.00system 0:00.00elapsed 100%CPU (0avgtext+0avgdata 1656maxresident)k
0inputs+0outputs (0major+62minor)pagefaults 0swaps
$ # Again, setting the magic malloc config variable changes everything
$ MALLOC_PERMUTE_=35 /usr/bin/time ./mall 1000000000 4
Allocated 1000 bytes at 0x5646f391e260
Successfully malloc'ed 1000000000 elements of 4 bytes each.
0.00user 0.00system 0:00.00elapsed 100%CPU (0avgtext+0avgdata 1656maxresident)k
0inputs+0outputs (0major+62minor)pagefaults 0swaps
Why can't some of glibc's APIs (such as malloc(), realloc() or free()) be called correctly in threads that are created by the clone syscall?
Here is my code only for testing:
int thread_func( void *arg )
{
void *ptr = malloc( 4096 );
printf( "tid=%d, ptr=%x\n", gettid(), ptr );
sleep(1);
if( ptr )
free( ptr );
return 0;
}
int main( int argc, char **argv )
{
int i, m;
void *stk;
int stksz = 1024 * 128;
int flag = CLONE_VM | CLONE _FILES | CLONE_FS | CLONE_SIGHAND;
for( i=m=0; i < 100; i++ )
{
stk = malloc( stksz );
if( !stk ) break;
if( clone( thread_func, stk+stksz, flags, NULL, NULL, NULL, NULL ) != -1 )
m++;
}
printf( "create %d thread\n", m );
sleep(10);
return 0;
}
Test result: the threads running thread_func, or the main thread, randomly block in malloc() or free(); sometimes malloc() or free() crashes instead.
I think maybe malloc() and free() need certain TLS data to distinguish each thread.
Does anyone know the reason, and what solution can be used to resolve this problem?
I think maybe malloc() and free() need certain TLS data to distinguish each thread.
Glibc's malloc() and free() do not rely on TLS. They use mutexes to protect the shared memory-allocation data structures. To reduce contention for those, they employ a strategy of maintaining separate memory-allocation arenas with independent metadata and mutexes. This is documented on their manual page.
After correcting the syntax errors in your code and dummying-out the call to non-existent function gettid() (see comments on the question), I was able to produce segmentation faults, but not blockage. Perhaps you confused the exit delay caused by your program's 10-second sleep with blockage.
In addition to any issue that may have been related to your undisclosed implementation of gettid(), your program contains two semantic errors, each producing undefined behavior:
As I already noted in comments, it passes the wrong child-stack pointer values.*
It uses the wrong printf() directive in thread_func() for printing the pointer. The directive for pointer values is %p; %x is for arguments of type unsigned int.
After I corrected those errors as well, the program consistently ran to completion for me. Revised code:
#define _GNU_SOURCE        /* for clone() and the CLONE_* flags in <sched.h> */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int thread_func(void *arg) {
void *ptr = malloc(4096);
// printf( "tid=%d, ptr=%x\n", gettid(), ptr );
printf("tid=%d, ptr=%p\n", 1, ptr);
sleep(1);
if (ptr) {
free(ptr);
}
return 0;
}
int main(int argc, char **argv) {
int i, m;
char *stk; // Note: char * instead of void * to afford arithmetic
int stksz = 1024 * 128;
int flags = CLONE_VM | CLONE_FILES | CLONE_FS | CLONE_SIGHAND;
for (i = m = 0; i < 100; i++) {
stk = malloc( stksz );
if( !stk ) break;
if (clone(thread_func, stk + stksz - 1, flags, NULL, NULL, NULL, NULL ) != -1) {
m++;
}
}
printf("create %d thread\n", m);
sleep(10);
return 0;
}
Even with that, however, all is not completely well: I see various anomalies in the program output, especially near the beginning.
The bottom line is that, contrary to your assertion, you are not creating any threads, at least not in the sense that the C library recognizes. You are merely creating processes that have behavior similar to threads'. That may be sufficient for some purposes, but you cannot rely on the system to treat such processes identically to threads.
On Linux, bona fide threads that the system and standard library will recognize are POSIX threads, launched via pthread_create(). (I note here that modifying your program to use pthread_create() instead of clone() resolved the output anomalies for me.) You might be able to add flags and arguments to your clone() calls that make the resulting processes enough like the Linux implementation of pthreads to be effectively identical, but whyever would you do such a thing instead of just using real pthreads in the first place?
* The program also performs pointer arithmetic on a void *, which C does not permit. GCC accepts that as an extension, however, and since your code is deeply Linux-specific anyway, I'm letting that slide with only this note.
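For reference, here is a rough sketch of the pthread_create() version (thread_func's signature changes to the pthread form; the thread count matches the original):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void *thread_func(void *arg)
{
    void *ptr = malloc(4096);
    printf("thread=%lu, ptr=%p\n", (unsigned long)pthread_self(), ptr);
    sleep(1);
    free(ptr);
    return arg;
}

int main(void)
{
    pthread_t tids[100];
    int m = 0;

    for (int i = 0; i < 100; i++) {
        // pthread_create maps and guards the stack itself and sets up the
        // TCB/TLS that glibc's malloc, errno, etc. rely on.
        if (pthread_create(&tids[m], NULL, thread_func, NULL) == 0)
            m++;
    }
    printf("created %d threads\n", m);

    for (int i = 0; i < m; i++)
        pthread_join(tids[i], NULL);
    return 0;
}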
Correct, malloc and free need TLS for at least the following things:
The malloc arena attached to the current thread (used for allocation operations).
The errno TLS variable (written to when system calls fail).
The stack protector canary (if enabled and the architecture stores the canary in the TCB).
The malloc thread cache (enabled by default in the upcoming glibc 2.26 release).
All these items need a properly initialized thread control block (TCB), but curiously, until recently and as far as malloc/free were concerned, it almost did not matter if a thread created with clone shared its TCB with another thread (so that the data is no longer thread-local):
Threads basically never reattach themselves to a different arena, so the arena TLS variable is practically read-only after initialization, and multiple threads can share a single arena.
errno can be shared as long as system calls only fail in one of the threads sharing it.
The stack protector canary is read-only after process startup, and its value is identical across threads anyway.
But all this is an implementation detail, and things change radically in glibc 2.26 with its malloc thread cache: The cache is read and written without synchronization, so it is very likely that what you are trying to do results in memory corruption.
This is not a material change in glibc 2.26; it is how things have always been: calling any glibc function from a thread created with clone is undefined. As John Bollinger pointed out, this mostly worked by accident before, but I can assure you that it has always been completely undefined.
How do I know the amount of memory used, i.e., RAM usage?
#include <stdio.h>
int main()
{
int i=0;
for(i=0;i<100;i++)
{
printf("%d\n",i);
}
return 0;
}
I want to write code which calculates the amount of memory used by this program, maybe something like:
int main()
{
int i=0;
for(i=0;i<100;i++)
{
printf("%d\n",i);
}
printf("Amount of memory consumed=%f",SOME_FUNCTION());
return 0;
}
The getrusage system call returns a handful of usage statistics for the current process, among which is the maximum "resident set size":
struct rusage usage;
if (!getrusage(RUSAGE_SELF, &usage)) {
printf("Maximum resident set size (KB): %ld\n", usage.ru_maxrss);
} else {
perror("getrusage");
}
This size reflects the peak amount of memory physically resident for the process, not the entire size of the virtual address space, parts of which might be paged out or never loaded.
It's not easy to check how much memory your program uses on a Linux system, but most likely what you want is the value of VmRSS in /proc/[pid]/status (or the second column of /proc/[pid]/statm). VmRSS ("resident set size") is the amount of physical memory your process currently uses.
Other than that, you may be interested in VmSize from /proc/[pid]/status (or the first column of /proc/[pid]/statm). This is the total memory your process uses, including memory swapped out, memory used by shared libraries, and memory-mapped resources (which, in general, don't consume real RAM).
To get the PID of your process, use getpid(). From within your process you could also simply read /proc/self/status, as in the sketch below.
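Here is a small Linux-specific sketch that reads VmRSS from /proc/self/status; the helper name vmrss_kb is made up, and the "VmRSS: <n> kB" field format is assumed:

#include <stdio.h>
#include <string.h>

// vmrss_kb -- return this process's VmRSS in kB, or -1 on error
long vmrss_kb(void)
{
    FILE *fp = fopen("/proc/self/status", "r");
    char line[256];
    long kb = -1;

    if (fp == NULL)
        return -1;
    while (fgets(line, sizeof line, fp) != NULL) {
        if (strncmp(line, "VmRSS:", 6) == 0) {
            sscanf(line + 6, "%ld", &kb);   // value is reported in kB
            break;
        }
    }
    fclose(fp);
    return kb;
}

int main(void)
{
    printf("VmRSS = %ld kB\n", vmrss_kb());
    return 0;
}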
A simple approach would be to create a wrapper function for memory allocation and freeing, call the wrapper everywhere, and keep the memory-usage bookkeeping inside it. This can only account for dynamic memory allocation, e.g.:
#include <stdlib.h>   /* malloc(), free() */

#define ALLOC 1
#define FREE 2

void *mem_op(void *pointer, int size, int operation)
{
    static int mem_used;              /* running total of bytes handed out */
    switch (operation)
    {
    case ALLOC:
        pointer = malloc(size);       /* call malloc (or calloc) here */
        if (pointer != NULL)
            mem_used = mem_used + size;
        break;
    case FREE:
        free(pointer);                /* call free here */
        mem_used = mem_used - size;
        break;
    }
    return pointer;
}
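With the wrapper above returning the pointer it obtained, a call site might look like this (the element count is arbitrary):

int *arr = mem_op(NULL, 100 * sizeof(int), ALLOC);  /* allocate and record the size */
/* ... use arr ... */
mem_op(arr, 100 * sizeof(int), FREE);               /* free and subtract the size again */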
Reading Martin Sustrik's blog on the challenges of preventing "undefined behavior" in C++ vs. C, in particular the problem of malloc() failing due to memory exhaustion, I was reminded of the many, many times I have been frustrated not knowing what to do in such cases.
With virtual memory systems such conditions are rare. But on embedded platforms, or where the performance degradation from falling back on the virtual memory system amounts to failure, as in Martin's case with ZeroMQ, the problem is real, so I resolved to find a workable solution, and did.
I wanted to ask the readers of StackOverflow if they've tried this approach, and what their experience with it was.
The solution is to allocate a chunk of spare memory off the heap with a call to malloc() at the start of the program, and then use that pool of spare memory to stave off memory exhaustion when and if it occurs. The idea is to prevent capitulation in favor of an orderly retreat (I was reading the accounts of Kesselring's defense of Italy last night) where error messages and IP sockets and such will work long enough to (hopefully) at least tell the user what happened.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SPARE_MEM_SIZE (1<<20) // reserve a megabyte
static void *gSpareMem;
// ------------------------------------------------------------------------------------------------
void *tenacious_malloc(int requested_allocation_size) {
static int remaining_spare_size = 0; // SPARE_MEM_SIZE;
char err_msg[512];
void *rtn = NULL;
// attempt to re-establish the full size of spare memory, if it needs it
if (SPARE_MEM_SIZE != remaining_spare_size) {
if(NULL != (gSpareMem = realloc(gSpareMem, SPARE_MEM_SIZE))) {
remaining_spare_size = SPARE_MEM_SIZE;
// "touch" the memory so O/S will allocate physical memory
memset(gSpareMem, 0, SPARE_MEM_SIZE);
printf("\nSize of spare memory pool restored successfully in %s:%s at line %i :)\n",
__FILE__, __FUNCTION__, __LINE__);
} else {
printf("\nUnable to restore size of spare memory buffer.\n");
}
}
// attempt a plain, old vanilla malloc() and test for failure
if(NULL != (rtn = malloc(requested_allocation_size))) {
return rtn;
} else {
sprintf(err_msg, "\nInitial call to malloc() failed in %s:%s at line %i",
__FILE__, __FUNCTION__, __LINE__);
if(remaining_spare_size < requested_allocation_size) {
// not enough spare storage to satisfy the request, so no point in trying
printf("%s\nRequested allocaton larger than remaining pool. :(\n\t --- ABORTING --- \n", err_msg);
return NULL;
} else {
// take the needed storage from spare memory
printf("%s\nRetrying memory allocation....\n", err_msg);
remaining_spare_size -= requested_allocation_size;
if(NULL != (gSpareMem = realloc(gSpareMem, remaining_spare_size))) {
// return malloc(requested_allocation_size);
if(NULL != (rtn = malloc(requested_allocation_size))) {
printf("Allocation from spare pool succeeded in %s:%s at line %i :)\n",
__FILE__, __FUNCTION__, __LINE__);
return rtn;
} else {
remaining_spare_size += requested_allocation_size;
sprintf(err_msg, "\nRetry of malloc() after realloc() of spare memory pool "
"failed in %s:%s at line %i :(\n", __FILE__, __FUNCTION__, __LINE__);
return NULL;
}
} else {
printf("\nRetry failed.\nUnable to allocate requested memory from spare pool. :(\n");
return NULL;
}
}
}
}
// ------------------------------------------------------------------------------------------------
int _tmain(int argc, _TCHAR* argv[]) {
int *IntVec = NULL;
double *DblVec = NULL;
char *pString = NULL;
char String[] = "Every good boy does fine!";
IntVec = (int *) tenacious_malloc(100 * sizeof(int));
DblVec = (double *) tenacious_malloc(100 * sizeof(double));
pString = (char *)tenacious_malloc(100 * sizeof(String));
strcpy(pString, String);
printf("\n%s", pString);
printf("\nHit Enter to end program.");
getchar();
return 0;
}
The best strategy is to aim for code that works without allocations. In particular, for a correct, robust program, all failure paths must be failure-case-free, which means you can't use allocation in failure paths.
My preference, whenever possible, is to avoid any allocations once an operation has started, instead determining the storage needed and allocating it all prior to the start of the operation. This can greatly simplify program logic and makes testing much easier (since there's a single point of possible failure you have to test). Of course it can also be more expensive in other ways; for example, you might have to make two passes over input data to determine how much storage you will need and then process it using the storage.
In regards to your solution of pre-allocating some emergency storage to use once malloc fails, there are basically two versions of this:
Simply calling free on the emergency storage then hoping malloc works again afterwards.
Going through your own wrapper layer for everything where the wrapper layer can directly use the emergency storage without ever freeing it.
The first approach has the advantage that even standard library and third-party library code can utilize the emergency space, but it has the disadvantage that the freed storage could be stolen by other processes, or threads in your own process, racing for it. If you're sure the memory exhaustion will come from exhausting virtual address space (or process resource limits) rather than system resources, and your process is single-threaded, you don't have to worry about the race, and you can fairly safely assume this approach will work. However, in general, the second approach is much safer, because you have an absolute guarantee that you can obtain the desired amount of emergency storage.
I don't really like either of these approaches, but they may be the best you can do.
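For completeness, a minimal sketch of the first version (the names and the 1 MiB reserve size are invented):

#include <stdio.h>
#include <stdlib.h>

static void *emergency_reserve;

// Call once at startup to set the reserve aside.
void reserve_init(void)
{
    emergency_reserve = malloc(1 << 20);
}

// malloc() replacement that releases the reserve and retries on failure.
void *reserving_malloc(size_t n)
{
    void *p = malloc(n);
    if (p == NULL && emergency_reserve != NULL) {
        free(emergency_reserve);     // give the reserve back to the allocator
        emergency_reserve = NULL;
        p = malloc(n);               // retry; other threads or processes may
                                     // still grab the freed memory first
    }
    return p;
}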
On a modern 64 bit computer, you can malloc significantly more memory than you have RAM. In practice, malloc doesn't fail. What happens in practice is that your application starts thrashing, and once you have say 4GB of RAM and your allocations exceed that, your performance will drop to zero because you are swapping like mad. Your performance goes down so much that you never get to the point where malloc can't return memory.