completely remove function call at runtime in C

Is it possible to completely remove a function call from C code at runtime and insert it back when needed?
I'm not sure whether an ELF binary can be modified at run time, so that no CPU cycles are wasted when the function is not needed.
I don't want to place an 'if' check before the function call to avoid calling the function.
For example, if the global flag g_flag == 1, then func1 should look like below:
void func1(int x)
{
    /* some processing */
    func2(y);
    /* some processing */
}
and if the global flag g_flag == 0, then func1 should look like below:
void func1(int x)
{
    /* some processing */
    /* some processing */
}

Don't optimize something that doesn't need it. Have you tried assessing the potential improvement in your performance?
Try setting g_flag to 1 and executing this:
if (g_flag == 1) { func2(y); }
Then try executing this:
func2(y);
Both a million times (or however many times you can run them in a reasonable time). I'm quite sure you'll notice there is virtually no difference between the two.
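For instance, a minimal sketch of that comparison; g_flag, func2 and the loop count here are placeholders standing in for the asker's real code, not taken from it:

#include <stdio.h>

volatile int g_flag = 1;    /* volatile so the check is not optimized away */
volatile long sink = 0;

static void func2(long y) { sink += y; }   /* stand-in for the real function */

int main(void)
{
    /* Variant 1: guarded call. Swap in the bare call for variant 2
       and compare the run times, e.g. with the "time" command. */
    for (long i = 0; i < 1000000L; i++) {
        if (g_flag == 1) { func2(i); }
    }
    printf("sum: %ld\n", sink);
    return 0;
}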
Plus, apart from that, I think what you want to do is impossible, because ELF is a binary (compiled) format.

What you could probably get away with doing instead would be something like this:
struct Something;
typedef struct Something Something;

int myFunction(Something * me, int i)
{
    // do a bunch of stuff
    return 42; // obviously the answer
}

int myFunctionDoNothing(Something * dummy1, int dummy2)
{
    return 0;
}

int (*function)(Something *, int) = myFunctionDoNothing;

// snip to actual use of function
int i;

function = myFunctionDoNothing;
for (i = 0; i < 100000; ++i) function(NULL, 5 * i); // does nothing

function = myFunction;
for (i = 0; i < 100000; ++i) function(NULL, 5 * i); // does something
WARNING
This might be a premature optimization. Depending on how your compiler treats this and how your CPU handles branching, you might actually lose performance this way compared to the naive way (stopping it inside the function with a flag).

On most desktop and server architectures branching is faster than indirect calls, since they do branch prediction and/or speculative execution. I have never heard of an architecture where indirect call is faster than a single branch. (Jump tables, for switch() statements, have more than one branch, and are therefore a different thing altogether.)
Consider the following microbenchmark I threw together. test.c:
/* test.c */
volatile long test_calls = 0L;
volatile long test_sum = 0L;

void test(long counter)
{
    test_calls++;
    test_sum += counter;
}
work.c:
/* work.c */
void test(long counter);

/* Work function, to be measured */
void test_work(long counter, int flag)
{
    if (flag)
        test(counter);
}

/* Dummy function, to measure call overhead */
void test_none(long counter __attribute__((unused)), int flag __attribute__((unused)))
{
    return;
}
and harness.c:
#define _POSIX_C_SOURCE 200809L
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* From test.c */
extern volatile long test_calls;
extern volatile long test_sum;

/* Dummy function, to measure call overhead */
void test_none(long counter, int flag);

/* Work function, to be measured */
void test_work(long counter, int flag);

/* Timing harness -- GCC x86; modify for other architectures */
struct timing {
    struct timespec wall_start;
    struct timespec wall_stop;
    uint64_t cpu_start;
    uint64_t cpu_stop;
};

static inline void start_timing(struct timing *const mark)
{
    clock_gettime(CLOCK_REALTIME, &(mark->wall_start));
    mark->cpu_start = __builtin_ia32_rdtsc();
}

static inline void stop_timing(struct timing *const mark)
{
    mark->cpu_stop = __builtin_ia32_rdtsc();
    clock_gettime(CLOCK_REALTIME, &(mark->wall_stop));
}

static inline double cpu_timing(const struct timing *const mark)
{
    return (double)(mark->cpu_stop - mark->cpu_start); /* Cycles */
}

static inline double wall_timing(const struct timing *const mark)
{
    return (double)(mark->wall_stop.tv_sec - mark->wall_start.tv_sec)
         + (double)(mark->wall_stop.tv_nsec - mark->wall_start.tv_nsec) / 1000000000.0;
}

static int cmpdouble(const void *aptr, const void *bptr)
{
    const double a = *(const double *)aptr;
    const double b = *(const double *)bptr;

    if (a < b)
        return -1;
    else if (a > b)
        return +1;
    else
        return 0;
}

void report(double *const wall, double *const cpu, const size_t count)
{
    printf("\tInitial call: %.0f cpu cycles, %.9f seconds real time\n", cpu[0], wall[0]);

    qsort(wall, count, sizeof (double), cmpdouble);
    qsort(cpu, count, sizeof (double), cmpdouble);

    printf("\tMinimum: %.0f cpu cycles, %.9f seconds real time\n", cpu[0], wall[0]);
    printf("\t5%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count/20], wall[count/20]);
    printf("\t25%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count/4], wall[count/4]);
    printf("\tMedian: %.0f cpu cycles, %.9f seconds real time\n", cpu[count/2], wall[count/2]);
    printf("\t75%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count-count/4-1], wall[count-count/4-1]);
    printf("\t95%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count-count/20-1], wall[count-count/20-1]);
    printf("\tMaximum: %.0f cpu cycles, %.9f seconds real time\n", cpu[count-1], wall[count-1]);
}

int main(int argc, char *argv[])
{
    struct timing measurement;
    double *wall_seconds = NULL;
    double *cpu_cycles = NULL;
    unsigned long count = 0UL;
    unsigned long i;
    int flag;
    char dummy;

    if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s COUNT FLAG\n", argv[0]);
        fprintf(stderr, "\n");
        return 1;
    }

    if (sscanf(argv[1], " %lu %c", &count, &dummy) != 1) {
        fprintf(stderr, "%s: Invalid COUNT.\n", argv[1]);
        return 1;
    }
    if (count < 1UL) {
        fprintf(stderr, "%s: COUNT is too small.\n", argv[1]);
        return 1;
    }
    if (!(unsigned long)(count + 1UL)) {
        fprintf(stderr, "%s: COUNT is too large.\n", argv[1]);
        return 1;
    }
    if (sscanf(argv[2], " %d %c", &flag, &dummy) != 1) {
        fprintf(stderr, "%s: Invalid FLAG.\n", argv[2]);
        return 1;
    }

    wall_seconds = malloc(sizeof (double) * (size_t)count);
    cpu_cycles = malloc(sizeof (double) * (size_t)count);
    if (!wall_seconds || !cpu_cycles) {
        free(cpu_cycles);
        free(wall_seconds);
        fprintf(stderr, "Cannot allocate enough memory. Try smaller COUNT.\n");
        return 1;
    }

    printf("Call and measurement overhead:\n");
    fflush(stdout);
    for (i = 0UL; i < count; i++) {
        start_timing(&measurement);
        test_none(i, flag);
        stop_timing(&measurement);
        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    report(wall_seconds, cpu_cycles, (size_t)count);
    printf("\n");

    printf("Measuring FLAG==0 calls: ");
    fflush(stdout);
    test_calls = 0L;
    test_sum = 0L;
    for (i = 0UL; i < count; i++) {
        start_timing(&measurement);
        test_work(i, 0);
        stop_timing(&measurement);
        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    printf("%ld calls, sum %ld.\n", test_calls, test_sum);
    report(wall_seconds, cpu_cycles, (size_t)count);
    printf("\n");

    printf("Measuring FLAG==%d calls:", flag);
    fflush(stdout);
    test_calls = 0L;
    test_sum = 0L;
    for (i = 0UL; i < count; i++) {
        start_timing(&measurement);
        test_work(i, flag);
        stop_timing(&measurement);
        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    printf("%ld calls, sum %ld.\n", test_calls, test_sum);
    report(wall_seconds, cpu_cycles, (size_t)count);
    printf("\n");

    printf("Measuring alternating FLAG calls: ");
    fflush(stdout);
    test_calls = 0L;
    test_sum = 0L;
    for (i = 0UL; i < count; i++) {
        start_timing(&measurement);
        test_work(i, i & 1);
        stop_timing(&measurement);
        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    printf("%ld calls, sum %ld.\n", test_calls, test_sum);
    report(wall_seconds, cpu_cycles, (size_t)count);
    printf("\n");

    free(cpu_cycles);
    free(wall_seconds);
    return 0;
}
Put the three files in an empty directory, then compile and build ./call-test:
rm -f *.o
gcc -W -Wall -O3 -fomit-frame-pointer -c harness.c
gcc -W -Wall -O3 -fomit-frame-pointer -c work.c
gcc -W -Wall -O3 -fomit-frame-pointer -c test.c
gcc harness.o work.o test.o -lrt -o call-test
On AMD Athlon II X4 640, using gcc-4.6.3 (Xubuntu 10.04), running
./call-test 1000000 1
tells me that the overhead is just 2 clock cycles (< 1ns) for the test alone (branch not taken), and just 4 clock cycles (just over a nanosecond) when calling the second function which increases test_calls and adds the counter to test_sum.
When omitting all optimizations (use -O0 and leave out -fomit-frame-pointer when compiling), the test alone costs about 3 clock cycles (3 cycles if branch not taken), and about 9 cycles if the branch is taken and the work is done to update the two extra variables.
(The two extra variables let you easily see that the harness does actually do all it should do; they're just an extra check. And I wanted to have some work in the second function, so the timing differences would be easier to spot.)
The above interpretation is only valid for the case when the code is already cached, i.e. run recently. If the code is run only rarely, it won't be in cache. However, then the test overhead matters even less. Caching effects -- for example, if "nearby" code has been run (you can see this for the call overhead measurement; the other test functions' code tends to get cached too!) -- are much larger anyway. (While the test harness does produce the initial call results separately, don't put too much faith in them, since it does not try to clear any caches in any way.)
My conclusion is that adding
if (flag)
debug_function_call();
to any normal code is perfectly fine: the overhead is literally negligible, practically irrelevant. As always, consider the overall algorithm instead. Any enhancement in the algorithm yields much bigger rewards than worrying about the code the compiler generates.
(Since I wrote the test code above at one sitting, there are likely some bugs and/or brainfarts in them. Check, and if you find any, let me know below so I can fix the code.)

Related

How to make "long tv_nsec" and "time_t tv_sec" compatible?

I am writing a wrapper function sleep_new() for clock_nanosleep() which should make thread suspension easier for me.
// POSIX.1-2017 is what the compiler is confined to.
#define _XOPEN_SOURCE 700

#include <stdint.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
// POSIX headers.
// Other headers
#include "sleep_new.h"

void sleep_new(long value, const char unit[3]){
    // Create a timespec structure and set its members.
    // The members are added together: to express "1.5 s" we set "t.tv_sec = 1" and "t.tv_nsec = 500000000".
    // ".tv_sec" and ".tv_nsec" each hold a different unit, not the whole value!
    struct timespec sleep_time;
    // Set flags, i.e. TIMER_ABSTIME, to 0 to use relative instead of absolute time.
    int flags = 0;
    // Choose the clock, i.e. CLOCK_MONOTONIC is the "clock_id" for the clock started at system start.
    int clock_id = CLOCK_MONOTONIC;
    // Set the timespec structure's members according to the chosen unit.
    if (!strcmp(unit, "s")) {
        sleep_time.tv_sec = value;
        sleep_time.tv_nsec = 0;
    }
    else if (!strcmp(unit, "ns")){
        sleep_time.tv_sec = 0;
        sleep_time.tv_nsec = value;
    }
    else if (!strcmp(unit, "us")){
        sleep_time.tv_sec = 0;
        sleep_time.tv_nsec = value * 1000;
    }
    else if (!strcmp(unit, "ms")){
        sleep_time.tv_sec = 0;
        sleep_time.tv_nsec = value * 1000000;
    }
    else{
        puts("Unit not supported - choose between: s, ms, us, ns\n");
        return; // Do not sleep on an uninitialized timespec.
    }
    // Because the last argument is NULL, the remaining time is not stored anywhere if the sleep is interrupted.
    clock_nanosleep(clock_id, flags, &sleep_time, NULL);
}

int main(int argc, char *argv[])
{
    // Counter (a uint8_t would wrap around before reaching 256 and loop forever).
    unsigned int i;
    for(i = 0; i < 256; i++){
        // Stdout is line buffered. This is why we either have to include `\n` at the end or fflush() it manually.
        // So uncomment one example, A or B.
        // A
        //printf("%d\n", i);
        // B
        printf("%d, ", i);
        fflush(stdout);
        sleep_new(1000, "ms");
    }
    return 0;
}
If I call this function with sleep_new(1, "s") or sleep_new(2, "s"), it works fine, because it sets sleep_time.tv_nsec = 0; and sleep_time.tv_sec = value;.
In any other scenario, e.g. sleep_new(1000, "ms"), something is wrong and the sleep is not applied. I debugged the application and the values are assigned to the timespec members just fine, but clock_nanosleep() simply ignores them.
I am using type long for the value because I read in POSIX here that the header time.h defines the timespec structure's members: tv_nsec, which needs long, and tv_sec, which uses time_t, which is in turn defined in the header sys/types.h like this:
time_t shall be an integer type.
So because long can also hold int values I expected this to work, but it doesn't. Does anyone have any suggestions?
tv_nsec is the number of nanoseconds and must stay below one second - 1000 * 1000000 nanoseconds is too much. That's a full second! tv_nsec should range from 0 to 999999999. The proper calculation could look like:
sleep_time.tv_sec = value / 1000;
sleep_time.tv_nsec = (value % 1000) * 1000000;
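The same normalization works for every unit, not just "ms". A minimal sketch (the helper name to_timespec is hypothetical, and overflow of very large values is not handled):

#include <string.h>
#include <time.h>

static struct timespec to_timespec(long value, const char *unit)
{
    long long ns = 0;   /* express everything in nanoseconds first */

    if (!strcmp(unit, "s"))
        ns = (long long)value * 1000000000LL;
    else if (!strcmp(unit, "ms"))
        ns = (long long)value * 1000000LL;
    else if (!strcmp(unit, "us"))
        ns = (long long)value * 1000LL;
    else if (!strcmp(unit, "ns"))
        ns = value;

    struct timespec t;
    t.tv_sec  = (time_t)(ns / 1000000000LL);   /* whole seconds */
    t.tv_nsec = (long)(ns % 1000000000LL);     /* remainder, always 0..999999999 */
    return t;
}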

problems utilizing small pauses in C code using nanosleep

I am a C beginner and trying this and that.
I want to display a string letter by letter with tiny pauses in between. So my idea was a small pause using sleep or usleep after displaying each char, but I read that using nanosleep in your own function makes more sense. So I put my little pauses in a function "msleep" to get millisecond pauses.
I output my string 3 times.
Once in main(), then in a do-while loop in a function (fancyOutput) char by char, and eventually in the same function with printf again to check if it was handed over correctly.
My problem: I expected that the middle output would work char by char, separated by 100/1000-second breaks, but what I experience is a long break before showing any char and then a fast output of lines two and three. It looks like the compiler "realized what I am planning to do and wants to modify the code to be more efficient." So all my pauses seem to be combined into one long break.
Maybe you remember the captions in the TV series "The X-Files" - something like that is what I want to produce.
For sure there are better and more sophisticated ways to achieve what I am trying to do, but I want to learn and understand what is going on. Can someone help me with that?
I am using Code::Blocks on a Debian-based distro with gcc.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int msleep(long tms);
void fancyOutput(char inputToOutput[]);

int msleep(long tms)
{
    struct timespec ts;
    int ret;

    if (tms < 0)
    {
        return -1;
    }

    ts.tv_sec = tms / 1000;
    ts.tv_nsec = (tms % 1000) * 1000000;

    do
    {
        // printf("sleeping for %d", ret);
        ret = nanosleep(&ts, &ts);
    }
    while (ret);

    return ret;
}

void fancyOutput(char inputToOutput[])
{
    int counter = 0;
    do
    {
        printf("%c", inputToOutput[counter]);
        msleep(100);
        ++counter;
    }
    while (!(inputToOutput[counter]=='\0'));
    printf("\n");
    printf("%s\n", inputToOutput); // only check, if string was properly handed over to function
}

char output[] = "This string shall appear char by char in the console.";

void main(void)
{
    printf("%s\n", output); // only check, if string was properly set and initialized
    fancyOutput(output); // here the function above is called to output the string char by char with tiny pauses between
}
You are getting a problem with buffering.
When you use printf with no \n (newline), C buffers the output in order to write it block by block (to optimize display speed).
So you need to either add a \n to your printf or flush stdout yourself.
Another solution would be to use stderr, which is unbuffered, but stderr is meant for errors, not regular output :)
You can also check setvbuf in order to change the buffering.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int msleep(long tms);
void fancyOutput(char inputToOutput[]);

int msleep(long tms)
{
    struct timespec ts;
    int ret;

    if (tms < 0)
    {
        return -1;
    }

    ts.tv_sec = tms / 1000;
    ts.tv_nsec = (tms % 1000) * 1000000;

    do
    {
        // printf("sleeping for %d", ret);
        ret = nanosleep(&ts, &ts);
    }
    while (ret);

    return ret;
}

void fancyOutput(char inputToOutput[])
{
    int counter = 0;
    do
    {
        printf("%c", inputToOutput[counter]);
        fflush(stdout);
        msleep(100);
        ++counter;
    }
    while (!(inputToOutput[counter]=='\0'));
    printf("\n");
    printf("%s\n", inputToOutput); // only check, if string was properly handed over to function
}

char output[] = "This string shall appear char by char in the console.";

void main(void)
{
    printf("%s\n", output); // only check, if string was properly set and initialized
    fancyOutput(output); // here the function above is called to output the string char by char with tiny pauses between
}
So, I tried the suggested solution and placed fflush(stdout); directly after the char output in the loop. It worked as intended.
Summarizing for those with similar problems (I guess this also happens with usleep and similar self-made functions):
As I understood it, printf "collects" data in stdout until it "sees" \n, which indicates the end of a line. Then printf "releases" stdout. So in my initial version it "kept" each single char in stdout, made a pause after each char, and finally released stdout in one fast output.
So fflush(stdout); after each char output empties stdout char by char.
Hope it can help others.
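As an alternative to calling fflush(stdout) after every character, the setvbuf approach mentioned in the answer can switch stdout to unbuffered once at startup. A minimal sketch, assuming an unbuffered stdout is acceptable for the whole program:

#include <stdio.h>

int main(void)
{
    /* _IONBF = unbuffered: every printf is written out immediately,
       so no per-character fflush is needed afterwards. */
    setvbuf(stdout, NULL, _IONBF, 0);

    printf("appears immediately, even without a trailing newline");
    return 0;
}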

Where to find the source code of timespec_get?

The C11 standard provides the function timespec_get. If I run the example code on cppreference, or on my computer, it works:
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec ts;
    timespec_get(&ts, TIME_UTC);
    char buff[100];
    strftime(buff, sizeof buff, "%D %T", gmtime(&ts.tv_sec));
    printf("Current time: %s.%09ld UTC\n", buff, ts.tv_nsec);
}
However, if I look at the sources of glibc here, the code is the following:
#include <time.h>

/* Set TS to calendar time based in time base BASE. */
int
timespec_get (struct timespec *ts, int base)
{
  switch (base)
    {
    case TIME_UTC:
      /* Not supported. */
      return 0;
    default:
      return 0;
    }
  return base;
}
stub_warning (timespec_get)
Which... should not work...
Which leads to the question: where is the source code of timespec_get that is actually called?
The timespec_get function's implementation depends on the system the library is running on, so it appears both as a stub in time/timespec_get.c (in case no implementation is available) and as various system-dependent implementations elsewhere.
You can see the Linux implementation in sysdeps/unix/sysv/linux/timespec_get.c,
/* Set TS to calendar time based in time base BASE. */
int
timespec_get (struct timespec *ts, int base)
{
  switch (base)
    {
      int res;
      INTERNAL_SYSCALL_DECL (err);
    case TIME_UTC:
      res = INTERNAL_VSYSCALL (clock_gettime, err, 2, CLOCK_REALTIME, ts);
      if (INTERNAL_SYSCALL_ERROR_P (res, err))
        return 0;
      break;
    default:
      return 0;
    }
  return base;
}
This is just a thin wrapper around a vDSO call, and the vDSO is part of the Linux kernel itself. If you are curious, look for the definition of clock_gettime there. It's unusual for clock_gettime to be in the vDSO; only a small number of syscalls are implemented this way.
Here is the x86 implementation for CLOCK_REALTIME, found in arch/x86/entry/vdso/vclock_gettime.c:
/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
notrace static int __always_inline do_realtime(struct timespec *ts)
{
	unsigned long seq;
	u64 ns;
	int mode;

	do {
		seq = gtod_read_begin(gtod);
		mode = gtod->vclock_mode;
		ts->tv_sec = gtod->wall_time_sec;
		ns = gtod->wall_time_snsec;
		ns += vgetsns(&mode);
		ns >>= gtod->shift;
	} while (unlikely(gtod_read_retry(gtod, seq)));

	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
	ts->tv_nsec = ns;

	return mode;
}
Basically, there is some memory in your process which is updated by the kernel, and some registers in your CPU which track the passage of time (or something provided by your hypervisor). The memory in your process is used to translate the value of these CPU registers into the wall clock time. You have to read these in a loop because they can change while you are reading them... the loop logic detects the case when you get a bad read, and tries again.
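As a rough illustration of that retry loop, here is a simplified sketch of the idea with hypothetical names; it omits the memory barriers a real implementation needs:

/* The writer increments 'seq' before and after updating the fields,
   so 'seq' is odd while an update is in progress. */
struct shared_time {
    unsigned seq;
    long wall_sec;
    long wall_nsec;
};

static void read_time(const volatile struct shared_time *shared,
                      long *sec, long *nsec)
{
    unsigned seq;
    do {
        seq = shared->seq;            /* snapshot the counter first */
        *sec = shared->wall_sec;
        *nsec = shared->wall_nsec;
    } while ((seq & 1) || seq != shared->seq);  /* retry on a torn read */
}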
The timespec_get definition you linked to is a stub (see the stub_warning). The actual implementation will be under sysdeps for your platform. For example, here is the version for sysv: https://github.com/lattera/glibc/blob/a2f34833b1042d5d8eeb263b4cf4caaea138c4ad/sysdeps/unix/sysv/linux/timespec_get.c
int
timespec_get (ts, base)
     struct timespec *ts;
     int base;
{
  switch (base)
    {
      int res;
      INTERNAL_SYSCALL_DECL (err);
    case TIME_UTC:
      res = INTERNAL_GETTIME (CLOCK_REALTIME, ts);
      if (INTERNAL_SYSCALL_ERROR_P (res, err))
        return 0;
      break;
    default:
      return 0;
    }
  return base;
}

WinApi C multithreading: how to wait for a thread to finish?

I'm writing a multithreaded program to calculate Fibonacci, Power and Factorial. Instead of using Sleep, I would like to wait for threads to finish, and I'd like to display ids of threads in the order they finish (first finished, first displayed). How should I do this?
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>

unsigned int n = 0;
int priorytety[3] = { THREAD_PRIORITY_BELOW_NORMAL, THREAD_PRIORITY_NORMAL, THREAD_PRIORITY_ABOVE_NORMAL };
HANDLE watki[3];

DWORD WINAPI Fibbonaci(void *argumenty){
    unsigned long long int prevPrev = 0;
    unsigned long long int prev = 1;
    unsigned long long int wynik = 1;

    while (wynik <= n){
        wynik = prev + prevPrev;
        prevPrev = prev;
        prev = wynik;
    }
    printf("fibbonaci : %llu \n", wynik);
    ExitThread(wynik);
    //return wynik;
}

DWORD WINAPI Potegi(void *argumenty){
    unsigned long long int wynik = 2;

    while (wynik <= n){
        wynik = wynik << 1;
    }
    printf("potegi : %llu \n", wynik);
    return wynik;
}

DWORD WINAPI Silnia(void *argumenty){
    //printf("%d", atoi(argv[argc-1]));
    unsigned long long int wynik = 1;
    unsigned long long int i = 1;

    while (wynik <= n){
        wynik = wynik * i;
        i = i + 1;
    }
    printf("silnia : %llu \n", wynik);
    return wynik;
}

int main(){
    int i;
    DWORD id;

    system("cls");
    scanf_s("%d", &n);

    LPTHREAD_START_ROUTINE WINAPI funkcje[3] = { Fibbonaci, Potegi, Silnia };

    for (i = 0; i < 3; i++)
    {
        watki[i] = CreateThread(
            NULL,        // security attributes
            10000,       // initial stack size
            funkcje[i],  // thread function
            (void *)n,   // data for the thread function
            0,           // creation flags
            &id);
        if (watki[i] != INVALID_HANDLE_VALUE)
        {
            //printf("Created a thread with identifier %x\n", id);
            // set the priority
            SetThreadPriority(watki[i], priorytety[1]);
        }
    }
    Sleep(10000);
    getchar();
}
@WhozCraig is correct that you should use WaitForMultipleObjects() to wait for all the threads to finish. Read this SO post for more information.
That, however, will not tell you the order in which they ended, only when all have completed. Adding code to each function to print its thread ID should do that (use GetCurrentThreadId()). For example:
printf("potegi : %llu, thread ID %ld \n", wynik, GetCurrentThreadId());
Now we must not forget that there is time between the printf statement and when the thread actually finishes. You are not doing any work there, but technically the thread is still active. In a multithreaded environment, you cannot predict how much time will elapse between the printf and when the thread truly terminates, no matter how little code appears to be there.
If this difference is important to you, then you would need to join each thread independently and see which one terminates first. You could repeatedly call WaitForSingleObject() on each thread handle with a zero timeout and detect which one terminates first. Yes, there is a slight race condition if the third thread finishes slightly before the second while you are checking on the first, and then you check the second thread and notice it has terminated. You'll miss the fact that the third finished first. And this polling technique alters the experiment by consuming a lot of CPU while it is waiting.
Personally, I think you are better off just recording the time (based on the system clock) when each thread finished computing its result, not when its thread terminated. Use GetTickCount() or QueryPerformanceCounter() to measure the time.
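A minimal sketch of replacing the Sleep(10000) with a proper wait; it reuses the watki handle array from the question, and error handling is omitted:

#include <windows.h>
#include <stdio.h>

extern HANDLE watki[3];   /* the thread handles created in main() */

static void wait_for_threads(void)
{
    /* Block until all three threads have terminated. */
    WaitForMultipleObjects(3, watki, TRUE, INFINITE);

    for (int i = 0; i < 3; i++) {
        DWORD exit_code = 0;
        if (GetExitCodeThread(watki[i], &exit_code))
            printf("thread %d finished with exit code %lu\n", i, (unsigned long)exit_code);
        CloseHandle(watki[i]);
    }
}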

Grand central dispatch is giving me a slower execution time

So, I am about 99% certain that I have implemented something wrong, but here's the deal.
I have been playing around with Grand Central Dispatch and put together an experiment calculating MD5 hashes. I am running a MacBook Air with an i5, so I have 4 cores available. This led me to believe that using Grand Central Dispatch to calculate the hashes would be approximately 4 times faster. But, for some reason, it appears to be slower.
Code below
Using GCD
#include <stdio.h>
#include <time.h>
#import <CommonCrypto/CommonDigest.h>
#import <dispatch/dispatch.h>

int main (int argc, const char * argv[])
{
    int i,j,k,l,a;
    int num_chars = 4, total;
    clock_t start, end;
    double elap;

    printf("Calculating hashes for %d chars\n", num_chars);
    total = num_chars ^ 64;
    printf("Calculating %d hashes\n", total);

    dispatch_queue_t queue = dispatch_get_global_queue(0,0);
    dispatch_queue_t main = dispatch_get_main_queue();
    dispatch_group_t group = dispatch_group_create();

    start = clock();
    printf("Starting calculation queue\n");

    for(i=0;i<62;i++) {
        for(j=0;j<62;j++) {
            for(k=0;k<62;k++) {
                for(l=0;l<62;l++) {
                    dispatch_group_async(group, queue, ^{
                        char *letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
                        char buffer[10];
                        char out[100];
                        unsigned char hash[16];
                        sprintf(buffer, "%c%c%c%c", letters[i], letters[j], letters[k], letters[l]);
                        CC_MD5(buffer, strlen(buffer), hash);
                    });
                }
            }
        }
    }

    printf("Finished calculation queue\n");
    dispatch_group_wait(group, DISPATCH_TIME_FOREVER);

    end = clock();
    elap = ((double) end - start) / CLOCKS_PER_SEC;
    printf("Time taken %2f\n", elap);
    return 0;
}
Compile and run...
gcc -o a.out main.c
./a.out
Calculating hashes for 4 chars
Calculating 68 hashes
Starting calculation queue
Finished calculation queue
Time taken 35.193133
Looking at Activity Monitor, I can see all 4 cores max out while the script is running.
Now, comment out the dispatching....
#include <stdio.h>
#include <time.h>
#import <CommonCrypto/CommonDigest.h>
#import <dispatch/dispatch.h>

int main (int argc, const char * argv[])
{
    int i,j,k,l,a;
    int num_chars = 4, total;
    clock_t start, end;
    double elap;

    printf("Calculating hashes for %d chars\n", num_chars);
    total = num_chars ^ 64;
    printf("Calculating %d hashes\n", total);

    dispatch_queue_t queue = dispatch_get_global_queue(0,0);
    dispatch_queue_t main = dispatch_get_main_queue();
    dispatch_group_t group = dispatch_group_create();

    start = clock();
    printf("Starting calculation queue\n");

    for(i=0;i<62;i++) {
        for(j=0;j<62;j++) {
            for(k=0;k<62;k++) {
                for(l=0;l<62;l++) {
                    //dispatch_group_async(group, queue, ^{
                        char *letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
                        char buffer[10];
                        char out[100];
                        unsigned char hash[16];
                        sprintf(buffer, "%c%c%c%c", letters[i], letters[j], letters[k], letters[l]);
                        CC_MD5(buffer, strlen(buffer), hash);
                    //});
                }
            }
        }
    }

    printf("Finished calculation queue\n");
    //dispatch_group_wait(group, DISPATCH_TIME_FOREVER);

    end = clock();
    elap = ((double) end - start) / CLOCKS_PER_SEC;
    printf("Time taken %2f\n", elap);
    return 0;
}
Compile and run
gcc -o b.out main.c
./b.out
Calculating hashes for 4 chars
Calculating 68 hashes
Starting calculation queue
Finished calculation queue
Time taken 7.511273
Looking at Activity Monitor, it only shows 1 core active while the script runs.
There's probably too little work being done in each dispatched block to make the overhead involved with dispatching worthwhile. I would try to increase the amount of work done in each dispatch. I can't say for sure whether this will help, but try:
Moving the dispatch up a few loops, perhaps wrapping the k or j loop inside the dispatched block instead, to get it to do more work.
Removing the calls to sprintf and strlen. In fact, the block could be simplified to:
static const char *letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
unsigned char hash[16];
char buffer[4] = { letters[i], letters[j], letters[k], letters[l] };
CC_MD5(buffer, sizeof buffer, hash);
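For example, a sketch of the first suggestion: dispatching one block per value of i so each block hashes 62*62*62 combinations. This reuses queue, group, and the loop variable i from the question's code; it is only an illustration of the coarser split, not a tuned implementation:

for (i = 0; i < 62; i++) {
    dispatch_group_async(group, queue, ^{
        // The block captures i by value; j, k, l are local to the block.
        const char *letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        unsigned char hash[16];
        for (int j = 0; j < 62; j++)
            for (int k = 0; k < 62; k++)
                for (int l = 0; l < 62; l++) {
                    char buffer[4] = { letters[i], letters[j], letters[k], letters[l] };
                    CC_MD5(buffer, sizeof buffer, hash);
                }
    });
}
dispatch_group_wait(group, DISPATCH_TIME_FOREVER);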
