Using 100% cpu vs using maximum available threads - c

I made a Mandelbrot set visual presentation using POSIX thread library in C. The output is a .pgm file.
After adding threads, three cases show up:
With only 3-4 threads it runs slowly, but does not saturate my CPU.
With 20 threads it runs well, but uses 98-100% of the CPU.
With 40 threads it runs slowly, and only 25 threads are in use (as shown by the Windows Task Manager).
So, is it wiser to use fewer threads, to use just enough threads to reach 100% CPU usage, or to use the maximum number of threads offered by the OS?
I am using Intel Pentium 4 x86 (2GB RAM) machine and using Windows 7 Ultimate.
The Code for reference
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <pthread.h>
#define MAX_ITR 100
#define MAX_UNSTABLE 4
#define Xin -1.5
#define Yin 1.0
#define Xf 0.5
#define Yf -1.0
char * arr;
void * function(void * arg)
{
int width = *((int *)arg);
int height = 8000;
int count;
double x=Xin, y=Yin;
double xf = Xf, yf = Yf;
double xinc = (xf-x)/(double)8000;
double yinc = (yf-y)/(double)8000;
double rr,ii,zr=0,zi=0,mag=0;
int i,j;
for(i=0;i<height;i++,x=Xin + xinc*width,y+= yinc)
{
for(j=0+width;j<width+400;j++,x += xinc,count=-1,zr=0,zi=0)
{
mag=0;
while((++count < MAX_ITR) && (mag < MAX_UNSTABLE))
{
rr = zr;
ii = zi; // square of Z
zr = rr*rr - ii*ii;
zi = 2*rr*ii;
zr += x;
zi += y;
mag = pow(zr*zr + zi*zi,0.5);
}
arr[j + i * height] = 255 - (int)(count * 255/MAX_ITR);
}
}
printf("\nthread id >%d\n",width/400 + 1);
printf("xinc %f\nyinc %f\n",xinc,yinc);
printf("from=>(%11.9f, %11.9f)\nto=>(%11.9f, %11.9f)\n",xinc*width,Yin,2*(xinc*width),Yf);
pthread_exit(NULL);
return NULL;
}
/*
 * Allocates the 8000x8000 image, renders it with 20 worker threads (one
 * 400-column strip each) and writes the result to "out.pgm" as a binary PGM.
 *
 * Returns 0 on success, 1 on allocation, thread-creation or file errors.
 */
int main()
{
    int width = 8000, height = 8000,i;
    size_t npixels = (size_t)width * (size_t)height;
    pthread_t threadPoll[20];
    int set[20] = {0};
    int started = 0;

    arr = malloc(npixels);
    if(arr == NULL)
    {
        fprintf(stderr,"out of memory\n");
        return 1;
    }
    for(i=0;i<20;i++)
    {
        set[i] = 400*i;
        printf("%d ",set[i]);
    }
    printf("\n");
    for(i=0;i<20;i++)
    {
        if(pthread_create(&(threadPoll[i]), NULL, function, &(set[i])) != 0)
        {
            fprintf(stderr,"could not create thread %d\n",i);
            break;
        }
        started++;
    }
    /* Join every thread that was started, including thread 0, so the whole
       image is complete before it is written out. */
    for(i=0;i<started;i++)
    {
        pthread_join(threadPoll[i],NULL);
    }
    if(started < 20)
    {
        free(arr);
        return 1;
    }
    FILE * fp = fopen("out.pgm","wb");
    if(fp == NULL)
    {
        free(arr);
        return 1;
    }
    fprintf(fp,"P5\n%d %d\n255\n",width,height);
    if(fwrite(arr,1,npixels,fp) != npixels)
    {
        fprintf(stderr,"short write to out.pgm\n");
        fclose(fp);
        free(arr);
        return 1;
    }
    /* fclose flushes buffered data, so its result matters for a write stream. */
    if(fclose(fp) != 0)
    {
        free(arr);
        return 1;
    }
    printf("done!");
    free(arr);
    return 0;
}

Related

Solving a coupled differential equations system using time splitting

/******************************************************************************
Online C Compiler.
Code, Compile, Run and Debug C program online.
Write your code in this editor and press "Run" button to compile and execute it.
*******************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#define PI 3.141592
void read_input(double *D, double *L, int *nx, double *t_F);
/*
 * Integrates the coupled system
 *     du/dt = D d2u/dx2 + v
 *     dv/dt = D d2v/dx2 - u
 * on nx grid points with a three-substep time-splitting scheme
 * (half step A_1, full rotation step A_2, half step A_1) and prints
 * "time x u v" for every grid point after every time step.
 *
 * Parameters D, L, nx and t_F are read by read_input().
 * Returns 0 on success, 1 on invalid input or allocation failure.
 *
 * NOTE(review): the A_1 substeps below still contain the +v / -u coupling
 * terms in addition to the A_2 rotation; confirm against the intended
 * splitting whether the coupling should appear in both.
 */
int main(void) {
    /******************************/
    /* Declarations of parameters */
    /******************************/
    /* Number of grid points */
    int nx;
    /* Length of domain */
    double L;
    /* Equation coefficients */
    double D;
    /* Length of time to run simulation. */
    double t_F;
    /* Read in from file; */
    read_input(&D, &L, &nx, &t_F);
    /* nx is used as a divisor and as an allocation size. */
    if (nx <= 0) {
        printf("Number of grid points must be positive\n");
        return 1;
    }
    /* Grid spacing */
    double dx = L/nx;
    double invdx2 = 1.0/(dx*dx);
    /* Time step */
    double dt = 0.25/invdx2; // changed to 0.25/dx^2 to satisfy the stability condition
    /************************************************/
    /* Solution Storage at Current / Next time step */
    /************************************************/
    double *uc, *un, *vc, *vn;
    /* Time splitting solutions */
    double *uts1, *uts2, *vts1, *vts2;
    /* Derivative used in finite difference */
    double deriv;
    /* Allocate memory according to size of nx */
    uc = malloc(nx * sizeof(double));
    un = malloc(nx * sizeof(double));
    vc = malloc(nx * sizeof(double));
    vn = malloc(nx * sizeof(double));
    uts1 = malloc(nx * sizeof(double));
    uts2 = malloc(nx * sizeof(double));
    vts1 = malloc(nx * sizeof(double));
    vts2 = malloc(nx * sizeof(double));
    /* Check the allocation pointers */
    if (uc==NULL||un==NULL||vc==NULL||vn==NULL||uts1==NULL||
        uts2==NULL||vts1==NULL||vts2==NULL) {
        printf("Memory allocation failed\n");
        /* free(NULL) is a no-op, so release whatever did succeed. */
        free(uc); free(un);
        free(vc); free(vn);
        free(uts1); free(uts2);
        free(vts1); free(vts2);
        return 1;
    }
    int k;
    double x;
    /* Current time; must start at zero for the loop condition to be defined. */
    double ctime = 0.0;
    /* Initialise arrays */
    for(k = 0; k < nx; k++) {
        x = k*dx;
        uc[k] = 1.0 + sin(2.0*PI*x/L);
        vc[k] = 0.0;
        /* Set other arrays to 0 */
        uts1[k] = 0; uts2[k] = 0;
        vts1[k] = 0; vts2[k] = 0;
    }
    /* Rotation factors for time-splitting scheme (constant because dt is). */
    double cfac = cos(dt); //changed from 2*dt to dt
    double sfac = sin(dt);
    /* Loop over timesteps */
    while (ctime < t_F){
        /* First substep for diffusion equation, A_1 */
        for (k = 0; k < nx; k++) {
            /* Neighbour indices wrap around so k-1 and k+1 never leave the
               arrays. Assumes a periodic domain (dx = L/nx and the sine
               initial condition suggest it) - TODO confirm. */
            int km = (k + nx - 1) % nx;
            int kp = (k + 1) % nx;
            /* Diffusion at half time step. */
            deriv = (uc[km] + uc[kp] - 2*uc[k])*invdx2 ;
            uts1[k] = uc[k] + (D * deriv + vc[k])* 0.5*dt; //
            deriv = (vc[km] + vc[kp] - 2*vc[k])*invdx2;
            vts1[k] = vc[k] + (D * deriv - uc[k]) * 0.5*dt;
        }
        /* Second substep for decay/growth terms, A_2 */
        for (k = 0; k < nx; k++) {
            /* Apply rotation matrix to u and v, */
            uts2[k] = cfac*uts1[k] + sfac*vts1[k];
            vts2[k] = -sfac*uts1[k] + cfac*vts1[k];
        }
        /* Third substep for diffusion terms, A_1 */
        for (k = 0; k < nx; k++) {
            int km = (k + nx - 1) % nx;
            int kp = (k + 1) % nx;
            deriv = (uts2[km] + uts2[kp] - 2*uts2[k])*invdx2;
            un[k] = uts2[k] + (D * deriv + vts2[k]) * 0.5*dt;
            deriv = (vts2[km] + vts2[kp] - 2*vts2[k])*invdx2;
            vn[k] = vts2[k] + (D * deriv - uts2[k]) * 0.5*dt;
        }
        /* Copy next values at timestep to u, v arrays. */
        memcpy(uc,un, sizeof(double) * nx);
        memcpy(vc,vn, sizeof(double) * nx);
        /* Increment time. */
        ctime += dt;
        for (k = 0; k < nx; k++ ) {
            x = k*dx;
            printf("%g %g %g %g\n",ctime,x,uc[k],vc[k]);
        }
    }
    /* Free allocated memory */
    free(uc); free(un);
    free(vc); free(vn);
    free(uts1); free(uts2);
    free(vts1); free(vts2);
    return 0;
}
// The lines below don't contain any bugs! Don't modify them
/*
 * Reads the four simulation parameters from the text file "input.txt",
 * in the order D, L, nx, t_F, and stores them through the given pointers.
 * Prints a message and terminates the program with exit(1) if the file
 * cannot be opened or fewer than four values can be parsed.
 */
void read_input(double *D, double *L, int *nx, double *t_F) {
FILE *infile;
/* Abort if the parameter file cannot be opened. */
if(!(infile=fopen("input.txt","r"))) {
printf("Error opening file\n");
exit(1);
}
/* fscanf returns the number of items converted; all four are required. */
if(4!=fscanf(infile,"%lf %lf %d %lf",D,L,nx,t_F)) {
printf("Error reading parameters from file\n");
exit(1);
}
fclose(infile);
}
So this is the code. It is meant to solve the following differential equations:
du/dt - Dd^2u/dx^2 - v = 0
dv/dt - Dd^2v/dx^2 + u = 0
It splits the equations into two parts: the second x-derivative part (A1) and the decay part, which contains u and v (A2). It uses two half steps (0.5dt) for A1 and one full step of dt for A2. I know how to do time splitting, but I don't know whether I have done it correctly here.
This is for an assignment. I have fixed all the errors I could find and I am just trying to make the code work as intended. I have never had to solve something similar to this, so I am definitely very stuck right now. The solution converges, but I think it's wrong. Any ideas why? I am not looking for someone to outright tell me what I am doing wrong; just guide me in the right direction, if you know what I mean.
PS: When I compile the code with gcc I get a warning about double main(void). Why might that be?

Calculate the value of π from the infinite series in C

I am trying to make a C program that calculates the value of Pi from the infinite series, aka the Leibniz series, and displays it to the user. My problem is that I need to display a special message when the program first hits 3.14, and when it first hits 3.141. That special message should include the iteration of the loop in which the number became 3.14 and 3.141. I am not lazy, so I found a way to compute the infinite series, but I couldn't figure out the second part. What should I add to my code to make it display the special message?
#include <stdio.h>
/*
 * Prints the first 10000 partial sums of the Leibniz series
 * 4 - 4/3 + 4/5 - 4/7 + ... as "pi = <value>", one per line.
 */
int main(void) {
    double pi = 4;
    for (int step = 0; step < 10000; step++) {
        /* Odd denominators 3, 5, 7, ...; the leading 4/1 is the initial value. */
        const int den = 2 * step + 3;
        const double term = 4.0 / den;
        /* Even steps subtract, odd steps add. */
        pi = (step % 2 == 0) ? pi - term : pi + term;
        printf("pi = %lf\n", pi);
    }
}
Here's a possible solution:
#include<stdio.h>
#include <math.h>
/*
 * Sums 10000 terms of the Leibniz series and reports the first iteration
 * at which the partial sum truncates to 3.14, and then to 3.141.
 */
int
main (void)
{
  int prec = 0;     /* how many milestones have been reported so far */
  double pi = 4;

  for (int i = 0; i < 10000; i++)
    {
      /* (4 - 4/3 + 4/5 - 4/7 + 4/9 - ......) */
      const int den = 2 * i + 3;
      const double term = 4.0 / den;

      if (i % 2 != 0)
        pi += term;
      else
        pi -= term;

      /* Each milestone is printed once; prec gates the order. */
      if (prec < 1 && trunc (100 * pi) == 314)
        {
          printf ("Found 3.14 at iteration %d\n", i);
          prec++;
        }
      if (prec < 2 && (int)trunc (1000 * pi) == 3141)
        {
          printf ("Found 3.141 at iteration %d\n", i);
          prec++;
        }
    }
}
The output is:
pi = 2.666667
pi = 3.466667
pi = 2.895238
...
pi = 3.150140
pi = 3.133118
pi = 3.149996
Found 3.14 at iteration 117
...
pi = 3.141000
pi = 3.142185
pi = 3.141000
Found 3.141 at iteration 1686
...
Here is a version that compares the first n digits of a double cmp_n(). Variables use minimal scope. The variable oracle holds the truncated pi to n decimals. The values of oracle must be stored in ascending order. I tweaked the pi formula to be a bit more compact format.
#include <math.h>
#include <stdio.h>
int cmp_n(double d1, double d2, size_t n) {
return fabs(trunc(pow(10, n) * d1) - trunc(pow(10, n) * d2)) < 1.0;
}
/*
 * Prints 10000 partial sums of the Leibniz series with 15 decimals and
 * marks with '*' the first sum matching each oracle entry (3.14 to two
 * decimals, then 3.141 to three). Entries are checked in array order,
 * one at a time.
 */
int main() {
    double pi = 4;
    size_t o = 0;   /* index of the next oracle entry still to be matched */
    struct {
        double pi;  /* target value, truncated to n decimals */
        size_t n;   /* number of decimals to compare */
    } oracle[] = {
        { 3.14, 2 },
        { 3.141, 3 }
    };
    for (int i = 0; i < 10000; i++) {
        int den = i * 2 + 3;
        //(4 - 4/3 + 4/5 -4/7 + 4/9 -......)
        pi += ((i % 2) ? 4.0 : -4.0) / den;
        int special = 0;
        if(
            o < sizeof(oracle) / sizeof(*oracle) &&
            cmp_n(pi, oracle[o].pi, oracle[o].n)
        ) {
            special = 1;
            o++;
        }
        printf("pi = %.15f%2s\n", pi, special ? "*" : "");
    }
}
and the relevant data (with line numbers);
$ ./a.out | nl -v0 | grep '*'
117 pi = 3.149995866593470 *
1686 pi = 3.141000236580159 *
Note: you need to use the "%.15lf" format string; otherwise the printed value of pi is rounded. A double only gives you about 15 significant digits, and cmp_n() scales the number, so it may not work as expected as you get close to the precision supported by double.

Writing a wave generator with SDL

I've coded a simple sequencer in C with SDL 1.2 and SDL_mixer (to play .wav files). It works well and I want to add some audio synthesis to this program. I looked around and found this sine wave code using SDL2 (https://github.com/lundstroem/synth-samples-sdl2/blob/master/src/synth_samples_sdl2_2.c)
Here's how the sinewave is coded in the program:
/*
 * Fills data[0 .. wave_length-1] with one period of a sine wave scaled to
 * the signed 16-bit range. The table serves as the oscillator: notes are
 * produced by stepping through it at different phase intervals.
 */
static void build_sine_table(int16_t *data, int wave_length)
{
    const double step = (2.0f * pi) / (double)wave_length;
    double phase = 0;
    int idx = 0;

    while (idx < wave_length) {
        const int value = (int)(sin(phase) * INT16_MAX);
        data[idx] = (int16_t)value;
        phase += step;
        idx++;
    }
}
/*
 * Converts a note value to a frequency: chromatic_ratio raised to
 * (note - 57), multiplied by 440. The 57-halfnote offset aligns the
 * chosen note range with the correct pitch.
 */
static double get_pitch(double note) {
    const double ratio = pow(chromatic_ratio, note - 57);
    return ratio * 440;
}
/*
 * Audio callback: invoked whenever the audio buffer needs refilling.
 * The buffer is zeroed first; unless `quit` is set, it is then rendered
 * in fixed 64-sample chunks via write_samples(). The buffer is treated as
 * interleaved signed 16-bit samples.
 */
static void audio_callback(void *unused, Uint8 *byte_stream, int byte_stream_length) {
    /* Start from silence. */
    memset(byte_stream, 0, byte_stream_length);

    if(!quit) {
        /* View the byte buffer as 16-bit samples (two bytes each). */
        Sint16 *samples = (Sint16*)byte_stream;
        const int sample_count = byte_stream_length / 2;

        /* Fixed-size chunks keep rendering independent of the buffer size;
           a trailing partial chunk is left as silence. */
        const long chunk = 64;
        const int chunk_count = sample_count/chunk;

        long first = 0;
        for(long c = 0; c < chunk_count; c++) {
            write_samples(samples, first, first + chunk, chunk);
            first += chunk;
        }
    }
}
static void write_samples(int16_t *s_byteStream, long begin, long end, long length) {
if(note > 0) {
double d_sample_rate = sample_rate;
double d_table_length = table_length;
double d_note = note;
/*
get correct phase increment for note depending on sample rate and table length.
*/
double phase_increment = (get_pitch(d_note) / d_sample_rate) * d_table_length;
/*
loop through the buffer and write samples.
*/
for (int i = 0; i < length; i+=2) {
phase_double += phase_increment;
phase_int = (int)phase_double;
if(phase_double >= table_length) {
double diff = phase_double - table_length;
phase_double = diff;
phase_int = (int)diff;
}
if(phase_int < table_length && phase_int > -1) {
if(s_byteStream != NULL) {
int16_t sample = sine_wave_table[phase_int];
sample *= 0.6; // scale volume.
s_byteStream[i+begin] = sample; // left channel
s_byteStream[i+begin+1] = sample; // right channel
}
}
}
}
}
I don't understand how I could change the sine wave formula to generate other waveforms such as square, triangle, saw, etc.
EDIT:
Because I forgot to explain it, here's what I tried.
I followed the example I've seen on this video series(https://www.youtube.com/watch?v=tgamhuQnOkM). The source code of the method provided by the video is on github, and the wave generation code is looking like this:
// Converts a frequency in hertz to angular velocity (multiplies by 2*PI).
double w(double dHertz)
{
    const double dAngular = dHertz * 2.0 * PI;
    return dAngular;
}
// General purpose oscillator
// General purpose oscillator.
// Returns the sample at time dTime for a wave of frequency dHertz; nType
// selects the shape. Any unrecognised nType yields 0.0.
double osc(double dHertz, double dTime, int nType = OSC_SINE)
{
    switch (nType)
    {
    case OSC_SINE: // Sine wave between -1 and +1
        return sin(w(dHertz) * dTime);
    case OSC_SQUARE: // Square wave between -1 and +1
        return sin(w(dHertz) * dTime) > 0 ? 1.0 : -1.0;
    case OSC_TRIANGLE: // Triangle wave between -1 and +1
        return asin(sin(w(dHertz) * dTime)) * (2.0 / PI);
    default: // Unknown type: return a defined value instead of falling off the end
        return 0.0;
    }
}
Because the C++ code here uses the Windows sound API, I could not copy/paste this method to make it work with the piece of code I found that uses SDL2.
So I tried to this in order to obtain a square wave:
/*
 * Attempted square-wave table builder.
 * Fills data[0 .. wave_length-1] with sin(current_phase) * INT16_MAX.
 */
static void build_sine_table(int16_t *data, int wave_length)
{
/* NOTE: the `> 0 ? 1.0 : -1.0` test is applied to the phase step rather than
   to the sample. For a positive wave_length the step 2*pi/wave_length is
   always positive, so phase_increment is simply 1.0 (radians per entry). */
double phase_increment = ((2.0f * pi) / (double)wave_length) > 0 ? 1.0 : -1.0;
double current_phase = 0;
for(int i = 0; i < wave_length; i++) {
/* The stored sample is still a plain sine value, not a +/- constant. */
int sample = (int)(sin(current_phase) * INT16_MAX);
data[i] = (int16_t)sample;
current_phase += phase_increment;
}
}
This didn't give me a square wave; it was more like a saw wave.
Here's what I tried to get a triangle wave:
/*
 * Attempted triangle-wave table builder.
 * Steps current_phase through one period in wave_length increments and
 * stores one value per step in data[].
 */
static void build_sine_table(int16_t *data, int wave_length)
{
double phase_increment = (2.0f * pi) / (double)wave_length;
double current_phase = 0;
for(int i = 0; i < wave_length; i++) {
/* NOTE: INT16_MAX is multiplied inside the asin() argument, so asin receives
   values far outside its [-1, 1] domain for most phases. The result is also
   cast to int before the (2 / pi) factor is applied, and nothing scales the
   final value back up to the 16-bit range. */
int sample = (int)(asin(sin(current_phase) * INT16_MAX)) * (2 / pi);
data[i] = (int16_t)sample;
current_phase += phase_increment;
}
}
This also gave me another type of waveform, not triangle.
You’d replace the sin function call with call to one of the following:
// this is a helper function only
double normalize(double phase)
{
double cycles = phase/(2.0*M_PI);
phase -= trunc(cycles) * 2.0 * M_PI;
if (phase < 0) phase += 2.0*M_PI;
return phase;
}
double square(double phase)
{ return (normalize(phase) < M_PI) ? 1.0 : -1.0; }
double sawtooth(double phase)
{ return -1.0 + normalize(phase) / M_PI; }
double triangle(double phase)
{
phase = normalize(phase);
if (phase >= M_PI)
phase = 2*M_PI - phase;
return -1.0 + 2.0 * phase / M_PI;
}
You’d be building tables just like you did for the sine, except they’d be the square, sawtooth and triangle tables, respectively.

libx11 can't take screenshot on Fedora OS C

I am not a specialist in C programming and Linux OS development, but I have a task about making screenshots on Ubuntu and Fedora OS. After searching on the internet I found a lot of topics and questions about how to do it using C language and libX11. Finally, I combined all I could find in one method which captures the screenshot and saves to .png file.
I have two virtual machines installed - one is Ubuntu 18.04, the second is Fedora 30. When I run my code on Ubuntu - it works perfectly, when I run it on Fedora - I have a screenshot file with black content.
My code is:
#include <X11/Xlib.h>
#include <X11/Xutil.h>
#include <X11/extensions/XShm.h>
#include <stdio.h>
#include <inttypes.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <cairo.h>
#include <cairo-xlib.h>
#include <stdlib.h>
/*
 * Returns the position of the lowest set bit in mask (the number of
 * right shifts needed to bring it to bit 0), or 0 when mask is 0.
 */
int get_shift (int mask) {
    int shift;
    for (shift = 0; mask != 0 && (mask & 1) == 0; mask >>= 1) {
        shift++;
    }
    return shift;
}
void takeScreenshot() {
Display *d;
int s;
XImage *image;
XShmSegmentInfo shminfo;
d = XOpenDisplay(NULL);
s = DefaultScreen(d);
unsigned int width = DisplayWidth(d,s);
unsigned int height = DisplayHeight(d,s);
image = XShmCreateImage(d,
DefaultVisual(d,s), // Use a correct visual. Omitted for brevity
24, // Determine correct depth from the visual. Omitted for brevity
ZPixmap, NULL, &shminfo, width, height);
shminfo.shmid = shmget(IPC_PRIVATE,
image->bytes_per_line * image->height,
IPC_CREAT|0777);
shminfo.shmaddr = image->data = shmat(shminfo.shmid, 0, 0);
shminfo.readOnly = False;
XShmAttach(d, &shminfo);
XShmGetImage(d,
RootWindow(d,s),
image,
0,
0,
AllPlanes);
cairo_surface_t *surface;
int stride;
stride = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, width);
unsigned char *data = malloc(stride * height);
int redShift = get_shift(image->red_mask);
int greenShift = get_shift(image->green_mask);
int blueShift = get_shift(image->blue_mask);
printf("r_shift: %d; g_shift: %d; b_shift: %d\n",redShift, greenShift, blueShift);
printf("byte order: %d\n", image->byte_order);
printf("bytes per line: %d\n", image->bytes_per_line);
printf("bites per pixel: %d\n", image->bits_per_pixel);
printf("r_mask: %lu; g_mask: %lu; b_mask: %lu\n", image->red_mask, image->green_mask, image->blue_mask);
printf("bitmap_bit_order: %d bitmap_pad: %d format: %d xoffset: %d\n", image->bitmap_bit_order, image->bitmap_pad, image->format, image->xoffset);
int x, y;
for (y = 0; y < height; ++y){
for (x = 0; x < width; ++x) {
unsigned long pixel = XGetPixel(image, x, y);
unsigned char red = (image->red_mask & pixel)>>redShift;
unsigned char green = (image->green_mask & pixel)>>greenShift;
unsigned char blue = (image->blue_mask & pixel)>>blueShift;
data[y * stride + x * 4 + 0] = blue;
data[y * stride + x * 4 + 1] = green;
data[y * stride + x * 4 + 2] = red;
}
}
surface = cairo_image_surface_create_for_data(
data,
CAIRO_FORMAT_RGB24,
width, height,
stride);
cairo_status_t surfaceStatus = cairo_surface_status(surface);
const char *r = cairo_status_to_string (surfaceStatus);
printf("%s\n", &r[0]);
int writepngRes = cairo_surface_write_to_png(
surface,
"test.png");
printf("surf status: %d; write result: %d\n", surfaceStatus, writepngRes);
cairo_surface_destroy(surface);
}
/* Program entry point: takes one screenshot and exits with status 0.
   Command-line arguments are ignored. */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;
    takeScreenshot();
    return 0;
}
And I build this code using following command:gcc code.c -o code.so -lXss -lX11 -lXext -fPIC -I/usr/include/cairo -lcairo
The setup is exactly the same on both machines, what I have checked is that bit mask, byte order and bytes per pixel are equal for both systems. I am asking for suggestions about how to find a bug reason, maybe advice which thing to debug. Thank you!
UPDATE:
When I run this code on both platforms I see exactly the same output:
r_shift: 16; g_shift: 8; b_shift: 0
byte order: 0
bytes per line: 5464
bites per pixel: 32
r_mask: 16711680; g_mask: 65280; b_mask: 255
bitmap_bit_order: 0 bitmap_pad: 32 format: 2 xoffset: 0
no error has occurred
surf status: 0; write result: 0

C: Accessing lookup tables faster?

I have a piece of code that traces 4 sines at a time.
My original code was making roughly 12000 sin() function calls per frame and was running at 30 fps.
I tried optimizing it by generating lookup tables. I ended up with 16 different lookup tables. I declared and load them in a separate header file at the top of my program. Each table is declared like so:
static const float d4_lookup[800] {...};
Now, with this new method I actually lost fps?! I'm running at 20 fps now instead of 30. Each frame now only has to do 8 sin / cos calls and 19200 lookup calls vs 12000 sin() calls.
I compile using gcc with -O3 flag on. At the moment, the lookup tables are included at the top and are part of the global scope of the program.
I assume I'm not loading them in the right memory or something to that effect. How can I speed up the lookup time?
** EDIT 1 **
As requested, here's the function that uses the lookup calls, it is called once per frame:
/*
 * Recomputes all four traced sines for the current frame.
 * Samples the monotonic clock, evaluates one sin/cos pair per curve for
 * the time-dependent phase, then combines them with the precomputed
 * lookup tables for each of the 800 sample positions.
 */
void
update_sines(void)
{
    static float c1_sin, c1_cos;
    static float c2_sin, c2_cos;
    static float c3_sin, c3_cos;
    static float c4_sin, c4_cos;

    /* Build the elapsed-time value from the seconds and scaled nanoseconds. */
    clock_gettime(CLOCK_MONOTONIC, &spec);
    s = spec.tv_sec;
    ms = spec.tv_nsec * 0.0000001;
    etime = concatenate((long)s, ms);

    /* One sin/cos pair per curve; each curve has its own time factor. */
    c1_sin = sinf(etime * 0.00525);
    c1_cos = cosf(etime * 0.00525);

    c2_sin = sinf(etime * 0.007326);
    c2_cos = cosf(etime * 0.007326);

    c3_sin = sinf(etime * 0.0046);
    c3_cos = cosf(etime * 0.0046);

    c4_sin = sinf(etime * 0.007992);
    c4_cos = cosf(etime * 0.007992);

    /* a * (sin_table * cos(t) + sin(t) * cos_table) + d, per sample;
       curves 2 and 4 carry an extra offset of 50. */
    for (int k = 0; k < 800; ++k)
    {
        sine1[k] = a1_lookup[k] * ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k])) + d1_lookup[k];
        sine2[k] = a2_lookup[k] * ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k])) + d2_lookup[k] + 50;
        sine3[k] = a3_lookup[k] * ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k])) + d3_lookup[k];
        sine4[k] = a4_lookup[k] * ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k])) + d4_lookup[k] + 50;
    }
}
** UPDATE **
For anyone reading this thread, I gave up on this problem. I tried using OpenCL kernels, structs, SIMD instructions as well as all the solutions shown here. In the end the original code that computed the sinf() 12800 per frame worked faster than the lookup tables since the lookup tables didn't fit into the cache. Yet it was still only doing 30 fps. It just had too much going on to keep up with my 60fps expectations. I've decided to take a different direction. Thanks to everyone who contributed to this thread. Most of these solutions would probably work to get some half decent speed improvements but nothing like the 200% speed up I needed here to have the lookup tables work the way I wanted.
Sometimes it's hard to know what's slowing you down, but potentially you are going to ruin your cache hits, you could try a lookup of a struct
typedef struct
{
float bx1_sin;
float bx2_sin;
float bx3_sin;
float bx4_sin;
float bx1_cos;
etc etc
including sine1,2,3,4 as well
} lookup_table
then
lookup_table lookup[800]
now everything at the kth lookup will be in the same small chunk of memory.
Also, if you use a macro (or an inline function) that takes k as a parameter and performs the body of the loop, say SINE_CALC(k), ...
you can do
for (k = 0; k < 800; ++k)
{
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
}
if you do a macro, make sure the k++ is outside the macro call like shown
Try splitting your loop into several smaller loops (loop fission), like this:
/* Pass 1: seed each output array with its amplitude table. */
for (k = 0; k < 800; ++k)
{
sine1[k] = a1_lookup[k];
sine2[k] = a2_lookup[k];
sine3[k] = a3_lookup[k];
sine4[k] = a4_lookup[k];
}
/* Pass 2: multiply by the combined sin/cos term for the current frame. */
for (k = 0; k < 800; ++k)
{
sine1[k] *= ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k]));
sine2[k] *= ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k]));
sine3[k] *= ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k]));
sine4[k] *= ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k]));
}
/* Pass 3: add the per-sample offsets (plus 50 for curves 2 and 4). */
for (k = 0; k < 800; ++k)
{
sine1[k] += d1_lookup[k];
sine2[k] += d2_lookup[k] + 50;
sine3[k] += d3_lookup[k];
sine4[k] += d4_lookup[k] + 50;
}
By accessing fewer lookup tables in each loop, you should be able to stay in the cache. The middle loop could be split up as well, but you'll need to create an intermediate table for one of the sub-expressions.
Intel processors can predict serial access (and perform prefetch) for up to 4 arrays both for forward and backward traverse. At least this was true in Core 2 Duo days. Split your for in:
/* One loop per output curve: each pass reads four lookup tables and writes
   one output array. */
for (k = 0; k < 800; ++k)
sine1[k] = a1_lookup[k] * ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k])) + d1_lookup[k];
for (k = 0; k < 800; ++k)
sine2[k] = a2_lookup[k] * ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k])) + d2_lookup[k] + 50;
for (k = 0; k < 800; ++k)
sine3[k] = a3_lookup[k] * ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k])) + d3_lookup[k];
for (k = 0; k < 800; ++k)
sine4[k] = a4_lookup[k] * ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k])) + d4_lookup[k] + 50;
I guess you have more cache load than the benchmarks in other answers, so this does matter. I recommend that you do not unroll loops by hand; compilers do it well.
Using a simple sin lookup table yields a >20% speed increase on my Linux machine (VM, gcc, 64-bit). Interestingly, the size of the lookup table (within reasonable values, i.e. smaller than the L1 cache) does not influence the speed of execution.
Using a fastsin simple implementation from here I got >45% improvement.
Code:
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#define LOOKUP_SIZE 628
/*
 * Returns the current wall-clock time in microseconds, shifted by the
 * local timezone's offset from UTC (tm_gmtoff of the current local time).
 */
uint64_t currentTimestampUs( void )
{
    /* Timezone offset of "now", converted from seconds to microseconds. */
    struct tm when;
    time_t now = time(NULL);
    localtime_r ( &now, &when );
    int64_t localeOffset = when.tm_gmtoff * 1000000ll;

    /* Seconds plus microseconds since the epoch. */
    struct timeval tv;
    gettimeofday ( &tv, NULL );
    uint64_t timestamp = ((uint64_t)((tv.tv_sec) * 1000000ll) ) + ( (uint64_t)(tv.tv_usec) );

    timestamp += localeOffset;
    return timestamp;
}
const double PI = 3.141592653589793238462;
const double PI2 = 3.141592653589793238462 * 2;
static float sinarr[LOOKUP_SIZE];
/*
 * Fills sinarr with LOOKUP_SIZE samples of sin() over the first quarter
 * period: entry a holds sin((a / LOOKUP_SIZE) * PI/2).
 */
void initSinArr() {
    for (int slot = 0; slot < LOOKUP_SIZE; slot++) {
        /* Evaluate in double precision and narrow only when storing. */
        const double angle = (1.0*slot/LOOKUP_SIZE)*((double)PI * 0.5);
        sinarr[slot] = (float)sin(angle);
    }
}
/*
 * Approximates sin(val) using the quarter-wave table sinarr.
 * val is first reduced to [0, 2*PI], mapped to a table position in
 * [0, 4*LOOKUP_SIZE], and then folded into the first quadrant with the
 * appropriate sign.
 */
float sinlookup(float val) {
    float normval = val;
    while (normval < 0) {
        normval += PI2;
    }
    while (normval > PI2) {
        normval -= PI2;
    }
    int index = LOOKUP_SIZE*(2*normval/PI);
    float sign;
    if (index > 3*LOOKUP_SIZE) {
        /* Fourth quadrant: mirrored and negated. */
        index = 4*LOOKUP_SIZE - index;
        sign = -1.0f;
    } else if (index > 2*LOOKUP_SIZE) {
        /* Third quadrant: shifted and negated. */
        index = index - 2*LOOKUP_SIZE;
        sign = -1.0f;
    } else if (index > LOOKUP_SIZE) {
        /* Second quadrant: mirrored. */
        index = 2*LOOKUP_SIZE - index;
        sign = 1.0f;
    } else {
        sign = 1.0f;
    }
    /* The table holds entries 0 .. LOOKUP_SIZE-1 only. A folded index equal
       to LOOKUP_SIZE is the quarter-period peak, where sin is exactly 1, so
       return that instead of reading one past the end of sinarr. */
    if (index >= LOOKUP_SIZE) {
        return sign;
    }
    return sign * sinarr[index];
}
/*
 * Fast sine approximation: wraps x into [-PI, PI] and evaluates a
 * quadratic whose second-order term changes sign with x.
 */
float sin_fast(float x) {
    /* Range reduction by whole periods. */
    for (; x < -PI; x += PI2) {
    }
    for (; x > PI; x -= PI2) {
    }
    //compute sine
    return (x < 0)
        ? 1.27323954 * x + .405284735 * x * x
        : 1.27323954 * x - 0.405284735 * x * x;
}
/*
 * Micro-benchmark: times num_tries sweeps over [0, 2*PI) in steps of 0.01
 * for sinlookup(), the library sin() and sin_fast(), then prints each
 * elapsed time in milliseconds and the saving relative to sin().
 */
int main(void) {
    initSinArr();
    int a = 0;
    float val = 0;
    const int num_tries = 100000;
    uint64_t startLookup = currentTimestampUs();
    for (a=0; a<num_tries; a++) {
        for (val=0; val<PI2; val+=0.01) {
            float compval = sinlookup(val);
            (void)compval;
        }
    }
    uint64_t startSin = currentTimestampUs();
    for (a=0; a<num_tries; a++) {
        for (val=0; val<PI2; val+=0.01) {
            float compval = sin(val);
            (void)compval;
        }
    }
    uint64_t startFastSin = currentTimestampUs();
    for (a=0; a<num_tries; a++) {
        for (val=0; val<PI2; val+=0.01) {
            float compval = sin_fast(val);
            (void)compval;
        }
    }
    uint64_t end = currentTimestampUs();
    /* long long matches %lld on every platform; int64_t need not. */
    long long lookupMs = (long long)((startSin - startLookup)/1000);
    long long sinMs = (long long)((startFastSin - startSin)/1000);
    long long fastSinMs = (long long)((end - startFastSin)/1000);
    /* Guard the percentage against a zero baseline (e.g. if the compiler
       removes the sin() loop entirely). */
    long long lookupPct = sinMs != 0 ? 100*(sinMs-lookupMs)/sinMs : 0;
    long long fastSinPct = sinMs != 0 ? 100*(sinMs-fastSinMs)/sinMs : 0;
    /* A literal percent sign must be written as %% in a printf format. */
    printf(" lookup: %lld ms\n", lookupMs );
    printf(" sin: %lld ms\n", sinMs );
    printf(" diff: %lld ms\n", sinMs-lookupMs);
    printf(" diff%%: %lld %%\n", lookupPct);
    printf("fastsin: %lld ms\n", fastSinMs );
    printf(" sin: %lld ms\n", sinMs );
    printf(" diff: %lld ms\n", sinMs-fastSinMs);
    printf(" diff%%: %lld %%\n", fastSinPct);
}
Sample result:
lookup: 2276 ms
sin: 3004 ms
diff: 728 ms
diff%: 24 %
fastsin: 1500 ms
sin: 3004 ms
diff: 1504 ms
diff%: 50 %

Resources