I'm working on a program in C where the main objective is absolute speed - it's a code performance competition. There are more ways to speed up the program, however, the largest speedup potential is in I/O operations, specifically, saving to text file. The file is structured as follows: 3 integers of arbitrary digit count per line, separated by whitespaces. The integers are known beforehand, they just need to be converted to a string and written to the output buffer.
The integers only range from -1 to INT_MAX.
The buffer size varies (I set it) based on the data being written but most of the time, the written file size is in orders of 100s of megabytes to something over a gigabyte and buffer is between 4 and 8 MB. The main write loop is this:
int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
const size_t w_bufsize = get_bufsize(param);
void *buf = NULL;
posix_memalign(&buf, sysconf(_SC_PAGE_SIZE), w_bufsize)
posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
size_t num_written = 0;
size_t f_idx = 0;
for (int i = 0; i < num_ints; ++i) {
myStruct *str = main_struct->structs + i;
f_idx = fast_write_3_ints(buf, str->int1, str->int2, str->int3, f_idx);
if (f_idx + BYTES_PER_ROW > w_bufsize) {
write(fd, buf, f_idx) != f_idx
if (num_written)
posix_fadvise(fd, (num_written - 1) * w_bufsize, w_bufsize,
POSIX_FADV_DONTNEED);
f_idx = 0;
++num_written;
}
(Return value checking and frees/closes abbreviated for readability)
For converting the integers to text, I use this method:
https://kenny-peng.com/2021/05/28/printing_integers_fast.html
I further improved it by bypassing the temporary buffer and memcpy-ing the characters directly to the output buffer (10-15 % perf increase on my machine).
Here is abbreviated (where possible) version of my code
size_t fast_write_3_ints(char *out_buf, int num1, int num2, int num3,
size_t idx)
{
char *temp_ptr = NULL;
int n_digits = 0;
if (num1 < 0) {
out_buf[idx++] = '-';
num1 = -num1;
}
if (num1 < 10) {
out_buf[idx++] = num1 + '0';
} else {
idx += count_digits(num1);
temp_ptr = out_buf + idx;
for (; num1 >= 1000; num1 /= 1000) {
temp_ptr -= 3;
lookup_digits(temp_ptr, num1 % 1000, 3);
}
if (num1) {
num1 %= 1000;
n_digits = count_digits(num1);
lookup_digits(temp_ptr - n_digits, num1, n_digits);
}
}
out_buf[idx++] = ' ';
// write int 2 and 3 - abbreviated
out_buf[idx++] = '\n';
return idx;
}
static void lookup_digits(char *arr, int num, char write_size)
{
static const char table[3000] __attribute__((aligned(64))) =
"000001002003004005006007008009"
"010011012013014015016017018019"
"020021022023024025026027028029"
"030031032033034035036037038039"
"040041042043044045046047048049"
"050051052053054055056057058059"
"060061062063064065066067068069"
"070071072073074075076077078079"
"080081082083084085086087088089"
"090091092093094095096097098099"
"100101102103104105106107108109"
"110111112113114115116117118119"
"120121122123124125126127128129"
"130131132133134135136137138139"
"140141142143144145146147148149"
"150151152153154155156157158159"
"160161162163164165166167168169"
"170171172173174175176177178179"
"180181182183184185186187188189"
"190191192193194195196197198199"
"200201202203204205206207208209"
"210211212213214215216217218219"
"220221222223224225226227228229"
"230231232233234235236237238239"
"240241242243244245246247248249"
"250251252253254255256257258259"
"260261262263264265266267268269"
"270271272273274275276277278279"
"280281282283284285286287288289"
"290291292293294295296297298299"
"300301302303304305306307308309"
"310311312313314315316317318319"
"320321322323324325326327328329"
"330331332333334335336337338339"
"340341342343344345346347348349"
"350351352353354355356357358359"
"360361362363364365366367368369"
"370371372373374375376377378379"
"380381382383384385386387388389"
"390391392393394395396397398399"
"400401402403404405406407408409"
"410411412413414415416417418419"
"420421422423424425426427428429"
"430431432433434435436437438439"
"440441442443444445446447448449"
"450451452453454455456457458459"
"460461462463464465466467468469"
"470471472473474475476477478479"
"480481482483484485486487488489"
"490491492493494495496497498499"
"500501502503504505506507508509"
"510511512513514515516517518519"
"520521522523524525526527528529"
"530531532533534535536537538539"
"540541542543544545546547548549"
"550551552553554555556557558559"
"560561562563564565566567568569"
"570571572573574575576577578579"
"580581582583584585586587588589"
"590591592593594595596597598599"
"600601602603604605606607608609"
"610611612613614615616617618619"
"620621622623624625626627628629"
"630631632633634635636637638639"
"640641642643644645646647648649"
"650651652653654655656657658659"
"660661662663664665666667668669"
"670671672673674675676677678679"
"680681682683684685686687688689"
"690691692693694695696697698699"
"700701702703704705706707708709"
"710711712713714715716717718719"
"720721722723724725726727728729"
"730731732733734735736737738739"
"740741742743744745746747748749"
"750751752753754755756757758759"
"760761762763764765766767768769"
"770771772773774775776777778779"
"780781782783784785786787788789"
"790791792793794795796797798799"
"800801802803804805806807808809"
"810811812813814815816817818819"
"820821822823824825826827828829"
"830831832833834835836837838839"
"840841842843844845846847848849"
"850851852853854855856857858859"
"860861862863864865866867868869"
"870871872873874875876877878879"
"880881882883884885886887888889"
"890891892893894895896897898899"
"900901902903904905906907908909"
"910911912913914915916917918919"
"920921922923924925926927928929"
"930931932933934935936937938939"
"940941942943944945946947948949"
"950951952953954955956957958959"
"960961962963964965966967968969"
"970971972973974975976977978979"
"980981982983984985986987988989"
"990991992993994995996997998999";
memcpy(arr, table + 3 * num + 3 - write_size, write_size);
}
static int count_digits(int num)
{
if (num < 100000)
if (num < 1000)
if (num < 100)
if (num < 10)
return 1;
else
return 2;
else
return 3;
else if (num < 10000)
return 4;
else
return 5;
else if (num < 10000000)
if (num < 1000000)
return 6;
else
return 7;
else if (num < 100000000)
return 8;
else if (num < 1000000000)
return 9;
else
return 10;
}
This is the main production code right now. Below I describe what alternatives I tried and how it turned out.
I also have to note that my computer is a 14" Macbook Pro with the M1 Pro chip and very fast SSD, which makes IO operations totally negligible compared to the main computation. However, the evaluation server/machine is of very different specs (likely), and there, saving the file is by far the slowest bit. I also noted that some changes made it perform better on my machine but worse on the actual evaluator (likely cache size/memory speed dependent).
I also tried implementing lookup-free int-to-string processing as described here:
https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/
this did not improve performance by more than run-to-run variance on my machine.
I also tried extending the table to the 4*10000 numbers, but it improved performance on my machine by only 3-5 % and actually made it a little worse in the evaluation system (likely a lot slower CPU/memory).
Is there anything else I can optimize for? I am running out of ideas. The historically fastest version of the code saves to the file 18 % faster than my implementation.
A thread solving the exact some problem but with different functions that are (in my eyes) slower and perform a lot more ops? The fastest way to save graph to file in C
Or should I attempt to integrate the single large buffer routine into my algorithm and write in st_blksize sized buffers instead?
Thanks so much for any help or suggestions
EDIT: Function that determines output buffer size (consider param to be the amount of lines to be written)
size_t get_bufsize(int param)
{
size_t bufsize = 4096;
if (param >= 1000 && param < 10000)
bufsize <<= 4;
else if (param >= 10000 && param < 100000)
bufsize <<= 6;
else if (param >= 100000 && param < 1000000)
bufsize <<= 8;
else if (param >= 1000000 && param <= 5000000)
bufsize <<= 10;
else if (param > 5000000)
bufsize <<= 11;
// printf("Buffer size: %zu\n", bufsize);
return bufsize;
}
EDIT 2:
The integers only range from -1 to INT_MAX.
Here are some directions to try and improve you code efficiency:
if running on a legacy system, you should specify O_BINARY to ensure the write system call does not perform some system specific conversion.
when flushing the buffer to disk, you should try and only write a whole number of pages and shift the remaining chunk to the beginning of the buffer. Allocating a decent number of 4K pages plus some slack and writing the 4K pages is a better approach to allocating a huge number of pages and issuing partial writes.
Your function fast_write_3_ints has a redundant statement num1 %= 1000; as well as the if (num1) test. It and can be further simplified to improve speed on small values:
size_t fast_write_3_ints(char *out_buf, int num1, int num2, int num3,
size_t idx)
{
char *temp_ptr;
int n_digits;
if (num1 < 0) {
out_buf[idx++] = '-';
num1 = -num1;
}
if (num1 < 1000) {
if (num1 < 10) {
out_buf[idx++] = num1 + '0';
} else {
n_digits = 2 + (num1 >= 100);
lookup_digits(out_buf + idx, num1, n_digits));
idx += n_digits;
}
} else {
n_digits = count_digits(num1);
idx += n_digits;
temp_ptr = out_buf + idx;
while (n_digits > 3) {
int digits = num1 % 1000;
num1 /= 1000; // group division and modulo
temp_ptr -= 3;
lookup_digits(temp_ptr, digits, 3);
n_digits -= 3;
}
lookup_digits(temp_ptr - n_digits, num1, n_digits);
}
out_buf[idx++] = ' ';
// write int 2 and 3 - abbreviated
out_buf[idx++] = '\n';
return idx;
}
using branchless code for count_digits might get you some speed gains:
static int count_digits(int num) {
return 1 + (num > 9) + (num > 99) + (num > 999) +
(num > 9999) + (num > 99999) + (num > 999999) +
(num > 9999999) + (num > 99999999) + (num > 999999999);
}
int vs. int_fast32_t
Rather than int, consider int_fast32_t as potentially a 64-bit type may be faster.
Avoid interval tests with 2 values
A little improvement perhaps with a simplified if tree.
Also, favor testing large values first as more likely to match.
uint_fast32_t get_bufsize(int param) {
#define BLOCK ((uint_fast32_t) 4096)
if (param >= 5000000) {
return BLOCK << 11;
}
if (param >= 1000000) {
return BLOCK << 10;
}
if (param >= 100000) {
return BLOCK << 8;
}
if (param >= 10000) {
return BLOCK) << 6;
}
if (param >= 1000) {
return BLOCK << 4;
}
return BLOCK;
}
unsigned vs. int
I have never encounter using int faster than unsigned, yet using unsigned has some potential for faster code. Something to try. After if (num1 < 0) test, code could move to unsigned math and maybe see a marginal improvement.
I doubt any of these will dramatically improve, yet may nudge toward a faster code.
If you're trying to optimise to avoid unnecessarily executing code AND the only negative value is -1, change:
if (num1 < 0) {
out_buf[idx++] = '-';
num1 = -num1;
}
if (num1 < 10) {
out_buf[idx++] = num1 + '0';
} else {
to
if (num1 < 10) {
if (num1 < 0) num = 1, out_buf[idx++] = '-';
out_buf[idx++] = num1 + '0';
} else {
Further, it seems you try to handle the residual 1,2or3 digits in some special case. This is unnecessary.
The example code below "borrows" the branchless function from #chqrlie. It also computes double/triple digits instead of indexing into a LUT. Think about that LUT... Slice off the first 100 values into a second "two digit" function, trim the leading zeros, and stop performing arcane calculations on pointers and counts. (I'm not suggesting you use these functions. Too much arithmetic happening. You could use two distinct conversion functions... or not.) Finally, this example only deals with positive numbers and only translates one.
void lookup_2_digits( char *p, int n ) { // Use a LUT... I didn't for this example
p[1] = (char)(n % 10 + '0'); n /= 10;
p[0] = (char)(n + '0');
}
void lookup_3_digits( char *p, int n ) { // Use a LUT... I didn't for this example
p[2] = (char)(n % 10 + '0'); n /= 10;
p[1] = (char)(n % 10 + '0'); n /= 10;
p[0] = (char)(n + '0');
}
int count_digits(int n) {
return 1+ (n > 9) + (n > 99) + (n > 999)
+ (n > 9999) + (n > 99999) + (n > 999999)
+ (n > 9999999) + (n > 99999999) + (n > 999999999);
}
void doit( int num1 ) {
char out_buf[512] = {0};
int idx = 0;
idx += count_digits( num1 );
char *temp_ptr = out_buf + idx;
do {
if( num1 <= 99 ) {
if( num1 <= 9 )
/* Can deal with -1 here */
*--temp_ptr = num1 + '0';
else
lookup_2_digits( temp_ptr-2, num1 );
num1 = 0;
} else {
lookup_3_digits( temp_ptr -= 3, num1 % 1000 );
num1 /= 1000;
}
} while( num1 > 0 );
puts( out_buf );
}
int main( void ) {
doit( 2165536 );
return 0;
}
When the coef is 0, I used continue to not print, but only printTerm(a) comes out and the printTerm(b) part does not come out.
When I delete the (if & continue) statement, both printTerm(a) and printTerm(b) appear, so it seems that there is a problem here (if & continue) statement.
How can I solve this?
int main() {
a[0].coef = 2;
a[0].expon = 1000; // 2x^1000
a[1].coef = 1;
a[1].expon = 2; // x^2
a[2].coef = 1;
a[2].expon = 0; // 1
b[0].coef = 1;
b[0].expon = 4; // x^4
b[1].coef = 10;
b[1].expon = 3; // 10x^3
b[2].coef = 3;
b[2].expon = 2; // 3x^2
b[2].coef = 1;
b[2].expon = 0; // 1
printTerm(a);
printTerm(b);
return 0;
}
void printTerm(polynomial *p) {
int i=0;
printf("polynomial : ");
while(p[i].expon != -1) {
if(p[i].coef == 0) continue;
printf("%dx^%d", p[i].coef, p[i].expon);
i++;
if(p[i].expon != -1 && p[i].coef > 0) printf(" + ");
}
printf("\n");
}
Because you only increment i if p[i].coef is not equal to 0.
If p[i].coef == 0 it skips the increment part and function is stuck in infinite loop, always checking the same array item.
EDIT:
Way to fix this:
Instead of if(p[i].coef == 0) continue; use:
if (p[i].coef == 0)
{
i++;
continue;
}
This way while loop evaluetes next array item instead of being stuck on the same.
I have typed this simple code to calculate the number of prime numbers between 2 and 5,000,000.
The algorithm works fine and it outputs the correct answer, however when I try to use OpenMP to speedup the execution it outputs a different answer every time.
#include "time.h"
#include "stdio.h"
#include "omp.h"
int main()
{
clock_t start = clock();
int count = 1;
int x;
bool flag;
#pragma omp parallel for schedule(static,1) num_threads(2) shared(count) private(x,flag)
for (x = 3; x <= 5000000; x+=2)
{
flag = false;
if (x == 2 || x == 3)
count++;
else if (x % 2 == 0 || x % 3 == 0)
continue;
else
{
for (int i = 5; i * i <= x; i += 6)
{
if (x % i == 0 || x % (i + 2) == 0)
{
flag = true;
break;
}
}
if (!flag)
count++;
}
}
clock_t end = clock();
printf("The execution took %f ms\n", (double)end - start / CLOCKS_PER_SEC);
printf("%d\n", count);
}
The code doesn't work for any number of threads, dynamic or static scheduling or different chunk sizes.
I have tried messing with private and shared variables but it still didn't work and declaring x and flag inside the for loop didn't work either.
I am using Visual Studio 2019 and I have OpenMP support enabled.
What's the problem with my code ?
You have race conditions with your count variable where multiple threads can try to update it at the same time. The easy fix is to use an OpenMP reduction() clause to give each thread a private copy of the variable and have them all get added up properly:
#include <time.h>
#include <stdio.h>
#include <stdbool.h>
int main(void)
{
clock_t start = clock();
int count = 1;
#pragma omp parallel for schedule(static,1) num_threads(2) reduction(+:count)
for (int x = 3; x <= 5000000; x+=2)
{
bool flag = false;
if (x == 2 || x == 3)
count++;
else if (x % 2 == 0 || x % 3 == 0)
continue;
else
{
for (int i = 5; i * i <= x; i += 6)
{
if (x % i == 0 || x % (i + 2) == 0)
{
flag = true;
break;
}
}
if (!flag)
count++;
}
}
clock_t end = clock();
printf("The execution took %f ms\n", (double)end - start / CLOCKS_PER_SEC);
printf("%d\n", count);
}
This outputs 348513 (Verified as the right number through other software).
Also note cleaned up headers and moving some variable declarations around to avoid the need for a private() clause.
You could also make count an atomic int, but that's slower than using reduction() in my testing.
Just to add to the answer provided by #Shawn, besides solving the count race condition using the reduction OpenMP clause. You can also analyze if your code has load balancing issues, looking at the iterations of the loop that you are parallelizing it is clear that not all iterations have the same among of work. Since you are assigning work to threads in a static manner you might have one thread doing much more work than the other. Test around with the dynamic schedule to see if you notice any difference.
Besides that, you can significantly simplify your sequential code by removing all those conditional branchings that negatively affect the performance of your parallel version.
First you do not need (x == 2), since int x = 3;. You do not need (x == 3) either, just remove it and make count=2; (instead of count=1;) and int x = 5;, since the loop is incrementing in steps of 2 (i.e., x+=2). With this you can also remove this:
if (x == 2 || x == 3)
count++;
Now because the loop starts at 5, and has an incremental step of 2, you know that it will be iterating over odd numbers only, so we can remove also x % 2 == 0 . Now we only have an if( x % 3 == 0) continue; else{..}, which can be simplified into if(x % 3 != 0){..}.
You can rewrite the code also to remove that break:
#pragma omp parallel for schedule(static,1) num_threads(2) reduction(+:count)
for (int x = 5; x <= 5000000; x += 2) {
boolean flag = false;
if (x % 3 != 0) {
for (i = 5; !flag && i * i <= x; i += 6) {
flag = (x % i == 0 || x % (i + 2) == 0);
}
if (!flag) {
count++;
}
}
}
because you are using C/C++ you can even remove that if as well:
int count = 2;
#pragma omp parallel for schedule(static,1) num_threads(2) reduction(+:count)
for (int x = 5; x <= 5000000; x += 2) {
if (x % 3 != 0) {
int flag = 1;
for (int i = 5; flag && i * i <= x; i += 6) {
flag = x % i != 0 && x % (i + 2) != 0;
}
count += flag;
}
}
printf("%d\n", count);
IMO the code is more readable now, we could further improve it by given a good name to the variable flag.
any ideas why this is generating a floating point exception? Please ignore the bad coding. This is very rough and I am just trying to experiment different things on this assignment. Also, I trying to do this assignment with only the operations learned in class (I am awere there are others that would make this easier)
Thanks!
#include <stdio.h>
#include <cs50.h>
int main(void)
{ long c = get_long("What is your credit card number?\n");
long i = 10;
long j = 100;
long n = 0;
long m = 0;
long x = 2;
do
{ long a = (c % i - (c % i)/10)/(i/10);
n = a + n;
i = i*100;
}
while (c/i >= 0.1 || c/i == 0);
do
{ long b = (c%j - (c%j)/10)/(j/10);
m = b + m;
j = j*100;
}
while (c/j >= 0.1 || c/j == 0);
long sum = n + 2*m;
if (i > j)
{
x = i;
}
else
{
x = j;
}
if (sum % 10 == 0)
{ if((x == 10000000000000000) && ((c % (x/100) - c % (x/1000))/(x/100000) == (34)))
{
printf("Amercan Express\n");
}
else if((x == 100000000000000000) && ((c % (x/100) - c % (x/1000))/(x/100000) == (51)))
{
printf("MasterCard\n");
}
else if((x == 10000000000000) && ((c % (x/100) - c % (x/1000))/(x/100000) == 4))
{
printf("Visa\n");
}
else if((x == 10000000000000000) && ((c % (x/100) - c % (x/1000))/(x/100000) == 4))
{
printf("Visa\n");
}
else
{
printf("Invlid\n");
}
}
else
{
printf("Invlid\n");
}
}'''
This is the part that is the problem:
while (c/i >= 0.1 || c/i == 0);
in your first do while loop.
Your data types are long and so you are saying that as long as c/i is greater than 0.1 or == 0 then keep going. Unfortunately this will go on forever as a long will always be >= 0.1 or <= 0 because it cant hold decimal places.If you play around with debug50 by using debug50 ./filename you can place a red dot and step through. What you will see is the first do while loop will keep running increasing the value of i every cycle. Eventually the value of i is bigger than the memory limit for a long and so the code creates unexpected results.If you step through you find that it eventually assigns 0 to i and so you suffer a divide by 0 error.
You can fix this by changing data types or altering that while loop.
I hope that helps. Id suggest stepping through that do while loop using debug50 in the cs50 ide to see whats going on.
i am new to C and was trying some leetcode problems.
I worked it out in my IDE (VS Code), and it seems to work with every testcase.
It even works if i run it in leetcodes terminal, but it suddenly crashes when i submit it.
The problem was to find the median of 2 sorted arrays.
This is the error:
Runtime Error Message:
AddressSanitizer: heap-buffer-overflow on address 0x602000000190 at pc 0x0000004018b1 bp 0x7ffd5082b970 sp 0x7ffd5082b960
Last executed input:
[]
[1]
And this is my code:
double findMedianSortedArrays(int *nums1, int nums1Size, int *nums2, int nums2Size)
{
double median1;
double median2;
if (nums1Size % 2 == 0)
{
int middle = (nums1Size / 2);
median1 = (double)(nums1[middle] + nums1[middle - 1]) / 2;
}
else
{
if (nums1Size == 1)
{
median1 = (double)nums1[nums1Size - 1];
}
else
{
int middle = ((nums1Size - 1) / 2);
median1 = (double)nums1[middle];
}
}
if (nums2Size % 2 == 0)
{
int middle = (nums2Size / 2);
median2 = (double)(nums2[middle] + nums2[middle - 1]) / 2;
}
else
{
if (nums2Size == 1)
{
median2 = (double)nums2[nums2Size - 1];
}
else
{
int middle = ((nums2Size - 1) / 2);
median2 = (double)nums2[middle];
}
}
double totalmedian = (median1 + median2) / 2;
return totalmedian;
}
Im really lost and hope someone can help me.
Thank you in advance!