How to speed up printf in C - c

I have a task to print all the prime numbers between 1 and 1000000 in class and the fastest 10 programs get extra marks. The main problem is the time it takes for the prime numbers to be printed to the console.
Basically using the Sieve of Eratosthenes I produce an array with only boolean values in it. The boolean value Numbers[i] is true if i+2 is a prime number.
for(i = 0; i <= n - 2; ++i)
if (Numbers[i]) // True if the number is prime
printf("%d\n", i+2);
printf seems to be really slow: the program can generate the list of primes in about 0.035 s, but then takes a further 11 seconds to print the list. Is there any way I can speed this up? Thanks.

Beneath is a slightly unoptimized implementation (although I skipped the intermediate list and print directly) of what I think you were supposed to do. Running that program on an AMD A8-6600K with a small load (mainly a Youtube music-video for some personal entertainment) results in
real 0m1.211s
user 0m0.047s
sys 0m0.122s
averaged over a couple of runs. So the problem lies in your implementation of the sieve or you are hiding some essential facts about your hardware.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>
#include <string.h>
/*
 * I call it a general bitset. Others might call it an abomination. YMMV.
 *
 * Bits are stored in an array of uint32_t words; bit n lives in word
 * n / ERAT_BITS at position n % ERAT_BITS.
 *
 * NOTE: the bit macros evaluate their index argument more than once, so do
 * not pass expressions with side effects (e.g. i++).
 */
# define ERAT_BITS (sizeof(uint32_t)*CHAR_BIT)
/* Storage needed for n bits: always at least one whole word. */
# define ERAT_WORDS(n) ((size_t)(n) / ERAT_BITS + 1)
# define ERAT_BYTES(n) (ERAT_WORDS(n) * sizeof(uint32_t))
/* Unsigned one-bit mask; UINT32_C(1) avoids the signed overflow of 1<<31. */
# define ERAT_MASK(n) (UINT32_C(1) << ((n) % ERAT_BITS))
# define GET_BIT(s,n) ((*((s)+((n)/ERAT_BITS)) & ERAT_MASK(n)) != 0)
# define SET_BIT(s,n) (*((s)+((n)/ERAT_BITS)) |= ERAT_MASK(n))
# define CLEAR_BIT(s,n) (*((s)+((n)/ERAT_BITS)) &= ~ERAT_MASK(n))
# define TOG_BIT(s,n) (*((s)+((n)/ERAT_BITS)) ^= ERAT_MASK(n))
/* size is the size in bits, the overall size might be bigger */
typedef struct mp_bitset_t {
    uint32_t size;      /* number of usable bits */
    uint32_t *content;  /* word storage, ERAT_BYTES(size) bytes long */
} mp_bitset_t;
/*
 * Allocate storage for n bits inside an existing mp_bitset_t.
 * The bits are NOT initialized; call mp_bitset_setall/clearall afterwards.
 * Terminates the program if the allocation fails.
 */
# define mp_bitset_alloc(bst, n) \
do {\
    (bst)->content = malloc(ERAT_BYTES(n));\
    if ((bst)->content == NULL) {\
        fprintf(stderr, "memory allocation for bitset failed");\
        exit(EXIT_FAILURE);\
    }\
    (bst)->size = (n);\
} while (0)
# define mp_bitset_size(bst) ((bst)->size)
/* Set / clear every bit of the allocated storage. */
# define mp_bitset_setall(bst) memset((bst)->content, 0xFF,\
    ERAT_BYTES((bst)->size))
# define mp_bitset_clearall(bst) memset((bst)->content, 0,\
    ERAT_BYTES((bst)->size))
# define mp_bitset_clear(bst,n) CLEAR_BIT((bst)->content, n)
# define mp_bitset_set(bst,n) SET_BIT((bst)->content, n)
# define mp_bitset_get(bst,n) GET_BIT((bst)->content, n)
/* Releases the word storage AND the mp_bitset_t itself (must be heap-allocated). */
# define mp_bitset_free(bst) \
do {\
    free((bst)->content);\
    free(bst);\
} while (0)
uint32_t mp_bitset_nextset(mp_bitset_t * bst, uint32_t n);
uint32_t mp_bitset_prevset(mp_bitset_t * bst, uint32_t n);
void mp_eratosthenes(mp_bitset_t * bst);
/* It's called Hallek's method but it has many inventors*/
/*
 * Integer square root: returns floor(sqrt(n)) using the bit-by-bit
 * (digit-by-digit, base 4) method. No multiplication or division is used.
 */
static uint32_t isqrt(uint32_t n)
{
    uint32_t remainder = n;
    uint32_t result = 0;
    uint32_t bit;

    if (n == 0) {
        return 0;
    }
    /* Start at the highest power of four that fits and walk downwards. */
    for (bit = UINT32_C(1) << 30; bit != 0; bit >>= 2) {
        const uint32_t trial = result | bit;

        result >>= 1;
        if (remainder >= trial) {
            remainder -= trial;
            result |= bit;
        }
    }
    return result;
}
/*
 * Returns the index of the first set bit at position n or later,
 * or a value >= mp_bitset_size(bst) if there is none.
 */
uint32_t mp_bitset_nextset(mp_bitset_t *bst, uint32_t n)
{
    for (; n < mp_bitset_size(bst); n++) {
        if (mp_bitset_get(bst, n)) {
            break;
        }
    }
    return n;
}
/*
* Standard method, quite antique now, but good enough for the handful
* of primes needed here.
*/
/*
 * Fills the bitset so that bit i is set exactly when i is prime
 * (for every i below the bitset size).
 */
void mp_eratosthenes(mp_bitset_t *bst)
{
    uint32_t limit, root, prime, multiple;

    /* Start with "everything is prime", then knock out 0, 1 and the evens. */
    mp_bitset_setall(bst);
    mp_bitset_clear(bst, 0);
    mp_bitset_clear(bst, 1);
    limit = mp_bitset_size(bst);
    root = isqrt(limit);
    for (multiple = 4; multiple < limit; multiple += 2) {
        mp_bitset_clear(bst, multiple);
    }
    /* Only primes up to sqrt(limit) need to be sieved with. */
    for (prime = mp_bitset_nextset(bst, 1);
         prime < limit && prime <= root;
         prime = mp_bitset_nextset(bst, prime + 1)) {
        /* Even multiples are already gone, so step by 2 * prime. */
        for (multiple = prime * prime; multiple < limit; multiple += prime * 2) {
            mp_bitset_clear(bst, multiple);
        }
    }
}
#define UPPER_LIMIT 1000000 /* one million */
/*
 * Prints every prime below UPPER_LIMIT, one per line, sieving on the fly:
 * each prime is printed as soon as it is reached, then its odd multiples
 * (starting at its square) are crossed off.
 */
int main(void) {
    mp_bitset_t *bst;
    uint32_t n, k, j;
    bst = malloc(sizeof(mp_bitset_t));
    if(bst == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes\n",sizeof(mp_bitset_t));
        exit(EXIT_FAILURE);
    }
    mp_bitset_alloc(bst, UPPER_LIMIT);
    mp_bitset_setall(bst);
    mp_bitset_clear(bst, 0); // 0 is not prime b.d.
    mp_bitset_clear(bst, 1); // 1 is not prime b.d.
    n = mp_bitset_size(bst);
    for (k = 4; k < n; k += 2) {
        mp_bitset_clear(bst, k);
    }
    k = 0;
    while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
        printf("%" PRIu32 "\n", k);
        /*
         * Skip primes whose square is not below n. Besides being useless
         * work, computing k * k in uint32_t would wrap around for
         * k >= 65536 and clear bits that are not multiples of k at all.
         */
        if (k > n / k) {
            continue;
        }
        for (j = k * k; j < n; j += k * 2) {
            mp_bitset_clear(bst, j);
        }
    }
    mp_bitset_free(bst);
    return EXIT_SUCCESS;
}
Compiled with
gcc-4.9 -O3 -g3 -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing -pedantic -std=c11 tests.c -o tests
(GCC is gcc-4.9.real (Ubuntu 4.9.4-2ubuntu1~14.04.1) 4.9.4)

By default, console output is line buffered, which is the reason for the increased time.
You can use the setvbuf function to allow printing to console/stdout only in chunks rather than for each iteration.
E.g.
char buffer[256];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
You can alter the size of buffer according to your needs.
The _IOFBF option selects full buffering, i.e. output is written only once the buffer is full.
See setvbuf for more details

Related

Faster divisibility test than % operator?

I noticed a curious thing on my computer.* The handwritten divisibility test is significantly faster than the % operator. Consider the minimal example:
* AMD Ryzen Threadripper 2990WX, GCC 9.2.0
/*
 * Returns 1 if m is divisible by a, 0 otherwise.
 * Intended for odd a and m > 0: adding a and stripping trailing zero bits
 * preserves divisibility by an odd a, so m is reduced until it is <= a.
 */
static int divisible_ui_p(unsigned int m, unsigned int a)
{
    while (m > a) {
        m += a;
        m >>= __builtin_ctz(m);
    }
    return m == a;
}
The example is limited by odd a and m > 0. However, it can be easily generalized to all a and m. The code just converts the division to a series of additions.
Now consider the test program compiled with -std=c99 -march=native -O3:
for (unsigned int a = 1; a < 100000; a += 2) {
for (unsigned int m = 1; m < 100000; m += 1) {
#if 1
volatile int r = divisible_ui_p(m, a);
#else
volatile int r = (m % a == 0);
#endif
}
}
... and the results on my computer:
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 8.52user |
| builtin % operator | 17.61user |
Therefore more than 2 times faster.
The question: Can you tell me how the code behaves on your machine? Is it missed optimization opportunity in GCC? Can you do this test even faster?
UPDATE:
As requested, here is a minimal reproducible example:
#include <assert.h>
/*
 * Divisibility test without division: returns 1 when a divides m, else 0.
 * Only valid for odd a and m > 0. Each round adds a (keeping the remainder
 * modulo a unchanged) and drops the trailing zero bits of the sum.
 */
static int divisible_ui_p(unsigned int m, unsigned int a)
{
    while (m > a) {
        m += a;
        m >>= __builtin_ctz(m);
    }
    return (m == a) ? 1 : 0;
}
/*
 * Benchmark driver: runs the divisibility test for every odd a and every m
 * below 100000. The assert cross-checks against % unless NDEBUG is defined;
 * the volatile store keeps the compiler from discarding the computation.
 */
int main()
{
    unsigned int a = 1;

    while (a < 100000) {
        unsigned int m = 1;

        while (m < 100000) {
            assert(divisible_ui_p(m, a) == (m % a == 0));
#if 1
            volatile int r = divisible_ui_p(m, a);
#else
            volatile int r = (m % a == 0);
#endif
            m += 1;
        }
        a += 2;
    }
    return 0;
}
compiled with gcc -std=c99 -march=native -O3 -DNDEBUG on AMD Ryzen Threadripper 2990WX with
gcc --version
gcc (Gentoo 9.2.0-r2 p3) 9.2.0
UPDATE2: As requested, the version that can handle any a and m (if you also want to avoid integer overflow, the test has to be implemented with integer type twice as long as the input integers):
/*
 * Returns 1 if m is divisible by a, 0 otherwise, for any m and a.
 *
 * - a == 0: only 0 is treated as divisible by 0.
 * - m == 0: 0 is divisible by every non-zero a.
 * - even a: the power of two in a must also divide m; it is then stripped
 *   from a so that the odd-divisor reduction below applies.
 *
 * The reduction runs in a wider type so that "m + a" cannot wrap around
 * (assumes unsigned long long is wider than unsigned int).
 */
int divisible_ui_p(unsigned int m, unsigned int a)
{
    unsigned long long wide;
    int alpha;

    /* __builtin_ctz(0) is undefined, so handle both zero cases up front. */
    if (a == 0) {
        return m == 0;
    }
    if (m == 0) {
        return 1;
    }
    /* handles even a */
    alpha = __builtin_ctz(a);
    if (alpha) {
        if (__builtin_ctz(m) < alpha) {
            return 0;
        }
        a >>= alpha;
    }
    /* a is odd now: adding a and dropping factors of two keeps divisibility. */
    wide = m;
    while (wide > a) {
        wide += a;
        wide >>= __builtin_ctzll(wide);
    }
    return wide == a;
}
What you’re doing is called strength reduction: replacing an expensive operation with a series of cheap ones.
The mod instruction on many CPUs is slow, because it historically was not tested in several common benchmarks and the designers therefore optimized other instructions instead. This algorithm will perform worse if it has to do many iterations, and % will perform better on a CPU where it needs only two clock cycles.
Finally, be aware that there are many shortcuts to take the remainder of division by specific constants. (Although compilers will generally take care of this for you.)
I will answer my question myself. It seems that I became a victim of branch prediction. The mutual size of the operands does not seem to matter, only their order.
Consider the following implementation
/*
 * Returns 1 when m is divisible by a (odd a, m > 0 expected), else 0.
 * m is repeatedly replaced by (m + a) with its trailing zero bits removed
 * until it no longer exceeds a.
 */
int divisible_ui_p(unsigned int m, unsigned int a)
{
    for (;;) {
        if (m <= a) {
            return (m == a) ? 1 : 0;
        }
        m += a;
        m >>= __builtin_ctz(m);
    }
}
and the arrays
unsigned int A[100000/2];
unsigned int M[100000-1];
for (unsigned int a = 1; a < 100000; a += 2) {
A[a/2] = a;
}
for (unsigned int m = 1; m < 100000; m += 1) {
M[m-1] = m;
}
which are / are not shuffled using the shuffle function.
Without shuffling, the results are still
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 8.56user |
| builtin % operator | 17.59user |
However, once I shuffle these arrays, the results are different
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 31.34user |
| builtin % operator | 17.53user |

Efficient way to find divisibility

My professor says this isn't an efficient algorithm to check whether a number is divisible by any number from 100,000 to 150,000. I'm having trouble finding a better way. Any help would be appreciated.
/*
 * Returns 0 if n is divisible by at least one integer in
 * [100000, 150000], and 1 if it is divisible by none of them.
 */
unsigned short divisibility_check(unsigned long n) {
    unsigned long candidate = 100000;

    while (candidate <= 150000) {
        if (n % candidate == 0) {
            return 0;
        }
        candidate++;
    }
    return 1;
}
Let's say you need to find whether a positive integer K is divisible by a number between 100,000 and 150,000, and it is such a rare operation, that doing precalculations is just not worth the processor time or memory used.
If K < 100,000, it cannot be divisible by a number between 100,000 and 150,000.
If 100,000 ≤ K ≤ 150,000, it is divisible by itself. It is up to you to decide whether this counts or not.
For a K > 150,000 to be divisible by M, with 100,000 ≤ M ≤ 150,000, K must also be divisible by L = K / M. This is because K = L × M, and all three are positive integers. So, you only need to test the divisibility of K by a set of L, where ⌊ K / 150,000 ⌋ ≤ L ≤ ⌊ K / 100,000 ⌋.
However, that set of Ls becomes larger than the set of possible Ms when K ≥ 15,000,000,000. Then it is again less work to just test K for divisibility against each M, much like OP's code does now.
When implementing this as a program, the most important thing in practice is, surprisingly, the comments you add. Do not write comments that describe what the code does; write comments that explain the model or algorithm you are trying to implement (say, at the function level), and your intent of what each small block of code should accomplish.
In this particular case, you should probably add a comment to each if clause, explaining your reasoning, much like I did above.
Beginner programmers often omit comments completely. It is unfortunate, because writing good comments is a hard habit to pick up afterwards. It is definitely a good idea to learn to comment your code (as I described above -- the comments that describe what the code does are less than useful; more noise than help), and keep honing your skill on that.
A programmer whose code is maintainable, is worth ten geniuses who produce write-only code. This is because all code has bugs, because humans make errors. To be an efficient developer, your code must be maintainable. Otherwise you're forced to rewrite each buggy part from scratch, wasting a lot of time. And, as you can see above, "optimization" at the algorithmic level, i.e. thinking about how to avoid having to do work, yields much better results than trying to optimize your loops or something like that. (You'll find in real life that surprisingly often, optimizing a loop in the proper way, removes the loop completely.)
Even in exercises, proper comments may be the difference between "no points, this doesn't work" and "okay, I'll give you partial credit for this one, because you had a typo/off-by-one bug/thinko on line N, but otherwise your solution would have worked".
As bolov did not understand how the above leads to a "naive_with_checks" function, I'll show it implemented here.
For ease of testing, I'll show a complete test program. Supply the range of integers to test, and the range of divisors accepted, as parameters to the program (i.e. thisprogram 1 500000 100000 150000 to duplicate bolov's tests).
#include <stdlib.h>
#include <inttypes.h>
#include <limits.h>
#include <locale.h>
#include <ctype.h>
#include <stdio.h>
#include <errno.h>
/*
 * Returns 1 if number has a divisor d with
 * minimum_divisor <= d <= maximum_divisor, and 0 otherwise.
 *
 * Instead of always trying every divisor, the function also considers the
 * possible quotients number / d, and scans whichever range is smaller.
 */
int is_divisible(const uint64_t number,
                 const uint64_t minimum_divisor,
                 const uint64_t maximum_divisor)
{
    uint64_t lowest_quotient, highest_quotient, candidate;

    /* Too small to be divisible by anything in the range. */
    if (number < minimum_divisor) {
        return 0;
    }
    /* The number lies inside the range, so it divides itself. */
    if (number <= maximum_divisor) {
        return 1;
    }

    /* Range of possible quotients; a quotient of 1 was handled above. */
    lowest_quotient = number / maximum_divisor;
    if (lowest_quotient < 2) {
        lowest_quotient = 2;
    }
    highest_quotient = number / minimum_divisor;
    if (highest_quotient < lowest_quotient) {
        highest_quotient = lowest_quotient;
    }

    if (highest_quotient - lowest_quotient > maximum_divisor - minimum_divisor) {
        /* Fewer divisors than quotients: test each divisor directly. */
        candidate = minimum_divisor;
        while (candidate <= maximum_divisor) {
            if (number % candidate == 0) {
                return 1;
            }
            candidate++;
        }
        return 0;
    }

    /* Fewer quotients than divisors: test the quotients instead. */
    candidate = lowest_quotient;
    while (candidate <= highest_quotient) {
        if (number % candidate == 0) {
            const uint64_t cofactor = number / candidate;

            if (cofactor >= minimum_divisor && cofactor <= maximum_divisor) {
                return 1;
            }
        }
        candidate++;
    }
    return 0;
}
/*
 * Parses the string s as an unsigned 64-bit integer (decimal, octal with a
 * leading 0, or hexadecimal with a leading 0x, as accepted by strtoull with
 * base 0). Leading and trailing whitespace is allowed.
 *
 * On success stores the value in *to (if to is not NULL) and returns 0.
 * Returns -1 if s is NULL or empty, has no digits, is negative, is out of
 * range, or has trailing garbage.
 */
int parse_u64(const char *s, uint64_t *to)
{
    unsigned long long value;
    const char *sign;
    char *end;

    /* Empty strings are not valid. */
    if (s == NULL || *s == '\0')
        return -1;

    /* strtoull() accepts "-N" and silently negates it; reject it here. */
    sign = s;
    while (isspace((unsigned char)(*sign)))
        sign++;
    if (*sign == '-')
        return -1;

    /* Parse as unsigned long long. */
    errno = 0;
    value = strtoull(s, &end, 0);
    if (errno == ERANGE)
        return -1;
    if (end == s)
        return -1;

    /* Overflow? (Only possible if unsigned long long is wider than 64 bits.) */
    if (value > UINT64_MAX)
        return -1;

    /* Skip trailing whitespace. */
    while (isspace((unsigned char)(*end)))
        end++;

    /* If the string does not end here, it has garbage in it. */
    if (*end != '\0')
        return -1;

    if (to)
        *to = (uint64_t)value;
    return 0;
}
/*
 * Command-line driver: counts how many integers in [MIN, MAX] have a divisor
 * in [MIN_DIVISOR, MAX_DIVISOR] and prints that count. With any argument
 * count other than four, it prints the usage text to stderr.
 */
int main(int argc, char *argv[])
{
    uint64_t kmin, kmax, dmin, dmax;
    uint64_t hits = 0;
    uint64_t current;

    if (argc != 5) {
        const char *prog = argv[0];

        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help | help ]\n", prog);
        fprintf(stderr, " %s MIN MAX MIN_DIVISOR MAX_DIVISOR\n", prog);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program counts which positive integers between MIN and MAX,\n");
        fprintf(stderr, "inclusive, are divisible by MIN_DIVISOR to MAX_DIVISOR, inclusive.\n");
        fprintf(stderr, "\n");
        return EXIT_SUCCESS;
    }

    /* Use current locale. This may change which codes isspace() considers whitespace. */
    if (setlocale(LC_ALL, "") == NULL)
        fprintf(stderr, "Warning: Your C library does not support your current locale.\n");

    /* Validate the four numeric arguments in order. */
    if (parse_u64(argv[1], &kmin) != 0 || kmin < 1) {
        fprintf(stderr, "%s: Invalid minimum positive integer to test.\n", argv[1]);
        return EXIT_FAILURE;
    }
    /* kmax must stay below UINT64_MAX so the counting loop can terminate. */
    if (parse_u64(argv[2], &kmax) != 0 || kmax < kmin || kmax >= UINT64_MAX) {
        fprintf(stderr, "%s: Invalid maximum positive integer to test.\n", argv[2]);
        return EXIT_FAILURE;
    }
    if (parse_u64(argv[3], &dmin) != 0 || dmin < 2) {
        fprintf(stderr, "%s: Invalid minimum divisor to test for.\n", argv[3]);
        return EXIT_FAILURE;
    }
    if (parse_u64(argv[4], &dmax) != 0 || dmax < dmin) {
        fprintf(stderr, "%s: Invalid maximum divisor to test for.\n", argv[4]);
        return EXIT_FAILURE;
    }

    current = kmin;
    while (current <= kmax) {
        hits += is_divisible(current, dmin, dmax);
        current++;
    }
    printf("%" PRIu64 "\n", hits);
    return EXIT_SUCCESS;
}
It is useful to note that the above, running bolov's test, i.e. thisprogram 1 500000 100000 150000 only takes about 15 ms of wall clock time (13 ms CPU time), median, on a much slower Core i5-7200U processor. For really large numbers, like 280,000,000,000 to 280,000,010,000, the test does the maximum amount of work, and takes about 3.5 seconds per 10,000 numbers on this machine.
In other words, I wouldn't trust bolov's numbers to have any relation to timings for properly written test cases.
It is important to note that for any K between 1 and 500,000, the same test that bolov says their code measures, the above code does at most two divisibility tests to find if K is divisible by an integer between 100,000 and 150,000.
This solution is therefore quite efficient. It is definitely acceptable and near-optimal, when the tested K are relatively small (say, 32 bit unsigned integers or smaller), or when precomputed tables cannot be used.
Even when precomputed tables can be used, it is unclear if/when prime factorization becomes faster than the direct checks. There is certainly a tradeoff in the size and content of the precomputed tables. bolov claims that it is clearly superior to other methods, but hasn't implemented a proper "naive" divisibility test as shown above, and bases their opinion on experiments on quite small integers (1 to 500,000) that have simple prime decompositions.
As an example, a table of integers 1 to 500,000 pre-checked for divisibility takes only 62500 bytes (43750 bytes for 150,000 to 500,000). With that table, each test takes a small near-constant time (that only depends on memory and cache effects). Extending it to all 32-bit unsigned integers would require 512 MiB (536,870,912 bytes); the table can be stored in a memory-mapped read-only file, to let the OS kernel manage how much of it is mapped to RAM at any time.
Prime decomposition itself, especially using trial division, becomes more expensive than the naive approach when the number of trial divisions exceeds the range of possible divisors (50,000 divisors in this particular case). As there are 13848 primes (if one counts 1 and 2 as primes) between 1 and 150,000, the number of trial divisions can easily approach the number of divisors for sufficiently large input values.
For numbers with many prime factors, the combinatoric phase, finding if any subset of the prime factors multiply to a number between 100,000 and 150,000 is even more problematic. The number of possible combinations grows faster than exponentially. Without careful checks, this phase alone can do way more work per large input number than just trial division with each possible divisor would be.
(As an example, if you have 16 different prime factors, you already have 65,535 different combinations; more than the number of direct trial divisions. However, all such numbers are larger than 64-bit; the smallest being 2·3·5·7·11·13·17·19·23·29·31·37·41·43·47·53 = 32,589,158,477,190,044,730 which is a 65-bit number.)
There is also the problem of code complexity. The more complex the code, the harder it is to debug and maintain.
Ok, so I've implemented the version with sieve primes and factorization mentioned in the comments by m69 and it is ... way faster than the naive approach. I must admit, I didn't expect this at all.
My notations: left == 100'000 and right = 150'000
naive your version
naive_with_checks your version with simple checks:
if (n < left) no divisor
else if (n <= right) divisor
else if (left * 2 >= right && n < left * 2) divisor
factorization (above checks implemented)
Precompute the Sieve of Eratosthenes for all primes up to right. This time is not measured
factorize n (only with the primes from the prev step)
generate all subsets (backtracking, depth first: i.e. generate p1^0 * p2^0 * p3^0 first, instead of p1^5 first) with the product < left or until the product is in [left, right] (found divisor).
factorization_opt optimization of the previous algorithm where the subsets are not generated (no vector of subsets is created). I just pass the current product from one backtracking iteration to the next.
Nominal Animal's version I have also ran his version on my system with the same range.
I have written the program in C++ so I won't share it here.
I used std::uint64_t as data type and I have checked all numbers from 1 to 500'000 to see if each is divisible by a number in interval [100'000, 150'000]. All version reached the same solution: 170'836 numbers with positive results.
The setup:
Hardware: Intel Core i7-920, 4 cores with HT (all algorithm versions are single threaded), 2.66 GHz (boost 2.93 GHz),
8 MB SmartCache; memory: 6 GB DDR3 triple channel.
Compiler: Visual Studio 2017 (v141), Release x64 mode.
I must also add that I haven't profiled the programs so there is definitely room to improve the implementation. However this is enough here as the idea is to find a better algorithm.
version | elapsed time (milliseconds)
-----------------------+--------------
naive         |  167'378 ms (yes, it's thousands separator, aka 167 seconds)
naive_with_checks |   97'197 ms
factorization | 7'906 ms
factorization_opt | 7'320 ms
|
Nominal Animal version | 14 ms
Some analysis:
For naive vs naive_with_checks: all the numbers in [1 200'000] can be solved with just the simple checks. As these represent 40% of all the numbers checked, the naive_with_checks version does roughly 60% of the work naive does. The execution time reflect this as naive_with_checks runtime is ≅58% of the naive version.
The factorization version is a whopping 12.3 times faster. That is indeed impressive. I haven't analyzed the time complexity of the alg.
And the final optimization brings a further 1.08x speedup. This is basically the time gained by removing the creation and copy of the small vectors of subset factors.
For those interested the sieve precomputation which is not included above takes about 1 ms. And this is the naive implementation from wikipedia, no optimizations whatsoever.
For comparison, here's what I had in mind when I posted my comment about using prime factorization. Compiled with gcc -std=c99 -O3 -m64 -march=haswell this is slightly faster than the naive method with checks and inversion when tested with the last 10,000 integers in the 64-bit range (3.469 vs 3.624 seconds).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
/*
 * Sieve of Eratosthenes over ptr[0..size-1]: afterwards ptr[i] is true for
 * every prime i >= 2 and false for every composite i. Entries 0 and 1 are
 * left true; callers are expected to start at index 2.
 */
void eratosthenes(bool *ptr, uint64_t size) {
    uint64_t base = 2;

    memset(ptr, true, size);
    while (base * base < size) {
        if (ptr[base]) {
            uint64_t multiple = base * base;

            while (multiple < size) {
                ptr[multiple] = false;
                multiple += base;
            }
        }
        base++;
    }
}
/*
 * Returns true if n is divisible by some integer in [a, b].
 *
 * Method: factorize n by trial division with sieved primes, then build the
 * set of divisors that can be formed from those prime factors and check
 * whether any of them falls in the target range.
 *
 * The prime sieve and the divisor table are cached in function-local static
 * buffers that grow on demand and are never freed; the function is therefore
 * not reentrant. On allocation failure it prints a message and returns false.
 */
bool divisible(uint64_t n, uint64_t a, uint64_t b) {
    /* check for trivial cases first */
    if (n < a) {
        return false;
    }
    if (n <= b) {
        return true;
    }
    if (n < 2 * a) {
        return false;
    }
    /* Inversion: use range n/b ~ n/a; see Nominal Animal's answer */
    if (n < a * b) {
        uint64_t c = a;
        a = (n + b - 1) / b; // n/b rounded up
        b = n / c;
    }
    /* Create prime sieve when first called, or re-calculate it when */
    /* called with a higher value of b; place before inversion in case */
    /* of a large sequential test, to avoid repeated re-calculation. */
    static bool *prime = NULL;
    static uint64_t prime_size = 0;
    if (prime_size <= b) {
        /* Grow through a temporary so that a failed realloc() neither */
        /* loses the old buffer nor leaves prime_size out of sync. */
        uint64_t wanted = b + 1;
        bool *grown = realloc(prime, wanted * sizeof(bool));
        if (!grown) {
            printf("Out of memory!\n");
            return false;
        }
        prime = grown;
        prime_size = wanted;
        eratosthenes(prime, prime_size);
    }
    /* Factorize n into prime factors up to b, using trial division; */
    /* there are more efficient but also more complex ways to do this. */
    /* You could return here, if a factor in the range a~b is found. */
    static uint64_t factor[63];
    uint8_t factors = 0;
    for (uint64_t i = 2; i <= n && i <= b; i++) {
        if (prime[i]) {
            while (n % i == 0) {
                factor[factors++] = i;
                n /= i;
            }
        }
    }
    /* Prepare divisor sieve when first called, or re-allocate it when */
    /* called with a higher value of b; in a higher-level language, you */
    /* would probably use a different data structure for this, because */
    /* this method iterates repeatedly over a potentially sparse array. */
    static bool *divisor = NULL;
    static uint64_t div_size = 0;
    if (div_size <= b / 2 || div_size < 2) {
        /* At least two entries, because divisor[1] is written below. */
        uint64_t wanted = b / 2 + 1;
        if (wanted < 2) {
            wanted = 2;
        }
        bool *grown = realloc(divisor, wanted * sizeof(bool));
        if (!grown) {
            printf("Out of memory!\n");
            return false;
        }
        divisor = grown;
        div_size = wanted;
    }
    memset(divisor, false, div_size);
    divisor[1] = true;
    uint64_t max = 1;
    /* Iterate over each prime factor, and for every divisor already in */
    /* the sieve, add the product of the divisor and the factor, up to */
    /* the value b/2. If the product is in the range a~b, return true. */
    for (uint8_t i = 0; i < factors; i++) {
        /* Walk downwards so products added in this pass are not reused */
        /* with the same factor occurrence. */
        for (uint64_t j = max; j > 0; j--) {
            if (divisor[j]) {
                uint64_t product = factor[i] * j;
                if (product >= a && product <= b) {
                    return true;
                }
                if (product < div_size) {
                    divisor[product] = true;
                    if (product > max) {
                        max = product;
                    }
                }
            }
        }
    }
    return false;
}
/*
 * Counts how many of the last 10,000 integers below UINT64_MAX have a
 * divisor in [100000, 150000] and prints the count.
 */
int main() {
    uint64_t count = 0;
    /* The upper bound is UINT64_MAX - 1, so n++ cannot wrap inside the loop. */
    for (uint64_t n = 18446744073709541615LLU; n <= 18446744073709551614LLU; n++) {
        if (divisible(n, 100000, 150000)) ++count;
    }
    /* uint64_t is not necessarily unsigned long long; cast to match %llu. */
    printf("%llu", (unsigned long long)count);
    return 0;
}
And this is the naive + checks + inversion implementation I compared it with:
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
/*
 * Returns true if n is divisible by some integer in [a, b].
 * Trivial cases are answered directly; otherwise the smaller of the two
 * candidate ranges (divisors a..b, or quotients n/b..n/a) is scanned.
 */
bool divisible(uint64_t n, uint64_t a, uint64_t b) {
    uint64_t first = a;
    uint64_t last = b;

    if (n < a) {
        return false;   /* smaller than every candidate divisor */
    }
    if (n <= b) {
        return true;    /* n lies in the range and divides itself */
    }
    if (n < 2 * a) {
        return false;   /* quotient would be strictly between 1 and 2 */
    }
    if (n < a * b) {
        /* Inversion: scan the possible quotients instead of the divisors. */
        first = (n + b - 1) / b;
        last = n / a;
    }
    for (uint64_t candidate = first; candidate <= last; candidate++) {
        if (n % candidate == 0) {
            return true;
        }
    }
    return false;
}
/*
 * Counts how many of the last 10,000 integers below UINT64_MAX have a
 * divisor in [100000, 150000] and prints the count.
 */
int main() {
    uint64_t count = 0;
    /* The upper bound is UINT64_MAX - 1, so n++ cannot wrap inside the loop. */
    for (uint64_t n = 18446744073709541615LLU; n <= 18446744073709551614LLU; n++) {
        if (divisible(n, 100000, 150000)) ++count;
    }
    /* uint64_t is not necessarily unsigned long long; cast to match %llu. */
    printf("%llu", (unsigned long long)count);
    return 0;
}
Here's a recursive method with primes. The idea here is that if a number is divisible by a number between 100000 and 150000, there is a path of reducing by division the product of only relevant primes that will pass through a state in the target range. (Note: the code below is meant for numbers greater than 100000*150000). In my testing, I could not find an instance where the stack performed over 600 iterations.
# Euler sieve
def getPrimes():
    """Sieve the primes up to 150000.

    Returns a pair of lists ``(small, large)``: ``small`` holds the primes
    below 75000 and ``large`` the primes above 100000 (up to 150000).
    """
    n = 150000
    a = (n + 1) * [None]   # a[k] is True once k is known to be composite
    ps = ([], [])
    s = []
    p = 1
    while p < n:
        p = p + 1
        if not a[p]:
            s.append(p)
            # Save primes less than half of 150000, the only
            # ones needed to construct our candidates.
            if p < 75000:
                ps[0].append(p)
            # Save primes between 100000 and 150000
            # in case our candidate is prime.
            elif p > 100000:
                ps[1].append(p)
            # Integer division on purpose (same result on Python 2 and 3).
            limit = n // p
            new_s = []
            for i in s:
                j = i
                while j <= limit:
                    new_s.append(j)
                    a[j * p] = True
                    j = j * p
            s = new_s
    return ps
ps1, ps2 = getPrimes()
def f(n):
    """Return True if n has a divisor in [100000, 150000], else False.

    Uses the module-level prime lists ps1 (primes below 75000) and ps2
    (primes above 100000). Per the accompanying text, this is meant for
    n greater than 100000 * 150000.
    """
    # Prime candidate
    for p in ps2:
        if not (n % p):
            return True
    # (primes, prime_counts)
    ds = ([], [])
    prod = 1
    # Prepare only prime factors that could
    # construct a composite candidate.
    for p in ps1:
        while not (n % p):
            prod *= p
            if (not ds[0] or ds[0][-1] != p):
                ds[0].append(p)
                ds[1].append(1)
            else:
                ds[1][-1] += 1
            # Exact integer division (works on Python 2 and 3 alike).
            n //= p
    # Reduce the primes product to a state
    # where it's between our target range.
    stack = [(prod, 0)]
    while stack:
        prod, i = stack.pop()
        # No point in reducing further
        if prod < 100000:
            continue
        # Exit early
        elif prod <= 150000:
            return True
        # Try reducing the product by different
        # prime powers, one prime at a time
        if i < len(ds[0]):
            for p in range(ds[1][i] + 1):
                stack.append((prod // ds[0][i]**p, i + 1))
    return False
Output:
c = 0
for ii in xrange(1099511627776, 1099511628776):
f_i = f(ii)
if f_i:
c += 1
print c # 239
Here is a very simple solution with a sieve cache. If you call the divisibility_check function for many numbers in a sequence, this should be very efficient:
#include <string.h>
/*
 * Returns 0 if n is divisible by some integer in [100000, 150000], else 1.
 *
 * Answers are cached for an aligned window of sizeof(sieve) consecutive
 * numbers; the window is rebuilt whenever n falls outside it. The initial
 * bounds (min 1, max 0) describe an empty window, forcing a first build.
 */
int divisibility_check_sieve(unsigned long n) {
    static unsigned long sieve_min = 1, sieve_max;
    static unsigned char sieve[1 << 19]; /* 1/2 megabyte */
    const int cached = (n >= sieve_min && n <= sieve_max);

    if (!cached) {
        /* Align the window start to a multiple of the window size. */
        sieve_min = n & ~(sizeof(sieve) - 1);
        sieve_max = sieve_min + sizeof(sieve) - 1;
        memset(sieve, 1, sizeof sieve);
        for (unsigned long m = 100000; m <= 150000; m++) {
            /* Offset of the first multiple of m inside the window. */
            unsigned long rem = sieve_min % m;
            unsigned long i = rem ? m - rem : 0;

            while (i < sizeof sieve) {
                sieve[i] = 0;
                i += m;
            }
        }
    }
    return sieve[n - sieve_min];
}
Here is a comparative benchmark:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Reference implementation: tries every divisor in [100000, 150000].
 * Returns 0 as soon as one divides n, and 1 if none does.
 */
int divisibility_check_naive(unsigned long n) {
    unsigned long d = 100000;

    while (d <= 150000) {
        if (n % d == 0) {
            return 0;
        }
        d++;
    }
    return 1;
}
/*
 * Returns 0 if n has a divisor in [100000, 150000], else 1.
 * Scans the possible quotients n / d when there are fewer of them than
 * divisors, and falls back to scanning every divisor otherwise.
 */
int divisibility_check_small(unsigned long n) {
    unsigned long lo = n / 150000;
    unsigned long hi = n / 100000;
    unsigned long i;

    /* Never test a quotient of zero. */
    if (lo == 0) {
        lo = 1;
    }
    if (hi == 0) {
        hi = 1;
    }

    if (hi - lo > 150000 - 100000) {
        /* More quotients than divisors: test the divisors directly. */
        i = 100000;
        while (i <= 150000) {
            if (n % i == 0) {
                return 0;
            }
            i++;
        }
        return 1;
    }

    /* Test each quotient and check that the matching divisor is in range. */
    i = lo;
    while (i <= hi) {
        if (n % i == 0) {
            const unsigned long cofactor = n / i;

            if (cofactor >= 100000 && cofactor <= 150000) {
                return 0;
            }
        }
        i++;
    }
    return 1;
}
/*
 * Returns 0 if n is divisible by some integer in [100000, 150000], else 1.
 *
 * Results for an aligned window of sizeof(sieve) consecutive numbers are
 * kept in a static table, which is recomputed whenever n is outside the
 * current window (initially empty: min 1, max 0).
 */
int divisibility_check_sieve(unsigned long n) {
    static unsigned long sieve_min = 1, sieve_max;
    static unsigned char sieve[1 << 19]; /* 1/2 megabyte */

    if (n >= sieve_min && n <= sieve_max) {
        return sieve[n - sieve_min];
    }

    /* Rebuild the table for the window containing n. */
    sieve_min = n & ~(sizeof(sieve) - 1);
    sieve_max = sieve_min + sizeof(sieve) - 1;
    memset(sieve, 1, sizeof sieve);
    for (unsigned long m = 100000; m <= 150000; m++) {
        /* Distance from the window start to the first multiple of m. */
        unsigned long offset = sieve_min % m;

        if (offset != 0) {
            offset = m - offset;
        }
        while (offset < sizeof sieve) {
            sieve[offset] = 0;
            offset += m;
        }
    }
    return sieve[n - sieve_min];
}
/*
 * Benchmark driver. Arguments: up to two numbers giving the inclusive range
 * to test (default 1..500000) and the keywords "naive", "small", "sieve"
 * selecting which implementations to time. The sieve variant is enabled by
 * default. Each selected variant prints its count and the CPU time used.
 */
int main(int argc, char *argv[]) {
    unsigned long n, count = 0, lmin, lmax, range[2] = { 1, 500000 };
    int pos = 0, naive = 0, small = 0, sieve = 1;
    clock_t t;
    char *p;
    int arg = 1;

    while (arg < argc) {
        const char *word = argv[arg];

        n = strtoul(word, &p, 0);
        if (*p == '\0' && pos < 2) {
            /* Fully numeric argument: next bound of the range. */
            range[pos++] = n;
        } else if (strcmp(word, "naive") == 0) {
            naive = 1;
        } else if (strcmp(word, "small") == 0) {
            small = 1;
        } else if (strcmp(word, "sieve") == 0) {
            sieve = 1;
        } else {
            printf("invalid argument: %s\n", word);
        }
        arg++;
    }

    /* lmax is one past the last value so the loops can use != (allows wrap). */
    lmin = range[0];
    lmax = range[1] + 1;

    if (naive) {
        t = clock();
        count = 0;
        n = lmin;
        while (n != lmax) {
            count += divisibility_check_naive(n);
            n++;
        }
        t = clock() - t;
        printf("naive: [%lu..%lu] -> %lu non-divisible numbers, %10.2fms\n",
               lmin, lmax - 1, count, t * 1000.0 / CLOCKS_PER_SEC);
    }
    if (small) {
        t = clock();
        count = 0;
        n = lmin;
        while (n != lmax) {
            count += divisibility_check_small(n);
            n++;
        }
        t = clock() - t;
        printf("small: [%lu..%lu] -> %lu non-divisible numbers, %10.2fms\n",
               lmin, lmax - 1, count, t * 1000.0 / CLOCKS_PER_SEC);
    }
    if (sieve) {
        t = clock();
        count = 0;
        n = lmin;
        while (n != lmax) {
            count += divisibility_check_sieve(n);
            n++;
        }
        t = clock() - t;
        printf("sieve: [%lu..%lu] -> %lu non-divisible numbers, %10.2fms\n",
               lmin, lmax - 1, count, t * 1000.0 / CLOCKS_PER_SEC);
    }
    return 0;
}
Here are some run times:
naive: [1..500000] -> 329164 non-divisible numbers, 158174.52ms
small: [1..500000] -> 329164 non-divisible numbers, 12.62ms
sieve: [1..500000] -> 329164 non-divisible numbers, 1.35ms
sieve: [0..4294967295] -> 3279784841 non-divisible numbers, 8787.23ms
sieve: [10000000000000000000..10000000001000000000] -> 765978176 non-divisible numbers, 2205.36ms

Read from the standard input a natural number, n. Find the greatest perfect square that is less than or equal to n

#include <stdio.h>
#include <stdlib.h>
/*
 * Reads a natural number n from standard input and prints the greatest
 * perfect square that is less than or equal to n.
 */
int main() {
    int n;
    int root = 0;

    printf("\n Introduce the number:\n");
    if (scanf("%d", &n) != 1 || n < 0) {
        fprintf(stderr, "Expected a natural number.\n");
        return EXIT_FAILURE;
    }
    /*
     * Advance root while (root + 1)^2 still fits under n. The comparison
     * is written as a division so that the square is never computed and
     * cannot overflow for n close to INT_MAX.
     */
    while (root + 1 <= n / (root + 1)) {
        root++;
    }
    /* Print only the final (greatest) square, not every one found. */
    printf("%d\n", root * root);
    return 0;
}
I have to find the greatest perfect square less than or equal to a number n. I succeeded in finding all the perfect squares that are smaller than n, but because the program displays each perfect square as soon as it finds it, I couldn't think of a way to compare all the perfect squares that were found (or at least that's what I think the problem is), so I would appreciate some help. I already know that you could also solve this problem using a simpler method (like the one below), and if you have any other ideas on how to solve it I'd like to hear them.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Reads a natural number n and prints the greatest perfect square <= n,
 * computed from the truncated floating-point square root.
 */
int main()
{
    int n,j;
    printf("\n Your number:\n");
    /* Reject unreadable or negative input: sqrt() of a negative is NaN. */
    if (scanf("%d",&n) != 1 || n < 0) {
        fprintf(stderr, "Expected a natural number.\n");
        return EXIT_FAILURE;
    }
    j=(int)sqrt(n);
    printf("%d",j*j);
    return 0;
}
You only need a single loop here. Check if i*i <= n. If so, set sq to i*i and increment i:
/* sq holds the greatest perfect square found so far; 0 covers n < 1. */
int n, i = 1, sq = 0;
printf("\n Introduce the number:\n");
scanf("%d", &n);
/* "i <= n / i" is equivalent to "i*i <= n" but cannot overflow. */
while (i <= n / i) {
    sq = i*i;
    i++;
}
printf("sq=%d\n", sq);
Find the greatest perfect square that is less than or equal to n
For n>=0, this is akin to finding the integer square root of n.
/* Returns the largest perfect square that does not exceed x. */
unsigned greatest_perfect_square(unsigned x) {
    const unsigned r = usqrt(x);   /* integer square root of x */

    return r * r;
}
if you have any other ideas on how to solve it I'd like to hear them.
The order of complexity to find the square root is O(bit-width-of-type-n). e.g. 16 iterations.
#include <limits.h>
/*
 * Integer square root computed one result bit per iteration.
 *
 * Each pass moves the two most significant bits of x into the running
 * remainder, doubles the partial root, and tries to subtract
 * (2 * root + 1); when the subtraction fits, the new root bit is 1.
 *
 * Returns floor(sqrt(x)).
 * Relies on unsigned having no padding bits and an even bit width.
 */
unsigned usqrt(unsigned x) {
    const unsigned top_bit = UINT_MAX - UINT_MAX / 2;
    const unsigned pair_total = sizeof(x) * CHAR_BIT / 2;
    unsigned root = 0;
    unsigned rem = 0;

    for (unsigned pairs_left = pair_total; pairs_left != 0; --pairs_left) {
        /* Pull the next two high bits of x into the low end of rem. */
        for (int k = 0; k < 2; ++k) {
            rem <<= 1;
            if (x & top_bit) {
                rem |= 1;
            }
            x <<= 1;
        }

        /* Make room for the next root bit and build the trial subtrahend. */
        root <<= 1;
        const unsigned trial = (root << 1) | 1;

        if (rem >= trial) {
            rem -= trial;
            root |= 1;
        }
    }
    return root;
}
OP's method is far far slower. Even the inner loop takes O(sqrt(n)) time.
Note:
OP's code: j == i * i is subject to overflow and leads to the incorrect answer when j is larger.
j/i == i performs a like test without overflow.
@Jonathan Leffler suggested a Newton-Raphson approximation approach. Some lightly tested code below works quite fast, often taking only a few iterations.
I suspect this is O(log(bit-width-of-type-n)) for the main part, yet of course still O(bit-width-of-type-n) for bit_width() as written, since it shifts out one bit per iteration.
Both of the functions could be improved.
/*
 * Number of bits needed to represent x: 0 for x == 0, otherwise the
 * position of the highest set bit plus one.
 */
unsigned bit_width(unsigned x) {
    unsigned bits;
    for (bits = 0; x != 0; x >>= 1) {
        bits++;
    }
    return bits;
}

/*
 * Integer square root by Newton-Raphson iteration.
 *
 * The first guess is 2^(bit_width(x)/2). Iteration stops when two
 * successive guesses are equal, or when they have differed by exactly one
 * for the second time. The result is the floored average of the last two
 * guesses.
 */
unsigned usqrt_NR(unsigned x) {
    if (x == 0) {
        return 0;
    }

    unsigned guess = 1u << bit_width(x) / 2;
    unsigned prev;
    unsigned unit_steps = 0;

    for (;;) {
        prev = guess;
        guess = (prev + x / prev) / 2;

        const unsigned delta = prev < guess ? guess - prev : prev - guess;
        if (delta == 0) {
            break;
        }
        /* The first step of size one may still be progress; a second one is
         * treated as the end of the iteration. */
        if (delta == 1 && ++unit_steps > 1) {
            break;
        }
    }
    return (prev + guess) / 2;
}
This minimizes the number of multiplications: it looks for the first square which is larger than n, meaning that the perfect square immediately before was the solution.
// Advance i until its square would exceed n; the previous i is the root.
for (i = 1; i <= n; i++) {
    // For i > 0 this is the same test as i*i > n, but it cannot overflow
    // when n is close to the largest representable value.
    if (i > n / i) {
        break;
    }
}
i--;
// i*i is your answer
On some platforms it might be useful to exploit the fact that (i+1)*(i+1) = i*i + 2*i + 1, or in other words, if you already have i^2, (i+1)^2 is obtained by adding i to it twice, and incrementing by 1; and at the beginning, 0^2 is 0 to prime the cycle.
// Walk the squares incrementally: at the top of each pass sq == i*i <= n.
for (i = 0, sq = 0; i < n; i++) {
    // Stop before forming (i+1)^2 if it would exceed n. Testing the gap
    // (n - sq) instead of the new square keeps every intermediate value
    // in range; signed overflow is undefined, so it cannot be detected
    // after the fact by looking for a negative sq.
    // In 16-bit arithmetic the last pass that gets through is i = 180,
    // giving sq = 32761 (181 squared); 182 squared is 33124, which cannot
    // be represented in signed 16-bit space.
    if (n - sq < 2 * i + 1) {
        break;
    }
    sq += i; // Or on some platforms sq += i<<1 instead of two sums
    sq += i; // Some compilers will auto-optimize "sq += 2*i" for the platform
    sq++; // Or even sq += ((2*i)|1) as adding 1 to even numbers is OR'ing 1
}
// (i*i) is your answer

minimal prime generator C

I have to create a minimal prime generator in C (we need the minimal primes that have at least two digits) and I cannot use tables. So my thought was: first, find all the primes; second, use masks to find all the subsequences of each number; and finally, check that every subsequence is not a prime number. Is the reason I cannot find the minimal primes that I didn't put in a condition to check whether a subsequence is prime or not? (My code is not ready, so it may have some mistakes, but it runs.)
my code
#include <stdio.h>
#define MAXNUMB 100
/*
 * For every odd candidate from 11 up to (but excluding) MAXNUMB that passes
 * trial division by odd numbers, prints the digit selections produced by
 * selector values 1 .. 2*digits-1 (one per line), then reports the
 * candidate as a minimal prime. No primality check is applied to the
 * printed selections.
 */
int main ()
{
    for (int cand = 11; cand < MAXNUMB; cand += 2) {
        /* Trial division: stop at the first odd divisor or once div*div > cand. */
        int div = 3;
        while (div * div <= cand && cand % div != 0) {
            div += 2;
        }
        if (div * div <= cand) {
            continue;   /* a divisor was found: not prime */
        }

        /* Count the decimal digits of the candidate. */
        int digits = 0;
        for (int rest = cand; rest != 0; rest /= 10) {
            digits++;
        }

        for (int sel = 1; sel < digits * 2; sel++) {
            int bits = sel;
            int place = 1;
            int picked = 0;

            /* Bit k of the selector decides whether the k-th digit from the
             * right is kept; kept digits are packed in their original order. */
            for (int rest = cand; rest != 0; rest /= 10, bits /= 2) {
                if (bits % 2 == 1) {
                    picked += rest % 10 * place;
                    place *= 10;
                }
            }
            printf ("%d \n",picked);
        }
        printf ("%d is minimal prime \n",cand);
    }
}
If MAXNUMB is not too large then you can very quickly find all the primes until MAXNUMB using the Sieve_of_Eratosthenes. After you have that you can get every subsequence of an n-digit number by counting from 1 to 2^n-1 and use the bit-pattern of the current count to specify the current subsequence. Check each subsequence in the sieve.
If MAXNUMB is too big then you can build the sieve up to sqrt(MAXNUMB), so that when you test whether a number is prime you only have to check whether it is divisible by any prime in the sieve, instead of checking whether it is divisible by any odd number.
Wikipedia has a nice pseudocode algorithm for primality. It's naive, but it works.
function is_prime(n : integer)
if n ≤ 1
return false
else if n ≤ 3
return true
else if n mod 2 = 0 or n mod 3 = 0
return false
let i ← 5
while i×i ≤ n
if n mod i = 0 or n mod (i + 2) = 0
return false
i ← i + 6
return true
I'll leave translating this into C for you. My one note is that i should be of a type larger than n's type. For the reason, consider what happens to i×i ≤ n when n is the largest integer.
When you're ready, you can look at my solution to this problem.
The concept of a minimal prime is moderately interesting. I wrote the following code to do the checking.
/* SO 33838621: Minimal Primes */
/*
** Find the minimal primes less than 100,000.
**
** A minimal prime is a prime number for which no subsequence of the digits
** that make up the number is itself prime.
** The question gives two examples:
** = 881 is prime and is a minimal prime because none of { 8, 8, 1, 88,
** 81, 81 } are prime.
** = 109 is prime but is not a minimal prime because { 1, 0, 9, 10, 9,
** 19 } includes the prime 19.
** Clearly, the single digit primes are all trivially minimal.
**
** Additional wrinkle: the code may not build up a table of primes.
**
** NB: All primes except 2 and 3 have the form 6N±1
*/
/*
** There are two problems to solve:
** (1) Check for primality without using a table of primes.
** (2) Generate all subsequences of a number.
** The latter problem is somewhat harder than the former.
** The surviving solution uses recursive string manipulation.
**
** NB: Command subsequences is derived from this, and helps check
** the validity of the minimal primes.
*/
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Trial-division primality test that needs no table of primes.
 *
 * Values below 2 are not prime. 2, 3, 5 and 7 are handled directly, then
 * multiples of those are rejected. The remaining candidates are tested
 * against divisors of the form 6k-1 and 6k+1, starting at 11 and 13.
 *
 * The loop bound is written as (c - 1) <= n / (c - 1) rather than
 * (c - 1) * (c - 1) <= n: the product overflows int for n close to
 * INT_MAX (e.g. n = 2147483647), which is undefined behaviour.
 */
static bool is_prime(int n)
{
    if (n < 2)
        return false;
    if (n == 2 || n == 3 || n == 5 || n == 7)
        return true;
    if (n % 2 == 0 || n % 3 == 0 || n % 5 == 0 || n % 7 == 0)
        return false;
    for (int c = 12; (c - 1) <= n / (c - 1); c += 6)
    {
        if (n % (c - 1) == 0 || n % (c + 1) == 0)
            return false;
    }
    return true;
}
/*
 * Reports whether some selection of n_digits characters from buffer, taken
 * in order and appended to the decimal prefix p0, forms a prime.
 *
 * p0       value built from the digits chosen so far (0 at the top level)
 * n_digits number of digits still to be chosen
 * buffer   NUL-terminated digit string to choose from
 * buflen   length of buffer; asserted to equal strlen(buffer)
 *
 * Each position is tried as the next digit; while more than one digit is
 * still needed the search recurses on the text after that position,
 * otherwise the completed value is tested with is_prime().
 */
static bool has_prime_n_digit_subset(int p0, int n_digits, const char *buffer, int buflen)
{
    assert(buflen >= 0 && strlen(buffer) == (size_t)buflen);

    const bool more_needed = n_digits > 1;

    for (int pos = 0; pos < buflen; pos++)
    {
        const int value = 10 * p0 + buffer[pos] - '0';
        const bool found = more_needed
            ? has_prime_n_digit_subset(value, n_digits - 1, &buffer[pos + 1], buflen - pos - 1)
            : is_prime(value);
        if (found)
            return true;
    }
    return false;
}
/*
 * Prints n on its own line when n is prime and no selection of fewer
 * digits (1 .. digit count - 1, taken in order) forms a prime.
 * n must be positive.
 */
static void check_minimal_prime(int n)
{
    assert(n > 0);
    if (!is_prime(n))
        return;

    char digits[20];
    /* sprintf returns the number of characters written, i.e. the digit count. */
    const int len = sprintf(digits, "%d", n);

    for (int k = 1; k < len; k++)
    {
        if (has_prime_n_digit_subset(0, k, digits, len))
            return;     /* a shorter prime is embedded: not minimal */
    }
    printf("%d\n", n); /* It's a minimal prime */
}
/*
 * Prints every minimal prime below a limit (default 100000, or the single
 * optional command-line argument).
 *
 * Candidates are 2, 3 and then the pairs 6k-1 / 6k+1. Every candidate is
 * compared against the limit individually. Dividing the limit by six up
 * front truncated the range and skipped the last partial group (a limit of
 * 12 never tested 11), while 2 and 3 were printed regardless of the limit.
 */
int main(int argc, char **argv)
{
    int max = 100000;
    if (argc > 2)
    {
        fprintf(stderr, "Usage: %s [maximum]\n", argv[0]);
        exit(1);
    }
    else if (argc == 2)
    {
        max = atoi(argv[1]);
        if (max <= 0)
        {
            fprintf(stderr, "Invalid number (%d from %s)\n", max, argv[1]);
            exit(1);
        }
    }

    if (max > 2)
        check_minimal_prime(2);
    if (max > 3)
        check_minimal_prime(3);

    /* low walks the 6k-1 values; low + 2 is the matching 6k+1 value. */
    for (int low = 5; low < max; low += 6)
    {
        check_minimal_prime(low);
        if (max - low > 2)
            check_minimal_prime(low + 2);
        /* Leave before low += 6 could step past max (and, for limits close
         * to INT_MAX, overflow). */
        if (max - low <= 6)
            break;
    }
    return 0;
}
The list of numbers generated was:
2
3
5
7
11
19
41
61
89
409
449
499
881
991
6469
6949
9001
9049
9649
9949
60649
666649
946669
60000049
66000049
66600049
I didn't find any more minimal primes when checking up to 1,000,000,000. Timing was:
100 0m0.006s
1000 0m0.006s
10000 0m0.006s
100000 0m0.012s
1000000 0m0.129s
10000000 0m2.617s
100000000 1m8.200s
1000000000 32m34.561s

Exponential problems and their C representation

I came across the well-known N-Queens problem and I was wondering how to write a program to calculate the number of possibilities in this particular problem. My program can find a solution fast only for really small N (since it's heuristic).
I'd also like to know how to represent such big numbers in C. Are there any algorithms for really big numbers? Any time I write an implementation of my own arithmetic I end up with, e.g., quadratic multiplication with tons of memory allocation, which cannot be fast. Thank you in advance for an exhaustive answer.
here is a nice solution, using recursion
(taken from: <http://rosettacode.org/wiki/N-queens_problem#C>)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
typedef uint32_t uint;

/*
 * Shared solver state:
 *   full  - mask with the low nn bits set (one bit per column)
 *   qs    - per-row column index of the queen placed in that row
 *   count - number of complete placements found
 *   nn    - board size
 */
uint full, *qs, count = 0, nn;

/*
 * Counts N-queens placements by recursive bitmask search.
 *
 * d - rows still to be filled; 0 means a full placement was reached
 * c - columns already occupied
 * l - squares attacked along one diagonal direction (shifted left per row)
 * r - squares attacked along the other diagonal direction (shifted right)
 *
 * Every completed placement increments the global count.
 */
void solve(uint d, uint c, uint l, uint r)
{
    if (d == 0) // exit condition
    {
        count++;
#if 0
        uint a, b;
        printf("\nNo. %d\n===========\n", count);
        for (a = 0; a < nn; a++, putchar('\n'))
        {
            for (b = 0; b < nn; b++, putchar(' '))
            {
                putchar(" -QQ"[((b == qs[a])<<1)|((a + b)&1)]);
            } // end for
        } // end for
#endif
        return;
    } // end if

    /* Move the diagonal attacks down one row. */
    l <<= 1;
    r >>= 1;

    const uint attacked = (c | l | r) & full;
    if (attacked == full)
    {
        return; /* every column in this row is under attack */
    }

    /* Take the next row and track the column being tried in qs[d]. */
    d--;
    uint *slot = &qs[d];
    *slot = 0;

    for (uint bit = 1; bit <= full; bit <<= 1)
    {
        if ((bit & attacked) == 0)
        {
            solve(d, bit | c, bit | l, bit | r);
        }
        (*slot)++;
    }
} // end function: solve
int main(int n, char **argv)
{
if (n <= 1 || (nn = atoi(argv[1])) <= 0) nn = 8;
qs = calloc(nn, sizeof(int));
full = (1U << nn) - 1;
solve(nn, 0, 0, 0);
printf("\nSolutions: %d\n", count);
return 0;
} // end function: main

Resources