Is there a more efficient way to achieve this:
Given an array A of size n and two positive integers a and b, find the sum floor(abs(A[i]-A[j])*a/b) taken over all pairs (i, j) where 0 <= i < j < n.
int A[n];
int a, b; // assigned some positive integer values
...
int total = 0;
for (int i = 0; i < n; i++) {
for (int j = i+1; j < n; j++) {
total += abs(A[i]-A[j])*a/b; // want integer division here
}
}
To optimize this a little, I sorted the array (O(n log n)) so that I no longer need the abs call. I also cached the value A[i] before the inner for loop, so I can read from A sequentially. I considered precomputing a/b and storing it in a float, but the extra casting just makes it slower (especially since I want to take the floor of the result).
I couldn't come up with a solution that was better than O(n^2).
Yes, there is a more efficient algorithm. It can be done in O(n*log n). I don't expect there to be an asymptotically faster way, but I'm far from any idea of a proof.
Algorithm
First sort the array in O(n*log n) time.
Now, let us look at the terms
floor((A[j]-A[i])*a/b) = floor ((A[j]*a - A[i]*a)/b)
for 0 <= i < j < n. For each 0 <= k < n, write A[k]*a = q[k]*b + r[k] with 0 <= r[k] < b.
For A[k] >= 0, we have q[k] = (A[k]*a)/b and r[k] = (A[k]*a)%b with C's integer division; for A[k] < 0, we have q[k] = (A[k]*a)/b - 1 and r[k] = b + (A[k]*a)%b, unless b divides A[k]*a, in which case q[k] = (A[k]*a)/b and r[k] = 0.
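As a quick sanity check of the sign handling (a tiny standalone example; it assumes C99's truncating division):
#include <stdio.h>

int main(void)
{
    /* A[k] = -7, a = 3, b = 5: A[k]*a = -21 = (-5)*5 + 4, so q[k] = -5 and r[k] = 4 */
    long long x = -7 * 3, b = 5;
    long long q = x / b, r = x % b;   /* truncating division gives q = -4, r = -1 */
    if (r < 0) { r += b; q -= 1; }    /* adjust to 0 <= r < b */
    printf("q=%lld r=%lld\n", q, r);  /* prints q=-5 r=4 */
    return 0;
}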
Now we rewrite the terms:
floor((A[j]*a - A[i]*a)/b) = floor(q[j] - q[i] + (r[j] - r[i])/b)
= q[j] - q[i] + floor((r[j] - r[i])/b)
Each q[k] appears k times with positive sign (for i = 0, 1, .. , k-1) and n-1-k times with negative sign (for j = k+1, k+2, ..., n-1), so its total contribution to the sum is
(k - (n-1-k))*q[k] = (2*k+1-n)*q[k]
The remainders still have to be accounted for. Now, since 0 <= r[k] < b, we have
-b < r[j] - r[i] < b
and floor((r[j]-r[i])/b) is 0 when r[j] >= r[i] and -1 when r[j] < r[i]. So
∑_{i<j} floor((A[j]-A[i])*a/b) = ∑_{k=0}^{n-1} (2*k+1-n)*q[k] - inversions(r)
where an inversion is a pair (i,j) of indices with 0 <= i < j < n and r[j] < r[i].
Calculating the q[k] and r[k] and summing the (2*k+1-n)*q[k] is done in O(n) time.
It remains to efficiently count the inversions of the r[k] array.
For each index 0 <= k < n, let c(k) be the number of i < k such that r[k] < r[i], i.e. the number of inversions in which k appears as the larger index.
Then obviously the number of inversions is ∑ c(k).
On the other hand, c(k) is the number of elements that are moved behind r[k] in a stable sort (stability is important here).
Counting these moves, and hence the inversions of an array is easy to do while merge-sorting it.
Thus the inversions can be counted in O(n*log n) too, giving an overall complexity of O(n*log n).
Code
Here is a sample implementation with a simple, unscientific benchmark (but the difference between the naive quadratic algorithm and the algorithm above is so large that an unscientific benchmark is conclusive enough).
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
long long mergesort(int *arr, unsigned elems);
long long merge(int *arr, unsigned elems, int *scratch);
long long nosort(int *arr, unsigned elems, long long a, long long b);
long long withsort(int *arr, unsigned elems, long long a, long long b);
int main(int argc, char *argv[]) {
unsigned count = (argc > 1) ? strtoul(argv[1],NULL,0) : 1000;
srand(time(NULL)+count);
long long a, b;
b = 1000 + 9000.0*rand()/(RAND_MAX+1.0);
a = b/3 + (b-b/3)*1.0*rand()/(RAND_MAX + 1.0);
int *arr1, *arr2;
arr1 = malloc(count*sizeof *arr1);
arr2 = malloc(count*sizeof *arr2);
if (!arr1 || !arr2) {
fprintf(stderr,"Allocation failed\n");
exit(EXIT_FAILURE);
}
unsigned i;
for(i = 0; i < count; ++i) {
arr1[i] = 20000.0*rand()/(RAND_MAX + 1.0) - 2000;
}
for(i = 0; i < count; ++i) {
arr2[i] = arr1[i];
}
long long res1, res2;
double start = clock();
res1 = nosort(arr1,count,a,b);
double stop = clock();
printf("Naive: %lld in %.3fs\n",res1,(stop-start)/CLOCKS_PER_SEC);
start = clock();
res2 = withsort(arr2,count,a,b);
stop = clock();
printf("Sorting: %lld in %.3fs\n",res2,(stop-start)/CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}
long long nosort(int *arr, unsigned elems, long long a, long long b) {
long long total = 0;
unsigned i, j;
long long m;
for(i = 0; i < elems-1; ++i) {
m = arr[i];
for(j = i+1; j < elems; ++j) {
long long d = (arr[j] < m) ? (m-arr[j]) : (arr[j]-m);
total += (d*a)/b;
}
}
return total;
}
long long withsort(int *arr, unsigned elems, long long a, long long b) {
    long long total = 0;
    unsigned i;
    mergesort(arr,elems);
    for(i = 0; i < elems; ++i) {
        long long q, r;
        q = (arr[i]*a)/b;
        r = (arr[i]*a)%b;
        if (r < 0) {          /* adjust so that 0 <= r < b */
            r += b;
            q -= 1;
        }
        total += (2*i+1LL-elems)*q;   /* contribution (2*k+1-n)*q[k] */
        arr[i] = (int)r;              /* reuse the array to hold the remainders */
    }
    total -= mergesort(arr,elems);    /* subtract the number of inversions of r */
    return total;
}
long long mergesort(int *arr, unsigned elems) {
    if (elems < 2) return 0;
    int *scratch = malloc((elems + 1)/2*sizeof *scratch);
    if (!scratch) {
        fprintf(stderr,"Alloc failure\n");
        exit(EXIT_FAILURE);
    }
    long long inversions = merge(arr, elems, scratch);
    free(scratch);
    return inversions;
}
long long merge(int *arr, unsigned elems, int *scratch) {
if (elems < 2) return 0;
unsigned left = (elems + 1)/2, right = elems-left, i, j, k;
long long inversions = 0;
inversions += merge(arr, left, scratch);
inversions += merge(arr+left,right,scratch);
if (arr[left] < arr[left-1]) {
for(i = 0; i < left; ++i) {
scratch[i] = arr[i];
}
i = 0; j = 0; k = 0;
int *lptr = scratch, *rptr = arr+left;
while(i < left && j < right) {
if (rptr[j] < lptr[i]) {
arr[k++] = rptr[j++];
inversions += (left-i);
} else {
arr[k++] = lptr[i++];
}
}
while(i < left) arr[k++] = lptr[i++];
}
return inversions;
}
Related
I'm trying to solve a problem on codechef, here's the link:
https://www.codechef.com/problems/KFIB
The given problem statement is:
Chef recently had been studying about Fibonacci numbers and wrote a code to print out the k-th term of the Fibonacci series (1, 1, 2, 3, 5, 8, 13….). He was wondering whether he could write a program to generate the k-th term for similar series. More specifically:
T(n, k) is 1 if n <= k and
T(n, k) = T(n-1, k) + T(n-2, k) + T(n-3, k) … + T(n-k, k) if n > k.
Given n and k, output T(n, k) % (1000000007) as the answer could be very large
Input : Two integers, N and K
Output : One integer, the nth term of the series mod 1000000007
Constraints : 1 ≤ N, K ≤ 2*10^5
example:
Input: 7 5
Output: 9
The series is as follows {1, 1, 1, 1, 1, 5, 9}
void fibo(int n, unsigned long k) {
unsigned long *a, c;
a = (unsigned long *)malloc(sizeof(unsigned long) * k);
for (unsigned long i = 0; i < k; i++) { //T(n,k)=1 when n<=k
*(a + i)=1;
}
for (unsigned long m = 0; m < n - 1; m++) {
c = *(a);
for (unsigned long j = 0; j < k - 1; j++) {
*(a + j) = *(a + j + 1);
c = c + *(a + j);
}
*(a + k - 1) = c;
}
printf("%d ", *(a) % 1000000007);
}
This works with smaller values but not with very large ones. I get the correct result for the example, but when I enter the values 200000 500, I get an incorrect answer.
The problem is that you compute the value modulo ULONG_MAX + 1 (unsigned long arithmetic wraps around) and only reduce the result modulo 1000000007 at the end. This does not give the correct result. You must reduce modulo 1000000007 at each step to avoid the wrap-around (which does not cause undefined behavior for type unsigned long, but gives a different result from the expected one).
Here is a modified version of your code with a faster alternative (more than twice as fast on my laptop):
#include <stdio.h>
#include <stdlib.h>
#define DIVIDER 1000000007ul
unsigned long fibo(unsigned long n, unsigned long k) {
unsigned long c = 1;
if (n > k) {
unsigned long *a = (unsigned long *)malloc(sizeof(*a) * k);
for (unsigned long i = 0; i < k; i++) { //T(n,k)=1 when n<=k
a[i] = 1;
}
for (unsigned long m = k; m < n; m++) {
c = a[0];
for (unsigned long j = 0; j < k - 1; j++) {
a[j] = a[j + 1];
#if 0
// slower version using modulo
c = (c + a[j]) % DIVIDER;
#else
// faster version with a test
if ((c += a[j]) >= DIVIDER)
c -= DIVIDER;
#endif
}
a[k - 1] = c;
}
free(a);
}
return c;
}
int main(int argc, char *argv[]) {
if (argc <= 2) {
printf("usage: fibo n k");
return 1;
} else {
unsigned long n = strtoul(argv[1], NULL, 10);
unsigned long k = strtoul(argv[2], NULL, 10);
printf("%lu\n", fibo(n, k));
}
return 0;
}
Output:
$ time ./fibo 200000 100000
871925546
real 0m34.667s
user 0m34.288s
sys 0m0.113s
$ time ./fibo-faster 200000 100000
871925546
real 0m15.073s
user 0m14.846s
sys 0m0.064s
Given the restrictions on input values:
the values of T(n, k) are in the range [0..1000000006] which fits in an int32_t.
the sum of k terms is in the range [0..200000*1000000006] which fits in an int64_t.
hence we can compute the next term in 64 bits and use a single modulo on the result.
This gives an even faster version (more than 3 times faster):
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define DIVIDER 1000000007
uint32_t fibo(uint32_t n, uint32_t k) {
uint32_t c = 1;
if (n > k) {
uint32_t *a = (uint32_t *)malloc(sizeof(*a) * k);
uint64_t temp;
for (uint32_t i = 0; i < k; i++) { //T(n,k)=1 when n<=k
a[i] = 1;
}
for (uint32_t m = k; m < n; m++) {
temp = a[0];
for (uint32_t j = 0; j < k - 1; j++) {
temp += a[j] = a[j + 1];
}
a[k - 1] = c = temp % DIVIDER;
}
free(a);
}
return c;
}
int main(int argc, char *argv[]) {
if (argc <= 2) {
printf("usage: fibo n k");
return 1;
} else {
uint32_t n = strtoul(argv[1], NULL, 10);
uint32_t k = strtoul(argv[2], NULL, 10);
printf("%lu\n", (unsigned long)fibo(n, k));
}
return 0;
}
Output:
$ time ./fibo-faster 200000 100000
871925546
real 0m3.854s
user 0m3.800s
sys 0m0.018s
To avoid overflow, you can change the statement below
c=c+*(a+j);
To
c=(c+*(a+j))%1000000007;
That way only the remainder is kept in your array, which won't affect the final result.
Here is the updated code, compiled with clang (updated according to #bruno's comments):
#include <stdlib.h>
#include <stdio.h>
#define DIVIDER 1000000007ul
#define U4 unsigned long
U4 fibo(U4 n,U4 k)
{
U4 *a,c ;
if(n<=k) return 1;
a= (U4*) malloc (sizeof(U4)*k);
for (U4 i=0;i<k;i++) //T(n,k)=1 when n<=k
{
*(a+i)=1;
}
for (U4 m=k;m<n; m++)
{
c=*(a);
for (U4 j=0;j<k-1;j++)
{
*(a+j)= *(a+j+1);
c=(c+*(a+j))%DIVIDER;
}
*(a+k-1)=c;
}
free(a);
return c;
}
int main(int argc, char *argv[])
{
U4 n, k;
char *endptr;
if(argc <= 2){
printf("usage: t.exe n k");
return 0;
}
n = strtoul(argv[1], &endptr, 10);
k = strtoul(argv[2], &endptr, 10);
printf("%lu", fibo(n,k));
}
Compiler command:
$ clang test.c -o test.exe
$ test.exe 200000 500
80391289
I am having some trouble vectorizing some C code using SSE vector instructions. The code I have to vectorize is
#define N 1000
void matrix_mul(int mat1[N][N], int mat2[N][N], int result[N][N])
{
int i, j, k;
for (i = 0; i < N; ++i)
{
for (j = 0; j < N; ++j)
{
for (k = 0; k < N; ++k)
{
result[i][k] += mat1[i][j] * mat2[j][k];
}
}
}
}
Here is what I got so far:
void matrix_mul_sse(int mat1[N][N], int mat2[N][N], int result[N][N])
{
int i, j, k; int* l;
__m128i v1, v2, v3;
v3 = _mm_setzero_si128();
for (i = 0; i < N; ++i)
{
for (j = 0; j < N; j += 4)
{
for (k = 0; k < N; k += 4)
{
v1 = _mm_set1_epi32(mat1[i][j]);
v2 = _mm_loadu_si128((__m128i*)&mat2[j][k]);
v3 = _mm_add_epi32(v3, _mm_mul_epi32(v1, v2));
_mm_storeu_si128((__m128i*)&result[i][k], v3);
v3 = _mm_setzero_si128();
}
}
}
}
After execution I get a wrong result. I know that the reason is the load from memory into v2. I loop through mat1 in row-major order, so I need to load mat2[0][0], mat2[1][0], mat2[2][0], mat2[3][0], ..., but what is actually loaded is mat2[0][0], mat2[0][1], mat2[0][2], mat2[0][3], ..., because mat2 is stored in memory in row-major order. I tried to fix this problem but without any improvement.
Can anyone help me please.
Below is a fixed version of your implementation:
void matrix_mul_sse(int mat1[N][N], int mat2[N][N], int result[N][N])
{
int i, j, k;
__m128i v1, v2, v3, v4;
for (i = 0; i < N; ++i)
{
for (j = 0; j < N; ++j) // 'j' must be incremented by 1
{
// read mat1 here because it does not use 'k' index
v1 = _mm_set1_epi32(mat1[i][j]);
for (k = 0; k < N; k += 4)
{
v2 = _mm_loadu_si128((const __m128i*)&mat2[j][k]);
// read what's in the result array first as we will need to add it later to our calculations
v3 = _mm_loadu_si128((const __m128i*)&result[i][k]);
// use _mm_mullo_epi32 here instead _mm_mul_epi32 and add it to the previous result
v4 = _mm_add_epi32(v3, _mm_mullo_epi32(v1, v2));
// store the result
_mm_storeu_si128((__m128i*)&result[i][k], v4);
}
}
}
}
In short _mm_mullo_epi32 (requires SSE4.1) produces 4 x int32 results as opposed to _mm_mul_epi32 which does 2 x int64 results. If you cannot use SSE4.1 then have a look at the answer here for an alternative SSE2 solution.
Full description from the Intel Intrinsics Guide:
_mm_mullo_epi32: Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store
the low 32 bits of the intermediate integers in dst.
_mm_mul_epi32: Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the
signed 64-bit results in dst.
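If it helps, here is a tiny standalone check of that difference (a sketch; it assumes SSE4.1 support, e.g. compiled with -msse4.1):
#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_setr_epi32(1, 2, 3, 4);
    __m128i b = _mm_setr_epi32(10, 20, 30, 40);
    int lo[4];
    long long wide[2];
    /* 4 x 32-bit products, keeping the low 32 bits: 10, 40, 90, 160 */
    _mm_storeu_si128((__m128i*)lo, _mm_mullo_epi32(a, b));
    /* 2 x 64-bit products of elements 0 and 2 only: 1*10 and 3*30 */
    _mm_storeu_si128((__m128i*)wide, _mm_mul_epi32(a, b));
    printf("mullo: %d %d %d %d\n", lo[0], lo[1], lo[2], lo[3]);
    printf("mul:   %lld %lld\n", wide[0], wide[1]);
    return 0;
}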
I kinda changed around your code to make the addressing explicit [ it helps in this case ].
#define N 100
This is a stub for the vector unit multiply & accumulate operation; you should be able to replace NV with whatever width your vector unit has, and put the relevant intrinsics or opcodes in here.
#define NV 8
int Vmacc(int *A, int *B) {
int i = 0;
int x = 0;
for (i = 0; i < NV; i++) {
x += *A++ * *B++;
}
return x;
}
This multiply has two notable variations from the norm:
1. It caches the columnar vector into a contiguous one.
2. It attempts to push slices of the multiply accumulate into a vector-like func.
Even without using the vector unit, this takes half the time of the naive version just because of better cache/prefetch utilization.
void mm2(int *A, int *B, int n, int *C) {
int c, r;
    int cache[N];
    for (c = 0; c < n; c++) {
        /* cache column c: */
for (r = 0; r < n; r++) {
cache[r] = B[c + r*n];
}
for (r = 0; r < n; r++) {
int k = 0;
int x = 0;
int *Av = A + r*n;
for (k = 0; k+NV-1 < n; k += NV) {
x += Vmacc(Av+k, cache+k);
}
while (k < n) {
x += Av[k] * cache[k];
k++;
}
C[r*n + c] = x;
}
}
}
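For what it's worth, a hypothetical call site could look like this; the casts work because C stores an int[N][N] contiguously in row-major order, so it can be passed as a flat int pointer:
#include <stdio.h>

int A[N][N], B[N][N], C[N][N];   /* illustrative names, reusing N = 100 from above */

int main(void)
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            A[i][j] = B[i][j] = 1;          /* all-ones test matrices */
    mm2(&A[0][0], &B[0][0], N, &C[0][0]);   /* C = A * B */
    printf("%d\n", C[0][0]);                /* every entry of C is N, i.e. 100 */
    return 0;
}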
I have spent more than 10 hours trying to sort the following (hexadecimals) with LSD radix sort, but to no avail. There is very little material on this subject on the web.
0 4c7f cd80 41fc 782c 8b74 7eb1 9a03 aa01 73f1
I know I have to mask and perform bitwise operations to process each hex digit (4 bits), but have no idea on how and where.
I'm using the code (I understand) from GeeksforGeeks
void rsort(int a[], int n) {
int max = getMax(a, n);
for (int exp = 1; max / exp > 0; exp *= 10) {
ccsort(a, n, exp);
}
}
int getMax(int a[], int n) {
int max = a[0];
int i = 0;
for (i = 0; i < n; i++) {
if (a[i] > max) {
max = a[i];
}
}
return max;
}
void ccsort(int a[], int n, int exp) {
int count[n];
int output[n];
int i = 0;
for (i = 0; i < n; i++) {
count[i] = 0;
output[i] = 0;
}
for (i = 0; i < n; i++) {
++count[(a[i] / exp) % 10];
}
for (i = 1; i <= n; i++) {
count[i] += count[i - 1];
}
for (i = n - 1; i >= 0; i--) {
output[count[(a[i] / exp) % 10] - 1] = a[i];
--count[(a[i] / exp) % 10];
}
for (i = 0; i < n; i++) {
a[i] = output[i];
}
}
I have also checked all over Stack Overflow on this matter, but none of the answers covers the details.
Your implementation of radix sort is slightly incorrect:
it cannot handle negative numbers
the array count[] in function ccsort() should have a size of 10 instead of n. If n is smaller than 10, the function does not work.
the loop for cumulating counts goes one step too far: for (i = 1; i <= n; i++). Once again the <= operator causes a bug.
you say you sort by hex digits but the code uses decimal digits.
Here is a (slightly) improved version with explanations:
void ccsort(int a[], int n, int exp) {
int count[10] = { 0 };
int output[n];
int i, last;
for (i = 0; i < n; i++) {
// compute the number of entries with any given digit at level exp
++count[(a[i] / exp) % 10];
}
for (i = last = 0; i < 10; i++) {
// update the counts to have the index of the place to dispatch the next
// number with a given digit at level exp
last += count[i];
count[i] = last - count[i];
}
for (i = 0; i < n; i++) {
// dispatch entries at the right index for its digit at level exp
output[count[(a[i] / exp) % 10]++] = a[i];
}
for (i = 0; i < n; i++) {
// copy entries batch to original array
a[i] = output[i];
}
}
int getMax(int a[], int n) {
// find the largest number in the array
int max = a[0];
for (int i = 1; i < n; i++) {
if (a[i] > max) {
max = a[i];
}
}
return max;
}
void rsort(int a[], int n) {
int max = getMax(a, n);
// for all digits required to express the maximum value
for (int exp = 1; max / exp > 0; exp *= 10) {
// sort the array on one digit at a time
ccsort(a, n, exp);
}
}
The above version is quite inefficient because of all the divisions and modulo operations. Operating on hex digits instead can be done with shifts and masks:
void ccsort16(int a[], int n, int shift) {
int count[16] = { 0 };
int output[n];
int i, last;
for (i = 0; i < n; i++) {
++count[(a[i] >> shift) & 15];
}
for (i = last = 0; i < 16; i++) {
last += count[i];
count[i] = last - count[i];
}
for (i = 0; i < n; i++) {
output[count[(a[i] >> shift) & 15]++] = a[i];
}
for (i = 0; i < n; i++) {
a[i] = output[i];
}
}
void rsort16(int a[], int n) {
int max = a[0];
for (int i = 1; i < n; i++) {
if (a[i] > max) {
max = a[i];
}
}
for (int shift = 0; (max >> shift) > 0; shift += 4) {
ccsort16(a, n, shift);
}
}
It would be approximately twice as fast to sort one byte at a time with a count array of 256 entries. It would also be faster to compute the counts for all digits in one pass, as shown in rcgldr's answer.
Note that this implementation still cannot handle negative numbers.
There's a simpler way to implement a radix sort. After checking for max, find the lowest power of 16 >= max value. This can be done with max >>= 4 in a loop, incrementing x so that when max goes to zero, then 16 to the power x is >= the original max value. For example a max of 0xffff would need 4 radix sort passes, while a max of 0xffffffff would take 8 radix sort passes.
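A minimal sketch of that pass-counting loop (the function name and x are just illustrative):
unsigned radix16_passes(unsigned max)
{
    unsigned x = 0;
    while (max > 0) {   /* shift out one hex digit per pass */
        max >>= 4;
        x++;
    }
    return x;           /* 16 to the power x >= the original max, so x base-16 passes suffice */
}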
If the range of values is most likely to take the full range available for an integer, there's no need to bother determining max value, just base the radix sort on integer size.
The example code you have shows a radix sort that scans the array backwards due to the way the counts are converted into indices. This can be avoided by using an alternate method to convert counts into indices. Here is an example of a base 256 radix sort for 32-bit unsigned integers. It uses a matrix of counts / indices so that all 4 rows of counts are generated with just one read pass of the array, followed by 4 radix sort passes (so the sorted data ends up back in the original array). std::swap is a C++ function to swap the pointers; for a C program it can be replaced by swapping the pointers inline: t = a; a = b; b = t;, where t is of type uint32_t * (pointer to unsigned 32-bit integer). For a base 16 radix sort, the matrix size would be [8][16].
// a is input array, b is working array
uint32_t * RadixSort(uint32_t * a, uint32_t *b, size_t count)
{
size_t mIndex[4][256] = {0}; // count / index matrix
size_t i,j,m,n;
uint32_t u;
for(i = 0; i < count; i++){ // generate histograms
u = a[i];
for(j = 0; j < 4; j++){
mIndex[j][(size_t)(u & 0xff)]++;
u >>= 8;
}
}
for(j = 0; j < 4; j++){ // convert to indices
m = 0;
for(i = 0; i < 256; i++){
n = mIndex[j][i];
mIndex[j][i] = m;
m += n;
}
}
for(j = 0; j < 4; j++){ // radix sort
for(i = 0; i < count; i++){ // sort by current lsb
u = a[i];
m = (size_t)(u>>(j<<3))&0xff;
b[mIndex[j][m]++] = u;
}
std::swap(a, b); // swap ptrs
}
return(a);
}
/* Assumes lst (the array to sort), tmp (a scratch array) and n (the element count)
   are defined elsewhere, e.g. as globals. */
void int_radix_sort(void) {
    int group;              // bit offset of the current 8-bit digit (0, 8, 16, 24)
    int buckets = 1 << 8;   // 256 buckets, one per possible byte value
    int map[buckets];       // starting index of each bucket in the output
    int mask = buckets - 1; // 0xff, extracts the low 8 bits after shifting
    int i;
    int cnt[buckets];       // number of keys falling into each bucket
    for (group = 0; group < 32; group += 8) {
        // 8 bits per round, 4 rounds cover a 32-bit int
        // count: how many keys have each byte value at this digit position
        for (int i = 0; i < buckets; i++) {
            cnt[i] = 0;
        }
        for (int j = 0; j < n; j++) {
            i = (lst[j] >> group) & mask;
            cnt[i]++;
            tmp[j] = lst[j];
        }
        // map: prefix sums turn the counts into starting indices
        map[0] = 0;
        for (int i = 1; i < buckets; i++) {
            map[i] = map[i - 1] + cnt[i - 1];
        }
        // move: copy each key to the next free slot of its bucket
        for (int j = 0; j < n; j++) {
            i = (tmp[j] >> group) & mask;
            lst[map[i]] = tmp[j];
            map[i]++;
        }
    }
}
After hours of researching I came across the answer. I still do not understand what is going on in this code/answer; I cannot get my head wrapped around the concept. Hopefully, someone can explain.
I see your points. I think negative numbers are easy to sort after the list has been sorted, with something like a loop, a flag, and swaps. What about unsigned floating point values? – itproxti Nov 1 '16 at 16:02
As for handling floating point values, there might be a way. For example, if 345.768 is the number, it needs to be converted to an integer, i.e. turned into 345768; I multiplied it by 1000. Just as the offset moves the negative numbers into the positive domain, multiplying by 1000, 10000, etc. turns the floats into numbers whose decimal part is all zeros. Then they can be typecast to int or long. However, with large values the whole rescaled number may not fit within the int or long range.
The number that is multiplied in has to be constant, just like the offset, so that the relationship among the magnitudes is preserved. It is better to use powers of 2 such as 8 or 16, since then the bit-shift operator can be used. However, just as the calculation of the offset takes some time, so will the calculation of the multiplier: the whole array has to be searched to find the smallest number that, when multiplied in, leaves all values with zero decimal parts.
This may not be fast to compute, but it can still do the job if required.
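A minimal sketch of that fixed-scale idea (the scale of 1000 is just the value from the example above; it has to be chosen large enough for the precision you need):
#include <stdio.h>

/* Map a double to an integer key using one fixed scale for every element,
   so the relative order of the keys matches the order of the original values.
   The keys can then be radix-sorted like any other integers. */
static long scaled_key(double v)
{
    return (long)(v * 1000.0 + 0.5);   /* rounds, for non-negative v; range and precision permitting */
}

int main(void)
{
    printf("%ld\n", scaled_key(345.768));   /* prints 345768 */
    return 0;
}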
I am trying to implement the sieve of Eratosthenes in C. The code works for small input values, but once the input goes beyond a certain range, a run-time error is thrown. This is the second problem in the classical section on SPOJ. What is the mistake?
#include<stdio.h>
#include<math.h>
int prime(unsigned long int, unsigned long int);
int main()
{
int nitem;
unsigned long int sn,fn;
scanf("%d", &nitem);
while(nitem)
{
scanf("%lu", &fn);
//printf("%d",fn);
scanf("%lu", &sn);
prime(fn, sn);
nitem--;
}
return 0;
}
int prime(unsigned long int fn, unsigned long int sn)
{
unsigned long int prim[100000];
int i,j,k;
for(i = 0; i < 100000; i++)
{
prim[i] = 1;
}
prim[0] = 0;
prim[1] = 0;
//printf("%d", sn);
//printf("%d", k);
//printf("%d", (k <= sn));
for(k = 2; k <= sqrt(sn); k++)
{
// printf("alksnc%5d", k);
if(prim[k] == 1)
{
for(j = 2; (k * j) <= sn; j++)
{
//printf("%d", prim[k]);
prim[k * j] = 0;
}
}
}
for(int i = 0; i <= sn; i++)
{
if(prim[i] !=0 && i >= fn)
{
printf("%lu\n", i);
}
}
printf("\n");
return;
}
Input:
1
100000 100345
output:
run time error
Input:
1
3 5
output:
3
5
We can make more efficient use of memory (2x) by only sieving odd numbers as all the even numbers you're processing waste time and space. It's trickier to work out but gives us something like:
#include <math.h>
#include <stdio.h>

#define MAX_ODD_PRIMES 1048576
#define TRUE 1
#define FALSE 0
void prime(unsigned long fn, unsigned long sn)
{
unsigned char primes[MAX_ODD_PRIMES];
for (unsigned long i = 0; i < MAX_ODD_PRIMES; i++)
{
primes[i] = TRUE;
}
primes[0] = 0; // preset first odd, '1'
for (unsigned long k = 3; k <= sqrt(sn) + 1; k += 2)
{
if (primes[k / 2])
{
for (unsigned long j = 3; (k * j) <= sn; j += 2)
{
primes[k * j / 2] = FALSE;
}
}
}
if (fn <= 2)
{
printf("2\n");
fn = 3;
}
for (unsigned long i = fn / 2; i * 2 + 1 <= sn; i++)
{
if (primes[i])
{
printf("%lu\n", i * 2 + 1);
}
}
}
EXAMPLE
> ./a.out
1 1999900 2000000
1999957
1999969
1999979
1999993
>
1) Array range error.
By changing the code to add a range check:
for (j = 2; (k * j) <= sn; j++) {
if (k * j >= 100000) {
printf("Out of range %d %d\n", k, j);
exit(1);
}
prim[k * j] = 0;
}
}
With input 2, 100000
Output
Out of range 2 50000
By using an array (VLA) sized to the task, this is avoided. Many other optimizations are available. Consider also a malloc() array (a sketch follows the snippet below).
void prime(unsigned long int fn, unsigned long int sn) {
unsigned long int prim[sn + 1];
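A malloc() version of the same idea might look roughly like this (a sketch; it additionally needs <stdlib.h> for malloc/free):
void prime(unsigned long int fn, unsigned long int sn) {
    unsigned long int *prim = malloc((sn + 1) * sizeof *prim);
    if (prim == NULL) {
        fprintf(stderr, "Allocation failed\n");
        return;
    }
    /* run the sieve exactly as before, indexing prim[0] through prim[sn] */
    free(prim);
}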
2) int prime() eventually performs return; where return something; is expected. Suggest changing function to void prime()
int prime(unsigned long int fn, unsigned long int sn) {
unsigned long int prim[100000];
...
printf("\n");
return;
}
I want to write a function to get a combination_n_k. And here is what I did:
int com(int n, int k)
{
int result = 1;
for( int i=n; i>(n-k); i--)
result *= i;
for( int i=k; i>1; i--)
result /= i;
return result;
}
This works fine for small numbers, but for larger numbers the result simply exceeds the maximum value of int. Is there a better way to do this?
You can get to a slightly higher bound by using unsigned long long and a bit more clever algorithm:
unsigned long long n = /* n */, k = /* k */;
unsigned long long p = 1;                     /* accumulates the result */
unsigned long long m = k < n - k ? k : n - k; /* min(k, n - k), since C(n,k) = C(n,n-k) */
k = m;                                        /* work with the smaller of the two */
unsigned long long i = n - k + 1;             /* next numerator factor */
unsigned long long j = 1;                     /* next denominator factor */
while (m-- > 0) {
    /* after t steps p equals C(n-k+t, t), so p * i is always divisible by j */
    p = p * i++ / j++;
}
If this is still not enough, then you will have to use a bigint library.
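For reference, here is the same idea wrapped into a complete function, as a sketch (the name com_ull is just an illustrative choice):
#include <stdio.h>

/* Sketch: C(n, k) using the incremental multiply-then-divide trick above.
   It still overflows once the true result no longer fits in unsigned long long. */
unsigned long long com_ull(unsigned long long n, unsigned long long k)
{
    if (k > n) return 0;
    unsigned long long m = k < n - k ? k : n - k;
    unsigned long long p = 1;
    unsigned long long i = n - m + 1;
    unsigned long long j = 1;
    while (m-- > 0) {
        p = p * i++ / j++;   /* exact at every step, as explained above */
    }
    return p;
}

int main(void)
{
    printf("%llu\n", com_ull(52, 5));   /* prints 2598960 */
    return 0;
}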