systick example on cortex m3 not working - c

I am trying to measure the time elapsed during a function but I am not sure if I am doing it correctly. When I stop the systick its value returned is 24 which is not possible.
#define SYSTICKS 0xFFFFFFu
static volatile uint32_t Count = 0;
void SysTick_Handler(void)
{
Count++;
}
int doSomething()
{
int a = 0;
int b = 8;
return a + b;
}
int main(int argc, char *argv[])
SysTick_Config(SYSTICKS);
volatile uint64_t a = (uint64_t)SYSTICKS - SysTick->VAL;
a += ((uint64_t)SYSTICKS * Count);
/* some code here */
doSomething();
volatile uint64_t b = (uint64_t)SYSTICKS - SysTick->VAL;
b += ((uint64_t)SYSTICKS * Count);
volatile uint64_t timeElapsed = a - b;
printf("a = %lld\r\n", a);
printf("b = %lld\r\n", b);
printf("timeElapsed = %lld\n", timeElapsed);
And here is the output
a = 16777215
b = 24
timeElapsed = 16777191

Related

Floating point exception (core dumped) for Linear Congruential Generator

I'm finishing up my header file for a Linear Congruential Generator (LCG) based cipher program. It receives two unsigned long values (m and c) and generates the LCG struct using these values. In my getA() I'm trying to have it add the temp variable to the uPrimes array (in case it cannot be reduced to 1, so it can be included when calculating p) but I keep receiving the error: "Floating point exception (core dumped)". It will run to completion, but does not perform as it needs to, if I do not try to do this. In this latest iteration I attempted to assign the first value of the uPrimes array as 1, and once the while loop finishes it assigns the value of temp to the first value, with the same result. Any help would be greatly appreciated! (I apologize ahead of time for the mismatched variable declarations, I was toying around with the data types haphazardly to try and solve the problem)
/* Header guard prevents errors if header is included twice */
#ifndef LCG_H
#define LCG_H
#include <stdlib.h>
struct LinearCongruentialGenerator
{
unsigned long m; /* modulus */
unsigned long c; /* increment */
unsigned long a; /* multiplier */
unsigned long x; /* value in sequence */
};
/***************************************************************/
/* Initialize an LCG with modulus m and increment c. */
/* Calculate multiplier a such that: */
/* a = 1+2p, if 4 is a factor of m, otherwise, a = 1+p. */
/* p = (product of m’s unique prime factors). */
/* a < m */
/* Seed value x is same as increment c. */
/* If values are invalid for LCG, set all fields to zero. */
/***************************************************************/
struct LinearCongruentialGenerator makeLCG(unsigned long m, unsigned long c);
/* Update lcg and return next value in the sequence. */
unsigned long getNextRandomValue(struct LinearCongruentialGenerator* lcg);
unsigned long getA(unsigned long m);
int checkInput(unsigned long m, unsigned long c);
struct LinearCongruentialGenerator makeLCG(unsigned long m, unsigned long c)
{
struct LinearCongruentialGenerator lcg;
if(checkInput(m,c) && (getA(m)<m && getA(m)>0))
{
lcg.m = m;
lcg.c = c;
lcg.a = getA(m);
lcg.x = c;
} else
{
lcg.m = 0; lcg.c = 0;
lcg.a = 0; lcg.x = 0;
}
return lcg;
}
unsigned long getNextRandomValue(struct LinearCongruentialGenerator* lcg)
{
lcg->x = ((lcg->a*lcg->x)+lcg->c)%lcg->m;
return lcg->x;
}
unsigned long getA(unsigned long m)
{
unsigned long primes[15] = {2,3,5,7,11,13,17,19,23,29,31,37,41,43,47};
int uPrimes[63];
unsigned long temp = m; int y = 0; int p = 1;
int prev; int z; unsigned long a; int q = 1;
uPrimes[0] = 1;
while(y < 16)
{
if(temp % primes[y] == 0)
{
if(primes[y] != prev)
{
uPrimes[q] = primes[y];
prev = primes[y];
q++; y++;
printf("Unique Prime for %lu is: %d\n", m, uPrimes[q-1]);
} else temp = temp/primes[y];
} else y++;
}
uPrimes[0] = temp;
for(z = 0; z < q; z++)
{
p = p * uPrimes[z];
printf("P for %lu is %d\n", m, p);
}
if(m % 4 == 0){a = 1+(2*p);}
else a = 1+p;
if(a < m && a > 0){return a;}
else return 0;
}
int checkInput(unsigned long m, unsigned long c)
{
int x = 2;
if(c > m || c <= 0){return 0;}
else
{
while(x < c)
{
if(m % x == 0 && c % x == 0)
{return 0;}
else x++;
}
return 1;
}
}
#endif
This is the test file I've been given to verify my header file works as needed:
#include <stdio.h>
#include "lcg.h"
/* Print LCG values along with a message */
void printLCG(struct LinearCongruentialGenerator* lcg, char* msg)
{
printf("%s (m=%lu,a=%lu,c=%lu,x=%lu)\n", msg, lcg->m, lcg->a, lcg->c, lcg->x);
}
/* Print message and n values generated by the LCG */
void testValues(struct LinearCongruentialGenerator* lcg, char* msg, int n)
{
int i;
printf("%s\n", msg);
for(i = 0; i < n; ++i)
{
unsigned long x = getNextRandomValue(lcg);
printf("%lu\n", x);
}
}
/* Create and test a few LCGs */
int main()
{
struct LinearCongruentialGenerator lcg1 = makeLCG(126,25);
struct LinearCongruentialGenerator lcg2 = makeLCG(38875,1234);
struct LinearCongruentialGenerator lcg3 = makeLCG(4611686018427387904,961168601842738797);
/* Some error cases */
struct LinearCongruentialGenerator lcg4 = makeLCG(4,3);
struct LinearCongruentialGenerator lcg5 = makeLCG(0,5);
struct LinearCongruentialGenerator lcg6 = makeLCG(5,0);
printLCG(&lcg1, "initialized lcg1");
printLCG(&lcg2, "initialized lcg2");
printLCG(&lcg3, "initialized lcg3");
printLCG(&lcg4, "initialized error test lcg4");
printLCG(&lcg5, "initialized error test lcg5");
printLCG(&lcg6, "initialized error test lcg6");
testValues(&lcg1, "test lcg1", 10);
testValues(&lcg2, "test lcg2", 10);
testValues(&lcg3, "test lcg3", 10);
printLCG(&lcg1, "lcg1 after first test");
printLCG(&lcg2, "lcg2 after first test");
printLCG(&lcg3, "lcg3 after first test");
testValues(&lcg1, "test lcg1 again", 20);
printLCG(&lcg1, "lcg1 after second test");
return 0;
}
At least this problem, primes[15] does not exist. The 15 primes[0] ... primes[15-1]
do exist.
Code may have then attempted %0 resulting in "Floating point exception (core dumped)". For various reasons an integer div by 0 or remainder reports as a FP error.
unsigned long primes[15] = { ... }
while(y < 16) {
if(temp % primes[y] == 0)
Suggest while(y < 15) {
After cleaning up (removing all warnings made by clang -Weverything), repairing the off-by-one found by #chux, and adding a bit more tracking I got this
#include <stdlib.h>
#include <stdio.h>
struct LinearCongruentialGenerator {
unsigned long m; /* modulus */
unsigned long c; /* increment */
unsigned long a; /* multiplier */
unsigned long x; /* value in sequence */
};
/***************************************************************/
/* Initialize an LCG with modulus m and increment c. */
/* Calculate multiplier a such that: */
/* a = 1+2p, if 4 is a factor of m, otherwise, a = 1+p. */
/* p = (product of m’s unique prime factors). */
/* a < m */
/* Seed value x is same as increment c. */
/* If values are invalid for LCG, set all fields to zero. */
/***************************************************************/
struct LinearCongruentialGenerator makeLCG(unsigned long m, unsigned long c);
/* Update lcg and return next value in the sequence. */
unsigned long getNextRandomValue(struct LinearCongruentialGenerator *lcg);
unsigned long getA(unsigned long m);
int checkInput(unsigned long m, unsigned long c);
void printLCG(struct LinearCongruentialGenerator *lcg, char *msg);
void testValues(struct LinearCongruentialGenerator *lcg, char *msg, int n);
struct LinearCongruentialGenerator makeLCG(unsigned long m, unsigned long c)
{
struct LinearCongruentialGenerator lcg;
if (checkInput(m, c) && (getA(m) < m && getA(m) > 0)) {
lcg.m = m;
lcg.c = c;
lcg.a = getA(m);
lcg.x = c;
} else {
lcg.m = 0;
lcg.c = 0;
lcg.a = 0;
lcg.x = 0;
}
return lcg;
}
unsigned long getNextRandomValue(struct LinearCongruentialGenerator *lcg)
{
lcg->x = lcg->a * lcg->x;
lcg->x = lcg->x + lcg->c;
lcg->x = lcg->x % lcg->m;
return lcg->x;
}
unsigned long getA(unsigned long m)
{
unsigned long primes[15] =
{ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47 };
unsigned long uPrimes[63];
unsigned long temp = m;
unsigned long y = 0;
unsigned long p = 1;
unsigned long prev = 0;
unsigned long z;
unsigned long a;
unsigned long q = 1;
uPrimes[0] = 1;
while (y < 15) {
if (temp % primes[y] == 0) {
if (primes[y] != prev) {
uPrimes[q] = primes[y];
prev = primes[y];
q++;
y++;
printf("Unique Prime for %lu is: %lu\n", m, uPrimes[q - 1]);
} else
temp = temp / primes[y];
} else
y++;
}
uPrimes[0] = temp;
printf("q = %lu\n", q);
for (z = 0; z < q; z++) {
p = p * uPrimes[z];
printf("P for %lu is %lu\n", m, p);
}
if (m % 4 == 0) {
a = 1 + (2 * p);
} else {
a = 1 + p;
}
printf("getA(m): m = %lu, a = %lu\n", m, a);
if (a < m && a > 0) {
return a;
} else {
return 0;
}
}
int checkInput(unsigned long m, unsigned long c)
{
unsigned long x = 2;
printf("checkInput(m = %lu, c = %lu)\n", m, c);
if (c > m || c <= 0) {
return 0;
} else {
while (x < c) {
if (m % x == 0 && c % x == 0) {
return 0;
} else
x++;
}
return 1;
}
}
/* Print LCG values along with a message */
void printLCG(struct LinearCongruentialGenerator *lcg, char *msg)
{
printf("%s (m=%lu,a=%lu,c=%lu,x=%lu)\n", msg, lcg->m, lcg->a, lcg->c, lcg->x);
}
/* Print message and n values generated by the LCG */
void testValues(struct LinearCongruentialGenerator *lcg, char *msg, int n)
{
int i;
printf("%s\n", msg);
for (i = 0; i < n; ++i) {
unsigned long x = getNextRandomValue(lcg);
printf("%lu\n", x);
}
}
/* Create and test a few LCGs */
int main(void)
{
puts("makeLCG(126,25)");
struct LinearCongruentialGenerator lcg1 = makeLCG(126, 25);
puts("makeLCG(38875,1234)");
struct LinearCongruentialGenerator lcg2 = makeLCG(38875, 1234);
//puts("makeLCG(4611686018427387904,961168601842738797)");
//struct LinearCongruentialGenerator lcg3 = makeLCG(4611686018427387904,961168601842738797);
/* Some error cases */
struct LinearCongruentialGenerator lcg4 = makeLCG(4, 3);
struct LinearCongruentialGenerator lcg5 = makeLCG(0, 5);
struct LinearCongruentialGenerator lcg6 = makeLCG(5, 0);
printLCG(&lcg1, "initialized lcg1");
printLCG(&lcg2, "initialized lcg2");
//printLCG(&lcg3, "initialized lcg3");
printLCG(&lcg4, "initialized error test lcg4");
printLCG(&lcg5, "initialized error test lcg5");
printLCG(&lcg6, "initialized error test lcg6");
testValues(&lcg1, "test lcg1", 10);
testValues(&lcg2, "test lcg2", 10);
//testValues(&lcg3, "test lcg3", 10);
printLCG(&lcg1, "lcg1 after first test");
printLCG(&lcg2, "lcg2 after first test");
//printLCG(&lcg3, "lcg3 after first test");
testValues(&lcg1, "test lcg1 again", 20);
printLCG(&lcg1, "lcg1 after second test");
return 0;
}
Compiling with clang -g3 -O3 -Weverything -std=c11 lcg.c -o lcg and running it in the debugger (gdb) gives (here!):
makeLCG(126,25)
checkInput(m = 126, c = 25)
Unique Prime for 126 is: 2
Unique Prime for 126 is: 3
Unique Prime for 126 is: 7
q = 4
P for 126 is 126
P for 126 is 252
P for 126 is 756
P for 126 is 5292
getA(m): m = 126, a = 5293
Unique Prime for 126 is: 2
Unique Prime for 126 is: 3
Unique Prime for 126 is: 7
q = 4
P for 126 is 126
P for 126 is 252
P for 126 is 756
P for 126 is 5292
getA(m): m = 126, a = 5293
makeLCG(38875,1234)
checkInput(m = 38875, c = 1234)
Unique Prime for 38875 is: 5
q = 2
P for 38875 is 38875
P for 38875 is 194375
getA(m): m = 38875, a = 194376
Unique Prime for 38875 is: 5
q = 2
P for 38875 is 38875
P for 38875 is 194375
getA(m): m = 38875, a = 194376
checkInput(m = 4, c = 3)
Unique Prime for 4 is: 2
q = 2
P for 4 is 4
P for 4 is 8
getA(m): m = 4, a = 17
Unique Prime for 4 is: 2
q = 2
P for 4 is 4
P for 4 is 8
getA(m): m = 4, a = 17
checkInput(m = 0, c = 5)
checkInput(m = 5, c = 0)
initialized lcg1 (m=0,a=0,c=0,x=0)
initialized lcg2 (m=0,a=0,c=0,x=0)
initialized error test lcg4 (m=0,a=0,c=0,x=0)
initialized error test lcg5 (m=0,a=0,c=0,x=0)
initialized error test lcg6 (m=0,a=0,c=0,x=0)
test lcg1
Program received signal SIGFPE, Arithmetic exception.
0x0000000000400b1b in getNextRandomValue (lcg=<optimized out>) at lcg.c:54
54 lcg->x = lcg->x % lcg->m;
A division by zero because getA(m) returns always zero because the calculated value in a is always bigger than m and that zero from getA() causes makeLCG() to call the second branch that sets all values in the struct to zero.

Hash value from C function

I have a problem and hope you can help me.
I have a function written in C that returns hash a value. My
headache is when I execute the program from another tool it takes a lot of time to run, probably because inside my function I run a command that hashes my value in SHA256, so I would like to know if there is another way to do it, maybe a function or something like that.
Here is what I have:
const char *EncryptSHA256 (char *Arg1) {
char command[128];
char result[512];
//I want to replace from here
snprintf(command, sizeof command, "echo -n %s | sha256sum | cut -c1-64",Arg1);
FILE *fpipe;
if (0 == (fpipe = (FILE*)popen(command, "r"))) {
perror("popen() failed.");
exit(1);
}
fread(result, 1, 512, fpipe);
pclose(fpipe);
const char *sha256 = &result[0];
//to here
return sha256;
}
Your code has undefined behavior because you return a pointer to result, a local array with automatic storage. Reading from this array by the caller has undefined behavior.
You should at least make result static so its contents remain readable after EncryptSHA256 returns to its caller.
Regarding the inefficiency of the method, here is a public domain implementation of SHA256 that you can use directly inside your program:
/* public domain sha256 implementation based on fips180-3 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Public API */
struct sha256 {
uint64_t len; /* processed message length */
uint32_t h[8]; /* hash state */
uint8_t buf[64]; /* message block buffer */
};
/* reset state */
void sha256_init(struct sha256 *s);
/* process message */
void sha256_update(struct sha256 *s, const void *m, size_t len);
/* get message digest */
/* state is ruined after sum, keep a copy if multiple sum is needed */
/* part of the message might be left in s, zero it if secrecy is needed */
void sha256_sum(struct sha256 *s, uint8_t md[32]);
/* Implementation */
static uint32_t ror(uint32_t n, int k) {
return (n >> k) | (n << (32 - k));
}
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & y) | (z & (x | y)))
#define S0(x) (ror(x,2) ^ ror(x,13) ^ ror(x,22))
#define S1(x) (ror(x,6) ^ ror(x,11) ^ ror(x,25))
#define R0(x) (ror(x,7) ^ ror(x,18) ^ (x>>3))
#define R1(x) (ror(x,17) ^ ror(x,19) ^ (x>>10))
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void processblock(struct sha256 *s, const uint8_t *buf) {
uint32_t W[64], t1, t2, a, b, c, d, e, f, g, h;
int i;
for (i = 0; i < 16; i++) {
W[i] = (uint32_t)buf[4 * i + 0] << 24;
W[i] |= (uint32_t)buf[4 * i + 1] << 16;
W[i] |= (uint32_t)buf[4 * i + 2] << 8;
W[i] |= buf[4 * i + 3];
}
for (; i < 64; i++)
W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
a = s->h[0];
b = s->h[1];
c = s->h[2];
d = s->h[3];
e = s->h[4];
f = s->h[5];
g = s->h[6];
h = s->h[7];
#define ROUND(a,b,c,d,e,f,g,h,i) \
t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]; \
t2 = S0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2;
for (i = 0; i < 64; ) {
ROUND(a, b, c, d, e, f, g, h, i); i++;
ROUND(h, a, b, c, d, e, f, g, i); i++;
ROUND(g, h, a, b, c, d, e, f, i); i++;
ROUND(f, g, h, a, b, c, d, e, i); i++;
ROUND(e, f, g, h, a, b, c, d, i); i++;
ROUND(d, e, f, g, h, a, b, c, i); i++;
ROUND(c, d, e, f, g, h, a, b, i); i++;
ROUND(b, c, d, e, f, g, h, a, i); i++;
}
#undef ROUND
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
static void pad(struct sha256 *s) {
unsigned r = s->len % 64;
s->buf[r++] = 0x80;
if (r > 56) {
memset(s->buf + r, 0, 64 - r);
r = 0;
processblock(s, s->buf);
}
memset(s->buf + r, 0, 56 - r);
s->len *= 8;
s->buf[56] = s->len >> 56;
s->buf[57] = s->len >> 48;
s->buf[58] = s->len >> 40;
s->buf[59] = s->len >> 32;
s->buf[60] = s->len >> 24;
s->buf[61] = s->len >> 16;
s->buf[62] = s->len >> 8;
s->buf[63] = s->len;
processblock(s, s->buf);
}
void sha256_init(struct sha256 *s) {
s->len = 0;
s->h[0] = 0x6a09e667;
s->h[1] = 0xbb67ae85;
s->h[2] = 0x3c6ef372;
s->h[3] = 0xa54ff53a;
s->h[4] = 0x510e527f;
s->h[5] = 0x9b05688c;
s->h[6] = 0x1f83d9ab;
s->h[7] = 0x5be0cd19;
}
void sha256_sum(struct sha256 *s, uint8_t md[20]) {
int i;
pad(s);
for (i = 0; i < 8; i++) {
md[4 * i + 0] = s->h[i] >> 24;
md[4 * i + 1] = s->h[i] >> 16;
md[4 * i + 2] = s->h[i] >> 8;
md[4 * i + 3] = s->h[i];
}
}
void sha256_update(struct sha256 *s, const void *m, unsigned long len) {
const uint8_t *p = m;
unsigned r = s->len % 64;
s->len += len;
if (r) {
if (len < 64 - r) {
memcpy(s->buf + r, p, len);
return;
}
memcpy(s->buf + r, p, 64 - r);
len -= 64 - r;
p += 64 - r;
processblock(s, s->buf);
}
for (; len >= 64; len -= 64, p += 64)
processblock(s, p);
memcpy(s->buf, p, len);
}
You would change your function to this:
const char *EncryptSHA256(char *Arg1) {
struct sha256 s;
unsigned char md[32];
static char result[65];
sha256_init(&s);
sha256_update(&s, Arg1, strlen(Arg1));
sha256_sum(&s, md);
for (int i = 0; i < 32; i++) {
sprintf(result + i * 2, "%02x", md[i]);
}
return result;
}
You could also change the API to pass an array of 32 unsigned characters to get the binary form if it is more convenient.

Hexadecimal comparison for big number in c program

#include<stdio.h>
long long int A = 0xAABBCCDDBBCCFFEE;
long long int B = 0xAACCCCDDBBDDFF00;
int array[8] = {'\0'};
int byte_compare(int A, int B)
{
int i = 0;
long long int j = 0xFF;
long long int C = A;
long long int D = B;
while(i < 8)
{
C = (j & A);
D = (j & B);
printf("C = %x\n",C);
printf("D = %x\n",D);
printf("j = %x\n",j);
if(C == D)
{
array[i] = 1;
}
else
{
array[i] = 0;
}
i++;
j = j << 8;
}
}
main()
{
int i = 0;
byte_compare(A,B);
while(i < 8)
{
printf("array[%d] - %d\n",i, array[i]);
i++;
}
}
long long int A = 0xAABBCCDDBBCCFFEE;
long long int B = 0xAACCCCDDBBDDFF00;
result = 10111010
when A and B byte number matches with each other it should print 1 otherwise 0.
for my above program it is printing output
C = ee
D = 0
j = ff
C = ff00
D = ff00
j = ff00
C = cc0000
D = dd0000
j = ff0000
C = bb000000
D = bb000000
j = ff000000
C = 0
D = 0
j = 0
C = 0
D = 0
j = 0
C = 0
D = 0
j = 0
C = 0
D = 0
j = 0
array[0] - 0
array[1] - 1
array[2] - 0
array[3] - 1
array[4] - 1
array[5] - 1
array[6] - 1
array[7] - 1
Not able to compare after first four byte please help me what datatype I should use instead above.
You have many problems in your code
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
void byte_compare(uint64_t C, uint64_t D, int *array)
{
int i = 0;
while (i < 8)
{
printf("C = %"PRIx64"\n" , (C&0xFF));
printf("D = %"PRIx64"\n\n", (D&0xFF));
if ((C&0xFF) == (D&0xFF))
{
array[i] = 1;
}
else
{
array[i] = 0;
}
i++;
C >>= 8;
D >>= 8;
}
}
int main(void)
{
int i = 0;
uint64_t A = UINT64_C(0xAABBCCDDBBCCFFEE);
uint64_t B = UINT64_C(0xAACCCCDDBBDDFF00);
int array[8] = {0};
byte_compare(A, B, array);
while (i < 8)
{
printf("array[%d] - %d\n", i, array[i]);
i++;
}
}
Correct type for your variables must be unsigned long long due to the literals: bit 63 is set to 1 so assign it to a signed variable causes overflow which invoke UB. Using stdint.h standard header you can use uint64_t that is clearer.
byte_compare function definition is wrong, you must pass parameters with correct type: unsigned long long (uint64_t). Moreover the function does not return an int so should be void.
printf with %x format specifier wants an unsigned int, not a uint64_t. Best thing to do is to use defined in inttypes.h standard header where all format specifier are well reported: in your case PRIx64 is the right one.
In your version of code you are left shifting a long long int that is used as mask to "select" the byte to check. It is plain wrong because last shift causes an overflow on a signed value that is UB as for point 1.
stdint.h has also macros that append the correct suffix to literals.
Another example using a mask:
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
void byte_compare(uint64_t C, uint64_t D, int *array)
{
uint64_t mask = UINT64_C(0xFF00000000000000);
while (mask > 0)
{
printf("C = %"PRIx64"\n" , (C&mask));
printf("D = %"PRIx64"\n\n", (D&mask));
if ((C&mask) == (D&mask))
{
*array = 1;
}
else
{
*array = 0;
}
mask >>= 8;
array--;
}
}
int main(void)
{
int i = 0;
uint64_t A = UINT64_C(0xAABBCCDDBBCCFFEE);
uint64_t B = UINT64_C(0xAACCCCDDBBDDFF00);
int array[8] = {0};
byte_compare(A, B, &array[7]);
while (i < 8)
{
printf("array[%d] - %d\n", i, array[i]);
i++;
}
}

Fermat Primality Test failure in C

#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#define MAXNUM 2000000000
#define MINNUM 1990000001
#define MAXTRIES 10
unsigned long long b, e, m, result;
int modulo(b, e, m)
{
result = 1;
while(e > 0)
{
if(e % 2 == 1)
{
result = (result * b);
}
b = (b * b) % m;
e = e / 2;
}
return result % m;
}
int isPrime(n)
{
unsigned long long a;
int i;
for(i = 1; i <= 10; i++)
{
a = rand() % (n - 1) + 1;
if(modulo(a, n - 1, n) != 1)
{
return 0;
}
}
return 1;
}
int main()
{
unsigned int prime = 0;
unsigned int flag = 0;
unsigned int tries;
unsigned int start;
long curtime;
unsigned long long p;
curtime = time(NULL);
srand((unsigned int) curtime);
printf("Checking range [1990000001, 2000000000] for prime numbers.\n");
if(MINNUM % 2 == 0)
{
start = MINNUM + 1;
}
else
{
start = MINNUM;
}
printf("Trying Fermat test with seed %ld \n\n",curtime);
prime = 0;
for(tries = 1; tries <= MAXTRIES; tries++)
{
clock_t tic = clock();
for(p = start; p <= MAXNUM; p += 2)
{
if(isPrime(p))
prime++;
}
clock_t toc = clock();
printf("Probabilistic algorithm: Found %ld primes in %f seconds.(tries = %d)\n", prime, (double)(toc - tic) / CLOCKS_PER_SEC,tries);
prime = 0;
}
return 0;
}
So the problem is that the algorithm finds in every try 5000000 prime numbers when it should find around 466646 with some deviation. Which means that in every try it should find a number of primes close to the one mentioned above.
It looks like the main problem is caused by integer overflows in the modulo() function. Specifically, result=(result*b) is going to overflow quite regularly. You need to store these variables in 64-bit unsigned integers, and calculate the modulus of this result every time.
This will work (with a few minor corrections elsewhere):
#include <inttypes.h>
#define MAXNUM 2000000000
#define MINNUM 1990000001
#define MAXTRIES 10
uint64_t modulo(uint64_t b, uint64_t e, uint64_t m){
uint64_t result=1;
while(e>0){
if(e%2==1){
result=(result*b)%m;
}
b=(b*b)%m;
e=e/2;
}
return result%m;
}
Result:
Checking range [1990000001, 2000000000] for prime numbers.
Trying Fermat test with seed 1416322197
Probabilistic algorithm: Found 466646 primes in 5.157485 seconds.(tries=1)

CUDA reduction to find the maximum of an array

I am doing the Udacity course on parallel programming (homework 3) and can not figure out why I can't get the maximum in the array using parallel reduction (Udacity forums yet to provide solution). I am pretty certain that I have set up the arrays properly and that the algorithm is correct. I suspect that I have a problem with memory management (accessing out of bounds, incorrect array sizes, copying to and from). Please help! I am running this in the Udacity environment, not locally. Below is the code that I am currently using. For some reason when I change the fmaxf's to fminf's it does find the minimum.
#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
extern __shared__ float temp[];
if (myId < size) {
temp[tid] = d_logLum[myId];
}
else {
temp[tid] = d_logLum[tid];
}
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
if (myId < size) {
temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
} else {
temp[tid] = d_logLum[tid];
}
}
__syncthreads();
}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
__global__ void reduce_max_kernel2(float *d_out, float *d_in) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if (tid < s) {
d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
}
__syncthreads();
}
if (tid == 0) {
d_out[0] = d_in[0];
}
}
void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
//int size = 1 << 18;
int points = numRows * numCols;
int logPoints = ceil(log(points)/log(2));
int sizePow = logPoints;
int size = pow(2, sizePow);
int numThreads = 1024;
int numBlocks = size / numThreads;
float *d_out;
float *d_max_out;
checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));
cudaDeviceSynchronize();
reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);
cudaDeviceSynchronize();
reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);
float h_out_max;
checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));
printf("%f\n", h_out_max);
checkCudaErrors(cudaFree(d_max_out));
checkCudaErrors(cudaFree(d_out));
}
You are trying to reproduce the reduce2 reduction kernel of the CUDA SDK reduction sample. Robert Crovella has already spot two mistakes that you have made in your code. Besides them, I think you are also mistakenly initializing the shared memory.
Below, please find a complete working example constructed around your attempt. I have left the wrong instructions of your approach.
#include <thrust\device_vector.h>
#define BLOCKSIZE 256
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
int tid = threadIdx.x; // Local thread index
int myId = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
extern __shared__ float temp[];
// --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;
// --- Your solution
// if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Your solution
//for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
// if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
// __syncthreads();
//}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10;
thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;
int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
int NumBlocks = (N + NumThreads - 1) / NumThreads;
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);
// --- reduce2
thrust::device_vector<float> d_vec_block(NumBlocks);
reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
thrust::host_vector<float> h_vec_block(d_vec_block);
float result_reduce0 = -FLT_MAX;
for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
printf("Result = %f\n",result_reduce0);
}

Resources