I have an ANSI C function that sums up values from an array based on patterns. Something like:
long sum_all_according_to_pattern(int n, int *values, int *pattern)
{
long sum = 0;
int i = 0;
for(;i<n;i++){
if(pattern[i])
sum+=values[i];
}
return sum;
}
Let's say I have a set of patterns such as:
Pattern 1: 1,1,1,1
Pattern 2: 1,1,0,0
Pattern 3: 1,0,0,1
I need to generate specific code for each pattern, without the loop and the if. For the previous patterns, it would be:
long sum_according_to_pattern_1(int *values)
{
return values[0]+values[1]+values[2]+values[3];
}
long sum_according_to_pattern_2(int *values)
{
return values[0]+values[1];
}
long sum_according_to_pattern_3(int *values)
{
return values[0]+values[3];
}
or even
long sum_according_to_pattern_1(int *values)
{
long sum = 0;
sum+=values[0];
sum+=values[1];
sum+=values[2];
sum+=values[3];
return sum;
}
long sum_according_to_pattern_2(int *values)
{
long sum = 0;
sum+=values[0];
sum+=values[1];
return sum;
}
long sum_according_to_pattern_3(int *values)
{
long sum = 0;
sum+=values[0];
sum+=values[3];
return sum;
}
Now, suppose that such patterns can be much larger than only 4 elements. Also, suppose I have many more patterns than these 3.
My question is: is there some way to achieve that using only ANSI C constructs? As I'm trying to keep everything contained, I don't want to write a script to generate the code for me. What I need is to specify the pattern using something like a bitmap macro and then generate the function at compile time.
The way I would do it is with a macro that defines all the patterns you want, combined with other macros that define the functions or other info you need about them. So you would have something like:
#define FUNCTION_PATTERNS(M) \
M(1, 0xf) \
M(2, 0x3) \
M(3, 0x9)
#define DEFINE_SUM_FUNCTION(NUM, PATTERN) \
long sum_according_to_pattern_##NUM(int *values) { \
    long sum = 0; \
    for (int i = 0; 1UL << i <= PATTERN; i++) \
        if (PATTERN & (1UL << i)) sum += values[i]; \
    return sum; \
}
#define SUM_FUNCTION_NAME(NUM, PATTERN) sum_according_to_pattern_##NUM,
Now you can easily declare all the functions and build a table of pointers to them:
FUNCTION_PATTERNS(DEFINE_SUM_FUNCTION)
long (*sum_functions[])(int *) = { FUNCTION_PATTERNS(SUM_FUNCTION_NAME) };
if you want, you can manually unroll the loop in the DEFINE_SUM_FUNCTION macro, or you can rely on your C compiler to do it for you, possibly with an appropriate pragma or compile-time flag.
Note that the above will only work up to 32 or 64 elements (depending on architecture). If you want more, you'll have to split the patterns into multiple values.
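Assuming the definitions above, a minimal usage sketch (the values array is made up):
#include <stdio.h>

int main(void)
{
    int values[4] = { 1, 2, 3, 4 };

    /* direct call to one generated function: pattern 2 is 0x3 */
    printf("%ld\n", sum_according_to_pattern_2(values));  /* 1+2 = 3 */

    /* call through the pointer table; index 2 holds pattern 3 (0x9) */
    printf("%ld\n", sum_functions[2](values));            /* 1+4 = 5 */

    return 0;
}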
Extending Chris Dodd's approach.
I think you can generate exactly what you describe by using a list of symbols for the patterns. So, starting with the same X-macro setup:
#define PATTERNS(_) \
_(1, A B C D) \
_(2, A B) \
_(3, A D) \
/**/
#define A sum += values[0];
#define B sum += values[1];
#define C sum += values[2];
#define D sum += values[3];
#define GEN_FUNC(num, pattern) \
long sum_according_to_pattern ## num (int *values) { \
long sum = 0; \
pattern \
return sum; \
}
PATTERNS(GEN_FUNC)
Running through cpp -P genpat.c | indent -gnu -i4 -br -ce -cdw -nbc -brf -brs -l100 -bbo yields
long
sum_according_to_pattern1 (int *values) {
long sum = 0;
sum += values[0];
sum += values[1];
sum += values[2];
sum += values[3];
return sum;
}
long
sum_according_to_pattern2 (int *values) {
long sum = 0;
sum += values[0];
sum += values[1];
return sum;
}
long
sum_according_to_pattern3 (int *values) {
long sum = 0;
sum += values[0];
sum += values[3];
return sum;
}
You could also generate the shorter form.
#define PATTERNS(_) \
_(1, A B C D) \
_(2, A B) \
_(3, A D) \
/**/
#define A + values[0]
#define B + values[1]
#define C + values[2]
#define D + values[3]
#define GEN_FUNC(num, pattern) \
long sum_according_to_pattern ## num (int *values) { \
return pattern ;\
}
PATTERNS(GEN_FUNC)
You almost certainly want to #undef A .. D afterward. :)
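That is:
#undef A
#undef B
#undef C
#undef D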
I'm trying to convert one of my methods into a multi-line define directive to implement the solution in this question: Can you use #defines like method parameters in HLSL?. However, I'm not having much success with it, and am getting difficult-to-parse errors.
Here is what I've done:
float Random2D(float x, float y, int seed = 0)
{
return (frac(sin(fmod(dot(float2(x, y), float2(12.9898,78.233)), 6.2831853)) * (43758.5453 + seed))) * 1;
}
#define BASE_NOISE(x, y, seed) Random2D(x, y, seed)
#define FBMNoiseSampler(x, y, octaves, persistance, lacunarity, seed) \
{ \
float value = 0; \
float amplitude = 1; \
float frequency = 1; \
\
float maxVal = 0; \
float minVal = 0; \
\
for (uint i = 0; i < octaves; i++) \
{ \
float2 sample = float2(x , y) * frequency; \
float sampleResult = BASE_NOISE(sample.x, sample.y, seed) * 2 - 1; // Changes range from [0, 1] to [-1, 1]. \
value += sampleResult * amplitude; // Summation of each sample value as they go up octaves \
\
maxVal += 1 * amplitude; \
minVal -= 1 * amplitude; \
\
amplitude *= persistance; // Amplitude decreases as the octaves go up as persistance [0, 1] \
frequency *= lacunarity; // Frequency increases as octaves go up as frequency [1, inf) \
} \
// Normalizing back to [0, 1] \
// Formula: finalValue = ((initialValue - center of range) / (length of range)) + 0.5 \
return ((value - (maxVal + minVal) / 2) / (maxVal - minVal)) + 0.5; \
}
And here's how I'm calling it in CSMain in a compute shader:
#undef BASE_NOISE
#define BASE_NOISE(x, y, seed) Random2D(x, y, seed) // Or any other noise function
value = FBMNoiseSampler(0.5, 0.5, 1, 0.5, 2.5, 0)
But this gives me the following errors:
Shader error in 'ExampleFile': 'Random2D': no matching 0 parameter function at kernel CSMain at ExampleFile.compute(x) (on d3d11)
Shader error in 'ExampleFile': syntax error: unexpected float constant at kernel CSMain at ExampleFile.compute(x) (on d3d11)
Shader error in 'ExampleFile': syntax error: unexpected token '{' at kernel CSMain at ExampleFile.compute(x) (on d3d11)
Where x is the line that I'm calling the macro from.
How do I properly turn the FBMNoiseSampler method into a macro? From what I can tell, the backslashes are in the right places according to many example multi-line macros (e.g. https://www.geeksforgeeks.org/multiline-macros-in-c/), and I couldn't find any resources that showed how to return a value from these multi-line macros.
This is a repost (Old one deleted) with more specific examples. Any help is much appreciated.
Here are some problems with your definition:
You cannot return a value from a #define directive: a macro is plain textual substitution, not a function, so a return inside it would return from whatever function the macro is expanded in.
A macro definition cannot contain // comments: the comment consumes the trailing \, so the line continuation is lost and the definition is cut short. Use /* ... */ comments inside macros instead.
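Since a macro cannot return a value, the usual workaround is to have it write its result into a caller-supplied variable. Here is a minimal sketch of that pattern (FBM_NOISE and base_noise are illustrative names, not from the original shader); it is written as C, but HLSL uses the same preprocessor, so the shape carries over directly:
/* stub standing in for the real noise function */
static float base_noise(float x, float y, int seed)
{
    (void)x; (void)y; (void)seed;
    return 0.5f;
}

/* the macro writes into a caller-supplied variable instead of
   returning; note: only block comments, never // before a \ */
#define FBM_NOISE(out, x, y, octaves, persistance, lacunarity, seed)  \
    do {                                                              \
        float value = 0, amplitude = 1, frequency = 1;                \
        float maxVal = 0, minVal = 0;                                 \
        for (int i = 0; i < (octaves); i++) {                         \
            float s = base_noise((x) * frequency, (y) * frequency,    \
                                 (seed)) * 2 - 1;                     \
            value += s * amplitude;                                   \
            maxVal += amplitude;                                      \
            minVal -= amplitude;                                      \
            amplitude *= (persistance);                               \
            frequency *= (lacunarity);                                \
        }                                                             \
        (out) = ((value - (maxVal + minVal) / 2)                      \
                 / (maxVal - minVal)) + 0.5f;                         \
    } while (0)

/* usage: float v; FBM_NOISE(v, 0.5f, 0.5f, 1, 0.5f, 2.5f, 0); */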
This relates to C.
I am wondering if I can dynamically cast a pointer.
I have a struct called Value that encapsulates different data (for example, int and float arrays):
typedef struct {
uint64_t count; // array length
int8_t type; // array type
uint8_t data[]; // array
} *Value;
ints have type = 1; floats have type = 2. (More types, including negative types, exist but are omitted in this example for simplicity.)
data[] can contain different types. The values are accessed by casting data, e.g. ((int *)x->data)[0].
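For context, here is how such a value might be built (make_int_value is a hypothetical helper, not part of the original code):
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* allocate a Value wrapping an int array; the flexible array
   member data[] is allocated inline, right after the header */
Value make_int_value(const int *src, uint64_t count)
{
    Value v = malloc(sizeof *v + count * sizeof(int));
    v->count = count;
    v->type = 1;                          /* 1 == int array */
    memcpy(v->data, src, count * sizeof(int));
    return v;
}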
Here is the code I use to add 2 Values:
#define ADD(x, y) ((x) + (y))
#define APPLY_OP(op) \
if (1 == ret->type){ \
APPLY_OP_ACCESSORS(op, ((int *)ret->data)); \
} \
else { \
APPLY_OP_ACCESSORS(op, ((float *)ret->data)); \
}
#define APPLY_OP_ACCESSORS(op, ra) \
if (1 == x->type && 1 == y->type) { APPLY_OP_ITER(op, ra, ((int* )x->data), ((int* )y->data)) } \
else if (1 == x->type && 2 == y->type) { APPLY_OP_ITER(op, ra, ((int* )x->data), ((float*)y->data)) } \
else if (2 == x->type && 1 == y->type) { APPLY_OP_ITER(op, ra, ((float*)x->data), ((int* )y->data)) } \
else if (2 == x->type && 2 == y->type) { APPLY_OP_ITER(op, ra, ((float*)x->data), ((float*)y->data)) }
#define APPLY_OP_ITER(op, ra, xa, ya) /* op, return/x/y accessors */ \
if (x->count == y->count) for (uint64_t i = 0; i < x->count; ++i) ra[i] = op( xa[i], ya[i] ); \
else if (x->count > y->count) for (uint64_t i = 0; i < x->count; ++i) ra[i] = op( xa[i], ya[0] ); \
else { for (uint64_t i = 0; i < y->count; ++i) ra[i] = op( xa[0], ya[i] );}
Value add(Value x, Value y){
// allocate an object for the return value
Value ret = getReturnObject(x, y);
// ...
// handle if error
// ...
// calculate
APPLY_OP(ADD);
return ret;
}
The APPLY_OP macro has 2 branches, each containing the same code except for the pointer type applied to the ret->data struct member. Is there some way I can avoid this branching/duplication and dynamically change the ret->data pointer type instead? With more Value types and more functions (subtract, multiply, divide, etc.) the generated binary bloats up significantly.
NB this is simplified significantly from my own project. I omitted what I could. Hopefully the question is clear.
I noticed a curious thing on my computer.* The handwritten divisibility test is significantly faster than the % operator. Consider the minimal example:
* AMD Ryzen Threadripper 2990WX, GCC 9.2.0
static int divisible_ui_p(unsigned int m, unsigned int a)
{
if (m <= a) {
if (m == a) {
return 1;
}
return 0;
}
m += a;
m >>= __builtin_ctz(m);
return divisible_ui_p(m, a);
}
The example is limited to odd a and m > 0. However, it can easily be generalized to all a and m. The code just converts the division into a series of additions. For example, with m = 15 and a = 5: since 15 > 5, m becomes 20, which is shifted right by ctz(20) = 2 to give 5; now m == a, so 15 is divisible by 5.
Now consider the test program compiled with -std=c99 -march=native -O3:
for (unsigned int a = 1; a < 100000; a += 2) {
for (unsigned int m = 1; m < 100000; m += 1) {
#if 1
volatile int r = divisible_ui_p(m, a);
#else
volatile int r = (m % a == 0);
#endif
}
}
... and the results on my computer:
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 8.52user |
| builtin % operator | 17.61user |
That is more than twice as fast.
The question: Can you tell me how the code behaves on your machine? Is it a missed optimization opportunity in GCC? Can you make this test even faster?
UPDATE:
As requested, here is a minimal reproducible example:
#include <assert.h>
static int divisible_ui_p(unsigned int m, unsigned int a)
{
if (m <= a) {
if (m == a) {
return 1;
}
return 0;
}
m += a;
m >>= __builtin_ctz(m);
return divisible_ui_p(m, a);
}
int main()
{
for (unsigned int a = 1; a < 100000; a += 2) {
for (unsigned int m = 1; m < 100000; m += 1) {
assert(divisible_ui_p(m, a) == (m % a == 0));
#if 1
volatile int r = divisible_ui_p(m, a);
#else
volatile int r = (m % a == 0);
#endif
}
}
return 0;
}
compiled with gcc -std=c99 -march=native -O3 -DNDEBUG on AMD Ryzen Threadripper 2990WX with
gcc --version
gcc (Gentoo 9.2.0-r2 p3) 9.2.0
UPDATE2: As requested, here is a version that can handle any a and m (if you also want to avoid integer overflow, the test has to be implemented with an integer type twice as long as the input integers):
int divisible_ui_p(unsigned int m, unsigned int a)
{
#if 1
/* handles even a */
int alpha = __builtin_ctz(a);
if (alpha) {
if (__builtin_ctz(m) < alpha) {
return 0;
}
a >>= alpha;
}
#endif
while (m > a) {
m += a;
m >>= __builtin_ctz(m);
}
if (m == a) {
return 1;
}
#if 1
/* ensures that 0 is divisible by anything */
if (m == 0) {
return 1;
}
#endif
return 0;
}
What you're doing is called strength reduction: replacing an expensive operation with a series of cheap ones.
The mod instruction on many CPUs is slow because division historically was not exercised by several common benchmarks, so designers optimized other instructions instead. This algorithm will perform worse if it has to do many iterations, and % will perform better on a CPU where it needs only two clock cycles.
Finally, be aware that there are many shortcuts to take the remainder of division by specific constants, although compilers will generally take care of this for you.
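As an illustration of such a shortcut (a sketch, not from the original post): divisibility by an odd constant can be tested with one multiply and one compare, using the constant's inverse modulo 2^32. For the divisor 5:
#include <stdint.h>

/* 0xCCCCCCCD is the inverse of 5 modulo 2^32 and 0x33333333 is
   UINT32_MAX / 5; n is divisible by 5 exactly when the wrapped
   product lands in [0, UINT32_MAX / 5] */
static int divisible_by_5(uint32_t n)
{
    return n * 0xCCCCCCCDu <= 0x33333333u;
}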
I will answer my question myself. It seems that I became a victim of branch prediction. The sizes of the operands do not seem to matter, only the order in which they appear.
Consider the following implementation
int divisible_ui_p(unsigned int m, unsigned int a)
{
while (m > a) {
m += a;
m >>= __builtin_ctz(m);
}
if (m == a) {
return 1;
}
return 0;
}
and the arrays
unsigned int A[100000/2];
unsigned int M[100000-1];
for (unsigned int a = 1; a < 100000; a += 2) {
A[a/2] = a;
}
for (unsigned int m = 1; m < 100000; m += 1) {
M[m-1] = m;
}
which either are or are not shuffled using a shuffle function (sketched below).
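A minimal Fisher-Yates shuffle of the kind that could serve here (a sketch, not the poster's exact function; rand() is assumed to be seeded elsewhere):
#include <stdlib.h>

/* in-place Fisher-Yates shuffle */
static void shuffle(unsigned int *array, size_t n)
{
    for (size_t i = n - 1; i > 0; i--) {
        size_t j = (size_t)rand() % (i + 1);  /* pick from [0, i] */
        unsigned int tmp = array[i];
        array[i] = array[j];
        array[j] = tmp;
    }
}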
Without shuffling, the results are still
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 8.56user |
| builtin % operator | 17.59user |
However, once I shuffle these arrays, the results are different
| implementation | time [secs] |
|--------------------|-------------|
| divisible_ui_p | 31.34user |
| builtin % operator | 17.53user |
I have a class task to print all the prime numbers between 1 and 1000000, and the fastest 10 programs get extra marks. The main problem is the time it takes for the prime numbers to be printed to the console.
Basically, using the Sieve of Eratosthenes, I produce an array of boolean values. The value Numbers[i] is true if i+2 is a prime number.
for(i = 0; i <= n - 2; ++i)
if (Numbers[i]) // True if the number is prime
printf("%d\n", i+2);
printf seems to be really slow: the program can generate the list of primes in about 0.035 s but then takes a further 11 seconds to print the list. Is there any way I can speed this up? Thanks.
Below is a slightly unoptimized implementation (although I skipped the intermediate list and print directly) of what I think you were supposed to do. Running that program on an AMD A8-6600K under a small load (mainly a YouTube music video for some personal entertainment) results in
real 0m1.211s
user 0m0.047s
sys 0m0.122s
averaged over a couple of runs. So the problem lies in your implementation of the sieve or you are hiding some essential facts about your hardware.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>
#include <string.h>
/* I call it a general bitset. Others might call it an abomination. YMMV. */
# define ERAT_BITS (sizeof(uint32_t)*CHAR_BIT)
# define GET_BIT(s,n)    ((*(s+(n/ERAT_BITS)) &   ((uint32_t)1<<( n % ERAT_BITS ))) != 0)
# define SET_BIT(s,n)     (*(s+(n/ERAT_BITS)) |=  ((uint32_t)1<<( n % ERAT_BITS )))
# define CLEAR_BIT(s,n)   (*(s+(n/ERAT_BITS)) &= ~((uint32_t)1<<( n % ERAT_BITS )))
# define TOG_BIT(s,n)     (*(s+(n/ERAT_BITS)) ^=  ((uint32_t)1<<( n % ERAT_BITS )))
/* size is the size in bits, the overall size might be bigger */
typedef struct mp_bitset_t {
uint32_t size;
uint32_t *content;
} mp_bitset_t;
# define mp_bitset_alloc(bst, n) \
do {\
(bst)->content=malloc(( n /(sizeof(uint32_t)) + 1 ));\
if ((bst)->content == NULL) {\
fprintf(stderr, "memory allocation for bitset failed");\
exit(EXIT_FAILURE);\
}\
(bst)->size = n;\
} while (0)
# define mp_bitset_size(bst) ((bst)->size)
# define mp_bitset_setall(bst) memset((bst)->content,~(uint32_t)(0),\
(bst->size /(sizeof(uint32_t) ) +1 ))
# define mp_bitset_clearall(bst) memset((bst)->content,0,\
(bst->size /(sizeof(uint32_t) ) +1 ))
# define mp_bitset_clear(bst,n) CLEAR_BIT((bst)->content, n)
# define mp_bitset_set(bst,n) SET_BIT((bst)->content, n)
# define mp_bitset_get(bst,n) GET_BIT((bst)->content, n)
# define mp_bitset_free(bst) \
do {\
free((bst)->content);\
free(bst);\
} while (0)
uint32_t mp_bitset_nextset(mp_bitset_t * bst, uint32_t n);
uint32_t mp_bitset_prevset(mp_bitset_t * bst, uint32_t n);
void mp_eratosthenes(mp_bitset_t * bst);
/* It's called Hallek's method but it has many inventors*/
static uint32_t isqrt(uint32_t n)
{
uint32_t s, rem, root;
if (n < 1)
return 0;
/* This is actually the highest square but it goes
* downward from this, quite fast */
s = 1 << 30;
rem = n;
root = 0;
while (s > 0) {
if (rem >= (s | root)) {
rem -= (s | root);
root >>= 1;
root |= s;
} else {
root >>= 1;
}
s >>= 2;
}
return root;
}
uint32_t mp_bitset_nextset(mp_bitset_t *bst, uint32_t n)
{
while ((n < mp_bitset_size(bst)) && (!mp_bitset_get(bst, n))) {
n++;
}
return n;
}
/*
* Standard method, quite antique now, but good enough for the handful
* of primes needed here.
*/
void mp_eratosthenes(mp_bitset_t *bst)
{
uint32_t n, k, r, j;
mp_bitset_setall(bst);
mp_bitset_clear(bst, 0);
mp_bitset_clear(bst, 1);
n = mp_bitset_size(bst);
r = isqrt(n);
for (k = 4; k < n; k += 2)
mp_bitset_clear(bst, k);
k = 0;
while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
if (k > r) {
break;
}
for (j = k * k; j < n; j += k * 2) {
mp_bitset_clear(bst, j);
}
}
}
#define UPPER_LIMIT 1000000 /* one million */
int main(void) {
mp_bitset_t *bst;
uint32_t n, k, j;
bst = malloc(sizeof(mp_bitset_t));
if(bst == NULL) {
fprintf(stderr, "failed to allocate %zu bytes\n",sizeof(mp_bitset_t));
exit(EXIT_FAILURE);
}
mp_bitset_alloc(bst, UPPER_LIMIT);
mp_bitset_setall(bst);
mp_bitset_clear(bst, 0); // 0 is not prime by definition
mp_bitset_clear(bst, 1); // 1 is not prime by definition
n = mp_bitset_size(bst);
for (k = 4; k < n; k += 2) {
mp_bitset_clear(bst, k);
}
k = 0;
while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
printf("%" PRIu32 "\n", k);
for (j = k * k; j < n; j += k * 2) {
mp_bitset_clear(bst, j);
}
}
mp_bitset_free(bst);
return EXIT_SUCCESS;
}
Compiled with
gcc-4.9 -O3 -g3 -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing -pedantic -std=c11 tests.c -o tests
(GCC is gcc-4.9.real (Ubuntu 4.9.4-2ubuntu1~14.04.1) 4.9.4)
By default, console output is line buffered, which is the reason for the increased time.
You can use the setvbuf function to switch stdout to full buffering, so output is written in chunks rather than on every line.
E.g.
char buffer[256];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
You can alter the size of buffer according to your needs, but note that it must stay valid for as long as the stream uses it (make it static or declare it in main).
The _IOFBF option selects full buffering, i.e. output is only written once the buffer is full.
See setvbuf for more details.
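Putting it together, a minimal self-contained sketch (the counting loop merely stands in for the question's prime-printing loop):
#include <stdio.h>

int main(void)
{
    /* static, so the buffer outlives all use of stdout */
    static char buffer[1 << 16];   /* 64 KiB */
    setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));

    for (int i = 0; i < 100000; ++i)
        printf("%d\n", i);         /* now written in 64 KiB chunks */

    return 0;                      /* remaining output is flushed at exit */
}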
From time to time I use the following code for generating a matrix-style data structure:
typedef double myType;
#include <stdio.h>
#include <stdlib.h>

typedef double myType;

typedef struct matrix_t{
  myType **matrix;
  size_t x;
  size_t y;
}matrix;

matrix alloc_matrix(size_t x, size_t y){
  if(0)
    fprintf(stderr,"\t-> Alloc matrix with dim (%zu,%zu) byteperline=%zu bytetotal:%zu\n",
            x,y,y*sizeof(myType),x*y*sizeof(myType));

  myType **m = (myType **)malloc(x*sizeof(myType *));
  for(size_t i=0;i<x;i++)
    m[i] = (myType *)malloc(y*sizeof(myType));

  matrix ret;
  ret.x=x;
  ret.y=y;
  ret.matrix=m;
  return ret;
}
Then I change the typedef accordingly if I need a different type for the entries in my matrix.
Now I need 2 matrices with different types. An easy solution would be to copy/paste the code, but is there some way to do a more generic implementation?
Thanks
edit:
I should clarify that it's in C, not C++. Sorry for not making that clear.
In C? Messy, but possible with macro magic. (You're getting to the point where C++ is a better choice, BTW).
#define DECL_MATRIX(type,name) \
typedef struct matrix_##type##_t { \
type **matrix; \
size_t x; \
size_t y; \
} name; \
name alloc_##name(size_t x,size_t y)
#define DEFINE_MATRIX_OPS(type,name) \
struct matrix_##type##_t \
alloc_##name(size_t x, size_t y) { \
struct matrix_##type##_t ret; \
type **m; \
\
m = (type **)malloc(x*sizeof(type *)); \
for(size_t i=0;i<x;i++) \
m[i] =(type *) malloc(y*sizeof(type)); \
ret.x=x; \
ret.y=y; \
ret.matrix=m; \
return ret; \
}
You'd then use these like this:
// At the top level of the file
DECL_MATRIX(double, dmat);
DECL_MATRIX(int, imat);
DEFINE_MATRIX_OPS(double, dmat);
DEFINE_MATRIX_OPS(int, imat);
// In a function
dmat d = alloc_dmat(3,3);
imat i = alloc_imat(2,6);
As a design note, for matrices of a fixed size it's better to allocate the memory for the elements as a single block and to use a little math to index into them: instead of ary[a][b] you use ary[a*y_size + b]. You can wrap this all up in more macros if you want, but it is much more efficient, both in terms of memory management and access.
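A minimal sketch of that single-block layout (flat_matrix, MAT_AT, and alloc_flat_matrix are illustrative names, not from this answer):
#include <stdlib.h>

typedef double myType;   /* as in the question */

typedef struct {
    myType *data;        /* one block holding x*y elements */
    size_t x, y;
} flat_matrix;

/* element (i, j) of an x-by-y matrix sits at flat index i*y + j */
#define MAT_AT(m, i, j) ((m).data[(i) * (m).y + (j)])

static flat_matrix alloc_flat_matrix(size_t x, size_t y)
{
    flat_matrix m = { malloc(x * y * sizeof(myType)), x, y };
    return m;
}

/* usage:
   flat_matrix m = alloc_flat_matrix(3, 4);
   MAT_AT(m, 2, 1) = 42.0;
   free(m.data);
*/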
I needed a very simple matrix for a one-off project and knocked this one up. It's not what I would call production quality, but it may give you some ideas:
#include <vector>
#include <ostream>
#include <stdexcept>

template <typename T>
class Matrix2D {
public:
Matrix2D( unsigned int width, unsigned int height,
const T & v = T() ) {
if ( width == 0 || height == 0 ) {
throw std::out_of_range( "Invalid Matrix2D size ");
}
for ( unsigned int x = 0; x < width; x++ ) {
mData.push_back( std::vector<T>( height, v ) );
}
}
T & operator()( unsigned int x, unsigned int y ) {
if ( x >= Width() || y >= Height() ) {
throw std::range_error( "Invalid Matrix2D index" );
}
return mData[x][y];
}
const T & operator()( unsigned int x, unsigned int y ) const {
if ( x >= Width() || y >= Height() ) {
throw std::range_error( "Invalid Matrix2D index" );
}
return mData[x][y];
}
void Clear( const T & v = T() ) {
for ( unsigned int x = 0; x < Width(); x++ ) {
for ( unsigned int y = 0; y < Height(); y++ ) {
mData[x][y] = v;
}
}
}
unsigned int Width() const {
return mData.size();
}
unsigned int Height() const {
return mData[0].size();
}
void DumpOn( std::ostream & os ) {
for ( unsigned int y = 0; y < Height(); y++ ) {
for ( unsigned int x = 0; x < Width(); x++ ) {
os << '[' << mData[x][y] << ']';
}
os << "\n";
}
}
private:
std::vector <std::vector <T> > mData;
};
As suggested in previous comments, a row-major matrix using linear memory:
#include <cassert>

template<typename T, unsigned int DIM>
class matrix {
public:
    matrix() {
        // zero-fill by default
        for (unsigned int i=0; i<DIM*DIM; ++i)
            value[i] = T(0);
    }
    matrix(const T* v) {
        for (unsigned int i=0; i<DIM*DIM; ++i)
            value[i] = v[i];
    }
    matrix(T v) {
        for (unsigned int i=0; i<DIM*DIM; ++i)
            value[i] = v;
    }
    T& operator[](int index) {
        assert(index >= 0 && index < (int)(DIM*DIM));
        return value[index];
    }
    // and so on...
private:
    T value[DIM * DIM];
};