Related
Here is my test code to find 1st clipping area on the screen.
Two subroutines and dummy loops in the code to compare the performance of them.
point_in_neon (the NEON version) and point_in (the regular version) do the same thing:
find out the first clipping area (contains given point) in given list and return -1 if there is no matching area.
I expected NEON version is faster than regular version.
Unfortunately, it is slower than regular version. Is there another way to speed it up?
The compiler command is:
${CC} -O2 -ftree-vectorize -o vcomp vcomp.c
Thanks,
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>
#define WIDTH (4096)
#define HEIGHT (4096)
#define CLIPS (32)
/* Wall-clock time in microseconds, from gettimeofday(). */
static inline uint64_t now(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    /* Cast before multiplying: on targets with a 32-bit time_t,
       tv.tv_sec*1000000 overflows long before the result is widened. */
    return (uint64_t)tv.tv_sec * 1000000u + (uint64_t)tv.tv_usec;
}
/* Axis-aligned clip rectangle: origin (x, y) plus extent.
   The scalar/NEON tests below treat the right edge as x + width - 1
   and the bottom edge as y + height - 1 (inclusive bounds). */
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;   /* extent in pixels */
uint32_t height;  /* extent in pixels */
} rect_t;
/* A screen-space point to test against the clip list. */
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
/* NEON: test *pt against four rects at once and return the index (0-3)
 * of the first rect that contains the point (inclusive bounds), or -1
 * if none does. */
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
    const int32_t right[4]={
        rs[0].x+rs[0].width-1,
        rs[1].x+rs[1].width-1,
        rs[2].x+rs[2].width-1,
        rs[3].x+rs[3].width-1
    }, bottom[4]={
        rs[0].y+rs[0].height-1,
        rs[1].y+rs[1].height-1,
        rs[2].y+rs[2].height-1,
        rs[3].y+rs[3].height-1
    };
    /* vld1q_lane_s32 inserts into an EXISTING vector: the original left
       r uninitialized, which is undefined behaviour. */
    int32x4_t p, r = vdupq_n_s32(0);
    uint32x4_t t;
    uint32_t res[4];
    //p = <Xp, Xp, Xp, Xp>
    p=vld1q_dup_s32(&pt->x);
    //r = <Left0, Left1, Left2, Left3>
    r=vld1q_lane_s32(&rs[0].x, r, 0);
    r=vld1q_lane_s32(&rs[1].x, r, 1);
    r=vld1q_lane_s32(&rs[2].x, r, 2);
    r=vld1q_lane_s32(&rs[3].x, r, 3);
    //t = (p >= r)
    t=vcgeq_s32(p, r);
    //r = <Right0, Right1, Right2, Right3>
    /* pass the array (decays to const int32_t *); the original's &right
       is an int32_t(*)[4] and a type mismatch */
    r=vld1q_s32(right);
    //t = t & (r >= p)
    t=vandq_u32(t, vcgeq_s32(r, p));
    //p = <Yp, Yp, Yp, Yp>
    p=vld1q_dup_s32(&pt->y);
    //r = <Top0, Top1, Top2, Top3>
    r=vld1q_lane_s32(&rs[0].y, r, 0);
    r=vld1q_lane_s32(&rs[1].y, r, 1);
    r=vld1q_lane_s32(&rs[2].y, r, 2);
    r=vld1q_lane_s32(&rs[3].y, r, 3);
    //t = t & (p >= r)
    t=vandq_u32(t, vcgeq_s32(p, r));
    //r = <Bottom0, Bottom1, Bottom2, Bottom3>
    r=vld1q_s32(bottom);
    //t = t & (r >= p)
    t=vandq_u32(t, vcgeq_s32(r, p));
    vst1q_u32(res, t);
    /* lowest lane index wins, matching the scalar version's first match */
    if(res[0])
        return 0;
    else if(res[1])
        return 1;
    else if(res[2])
        return 2;
    else if(res[3])
        return 3;
    return -1;
}
/* Linear search: return the index of the first rect in rs[0..len) that
 * contains *pt (inclusive bounds), or -1 if none does. */
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
    uint32_t i;  /* unsigned to match len; the original's int32_t i drew
                    a sign-compare warning and relied on promotion */
    for (i = 0; i < len; i++) {
        int32_t right = rs[i].x + rs[i].width - 1,
                bottom = rs[i].y + rs[i].height - 1;
        if (pt->x >= rs[i].x && pt->x <= right &&
            pt->y >= rs[i].y && pt->y <= bottom)
            return (int32_t)i;
    }
    return -1;
}
int32_t main(int32_t argc, char *argv[]) {
rect_t rs[CLIPS];
int32_t i, j;
uint64_t ts0, ts1;
/* res[0][c] / res[1][c]: clip c was matched by the scalar / NEON pass */
int32_t res[2][CLIPS];
/* NOTE(review): time() needs <time.h>, which is not among the includes */
srand((unsigned int)time(NULL));
/* random clip rectangles (they may extend past the screen edges) */
for(i=0;i<CLIPS;i++) {
rs[i].x=rand()%WIDTH;
rs[i].y=rand()%HEIGHT;
rs[i].width=rand()%WIDTH;
rs[i].height=rand()%HEIGHT;
}
memset(res, 0, sizeof(res));
/* pass 1: scalar version over every pixel, timed */
ts0=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
/* NOTE(review): i is the row and j the column, so this builds {y, x};
   both passes use the same convention so the timing comparison is
   still fair, but confirm the intended {x, y} order */
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
if(idx>=0)
res[0][idx]=1;
}
}
ts0=now()-ts0;
/* pass 2: NEON version, four rects per call, timed */
ts1=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
int32_t k, idx;
point_t p={i, j};
/* stop at the first batch reporting a hit; k then indexes that batch */
for(k=0, idx=-1;k<CLIPS/4;k++) {
idx=point_in_neon(&p, &rs[k*4]);
if(idx>=0)
break;
}
if(idx>=0)
res[1][k*4+idx]=1;
}
}
ts1=now()-ts1;
/*
for(i=0;i<CLIPS;i++) {
if(res[0][i]!=res[1][i]) {
printf("error.\n");
return 1;
}
}
*/
/* NOTE(review): ts0/ts1 are uint64_t; "%lu" only matches on LP64 —
   PRIu64 from <inttypes.h> would be portable */
printf("regular = %lu\n", ts0);
printf("neon = %lu\n", ts1);
return 0;
}
Following Peter Cordes's suggestion, I replaced the data loading parts of the point_in_neon subroutine with the vld4q_s32 intrinsic, and the subsequent right and bottom calculations can be vectorized. Now the code is shorter and faster than the regular version.
/* NEON (vld4 version): deinterleave four rects with one structured load,
 * compute the inclusive right/bottom edges vectorised, and return the
 * index (0-3) of the first rect containing *pt, or -1. */
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
    int32x4x4_t r;
    int32x4_t right, bottom, p;
    uint32x4_t t;
    uint32_t res[4];
    /*
    r.val[0] = <X0, X1, X2, X3>
    r.val[1] = <Y0, Y1, Y2, Y3>
    r.val[2] = <Width0, Width1, Width2, Width3>
    r.val[3] = <Height0, Height1, Height2, Height3>
    */
    /* vld4q_s32 takes const int32_t*, not rect_t* — cast explicitly;
       rect_t is four 32-bit fields, so the layout matches */
    r=vld4q_s32((const int32_t *)rs);
    //right = <Right0, Right1, Right2, Right3>
    right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
    //bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
    bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));
    //p = <Xp, Xp, Xp, Xp>
    p=vld1q_dup_s32(&pt->x);
    //t = (p >= left)
    t=vcgeq_s32(p, r.val[0]);
    //t = t & (right >= p)
    t=vandq_u32(t, vcgeq_s32(right, p));
    //p = <Yp, Yp, Yp, Yp>
    p=vld1q_dup_s32(&pt->y);
    //t = t & (p >= top)
    t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
    //t = t & (bottom >= p)   -- the original comment had this reversed
    t=vandq_u32(t, vcgeq_s32(bottom, p));
    vst1q_u32(res, t);
    if(res[0])
        return 0;
    else if(res[1])
        return 1;
    else if(res[2])
        return 2;
    else if(res[3])
        return 3;
    return -1;
}
Starting with your original point_in method, we can clean up a little bit here by removing the -1's, and changing <= to <.
/* Return the index of the first rect containing pt, or -1.
 * Bounds are half-open here: x in [x, x+width), y in [y, y+height). */
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
    /* i is unsigned to match len; the original's signed i was promoted
       in the comparison anyway and triggers -Wsign-compare */
    for (uint32_t i = 0; i < len; i++)
    {
        // this is pointless - change your data structures so that
        // the rect stores minx/maxx, miny/maxy instead!
        int32_t right = rs[i].x + rs[i].width;
        int32_t bottom= rs[i].y + rs[i].height;
        bool cmp0 = pt->x >= rs[i].x;
        bool cmp1 = pt->y >= rs[i].y;
        bool cmp2 = pt->x < right;
        bool cmp3 = pt->y < bottom;
        /* bitwise & keeps this branch-free apart from the final test */
        if (cmp0 & cmp1 & cmp2 & cmp3)
            return (int32_t)i;
    }
    return -1;
}
Next obvious thing to point out:
// your screen size...
#define WIDTH (4096)
#define HEIGHT (4096)
// yet your structures use uint32 as storage???
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
If you can get away with using 16bit integers, this will go at twice the speed (because you can fit 8x 16bit numbers in a SIMD register, v.s. 4x 32bit). Whilst we're at it, we might as well change the data layout to structure of array at the same time.
I'm also going to hoist the pointless p.x + width out, and store it as xmax/ymax instead (removes duplicated computation in your loops).
/* Eight clip rects in structure-of-arrays form: lane i of each vector
   belongs to rect i.  16-bit lanes double the SIMD width vs 32-bit. */
typedef struct rect_x8_t {
int16x8_t x;
int16x8_t y;
int16x8_t xmax; //< x + width
int16x8_t ymax; //< y + height
} rect_x8_t;
/* One point splatted across all 8 lanes, ready to test against rect_x8_t. */
typedef struct point_x8_t {
int16x8_t x;
int16x8_t y;
} point_x8_t;
On the assumption you don't have a number of clips that's divisible by 8, we'll need to pad the number slightly (not a big deal)
// assuming this has already been initialised
rect_t rs[CLIPS];
// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8) + (CLIPS & 7 ? 1 : 0);
// allocate in batches of 8
rect_x8_t rs8[CLIPS8] = {};
// I'm going to do this rubbishly as an pre-process step.
// I don't care too much about efficiency here...
for(uint32_t i = 0; i < CLIPS; ++i) {
rs8[i / 8].x[i & 7] = rs[i].x;
rs8[i / 8].y[i & 7] = rs[I].y;
rs8[i / 8].xmax[i & 7] = rs[i].x + rs[i].width;
rs8[i / 8].ymax[i & 7] = rs[i].y + rs[i].height;
}
I have a couple of concerns here:
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
// This seems wrong? Shouldn't it be p = {j, i} ?
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
// I'm not quite sure what the result says about your
// image data and clip regions???
//
// This seems like a really silly way of asking
// a simple question about the clip regions. The pixels
// don't have any effect here.
if(idx >= 0)
res[0][idx] = 1;
}
}
Anyhow, now refactoring the point_in method to use int16x8_t, we get:
/* Test pt against `len` batches of 8 rects; return the global index of
 * the first rect containing pt (half-open bounds: [x, xmax), [y, ymax)),
 * or -1 if none does. */
inline int32_t point_in_x8(const point_x8_t pt,
                           const rect_x8_t* rs,
                           uint32_t len) {
    /* unsigned index to match len (avoids sign-compare promotion) */
    for (uint32_t i = 0; i < len; i++) {
        // perform comparisons on 8 rects at a time
        uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
        uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
        uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
        uint16x8_t cmp3 = vcltq_s16(pt.y, rs[i].ymax); // was rs[I] — typo
        // combine to single comparison value
        uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
        uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
        uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);
        // use a horizontal max to see if any lanes are true
        if (vmaxvq_u16(cmp0123)) {
            for (int32_t j = 0; j < 8; ++j) {
                if (cmp0123[j])
                    return (int32_t)(8 * i + j);
            }
        }
    }
    return -1;
}
Any additional padded elements in the rect_x8_t structs should end up being ignored (since they should be 0/0, 0/0, which will always end up being false).
Then finally...
for(i = 0; i < HEIGHT; i++) {
    point_x8_t p;
    // splat the y value — vld1q_dup_s16 takes a POINTER to an int16_t,
    // so stage the loop counter in a temporary of the right type
    const int16_t yval = (int16_t)i;
    p.y = vld1q_dup_s16(&yval);
    for(j = 0; j < WIDTH; j++) {
        // splat the x value
        const int16_t xval = (int16_t)j;
        p.x = vld1q_dup_s16(&xval);
        int32_t idx = point_in_x8(p, rs8, CLIPS8);
        if(idx >= 0)
            res[1][idx] = 1;
    }
}
The vld4 instruction actually has a fairly high latency. Given that WIDTH * HEIGHT is actually a very big number, pre-swizzling here (as a pre-processing step) makes a lot more sense imho.
HOWEVER
This whole algorithm could be massively improved by simply ignoring the pixels, and working on CLIP regions directly.
A clip region will be false if it is entirely contained by the preceding clip regions
/* Sketch: decide each clip region's visibility from the regions alone,
   never touching pixels.  region_is_entirely_contained/region_intersects
   are not defined here; the tail of the loop is pseudo-code. */
for(i = 0; i < CLIPS; i++) {
// if region is empty, ignore.
if(rs[i].width == 0 || rs[i].height == 0) {
res[0][i] = 0;
continue;
}
// first region will always be true (unless it's of zero size)
if(i == 0) {
// NOTE(review): this looks like it should be res[0][i] (i.e. res[0][0]),
// not res[0][1] — verify before using this sketch.
res[0][1] = 1;
continue;
}
uint32_t how_many_intersect = 0;
bool entirely_contained = false;
uint32_t intersection_indices[CLIPS] = {};
// do a lazy test first.
for(j = i - 1; j >= 0; --j) {
// if the last region is entirely contained by preceding
// ones, it will be false. exit loop.
if(region_is_entirely_contained(rs[i], rs[j])) {
res[0][i] = 0;
entirely_contained = true;
j = -1; ///< break out of loop
}
else
// do the regions intersect?
if(region_intersects(rs[i], rs[j])) {
intersection_indices[how_many_intersect] = j;
++how_many_intersect;
}
}
// if one region entirely contains this clip region, skip it.
if(entirely_contained) {
continue;
}
// if you only intersect one or no regions, the result is true.
if(how_many_intersect <= 1) {
res[0][i] = 1;
continue;
}
// If you get here, the result is *probably* true, however
// you will need to split this clip region against the previous
// ones to be fully sure. If all regions are fully contained,
// the answer is false.
// I won't implement it, but something like this:
* split rs[i] against each rs[intersection_indices[]].
* Throw away the rectangles that are entirely contained.
* Each bit that remains should be tested against each rs[intersection_indices[]]
* If you find any split rectangle that isn't contained,
set to true and move on.
}
I have a task to print all the prime numbers between 1 and 1000000 in class and the fastest 10 programs get extra marks. The main problem is the time it takes for the prime numbers to be printed to the console.
Basically using the Sieve of Eratosthenes I produce an array with only boolean values in it. The boolean value Numbers[i] is true if i+2 is a prime number.
for(i = 0; i <= n - 2; ++i)
if (Numbers[i]) // True if the number is prime
printf("%d\n", i+2);
Printf seems to be really slow as the program can generate the list of primes in about 0.035 s but then takes a further 11 seconds to print the list. Is there anyway I can speed this up, thanks.
Beneath is a slightly unoptimized implementation (although I skipped the intermediate list and print directly) of what I think you were supposed to do. Running that program on an AMD A8-6600K with a small load (mainly a Youtube music-video for some personal entertainment) results in
real 0m1.211s
user 0m0.047s
sys 0m0.122s
averaged over a couple of runs. So the problem lies in your implementation of the sieve or you are hiding some essential facts about your hardware.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>
#include <string.h>
/* I call it a general bitset. Others might call it an abomination. YMMV. */
/* bits per storage word */
# define ERAT_BITS (sizeof(uint32_t)*CHAR_BIT)
/* The shifted constant is uint32_t: a plain `1 << 31` shifts into the
   sign bit of a signed int, which is undefined behaviour. */
# define GET_BIT(s,n) ((*((s)+((n)/ERAT_BITS)) & ( (uint32_t)1<<( (n) % ERAT_BITS ))) != 0)
# define SET_BIT(s,n) (*((s)+((n)/ERAT_BITS)) |= ( (uint32_t)1<<( (n) % ERAT_BITS )))
# define CLEAR_BIT(s,n) (*((s)+((n)/ERAT_BITS)) &= ~( (uint32_t)1<<( (n) % ERAT_BITS )))
# define TOG_BIT(s,n) (*((s)+((n)/ERAT_BITS)) ^= ( (uint32_t)1<<( (n) % ERAT_BITS )))
/* size is the size in bits, the overall size might be bigger */
typedef struct mp_bitset_t {
uint32_t size;       /* number of usable bits */
uint32_t *content;   /* backing words */
} mp_bitset_t;
/* Allocate storage for n bits.
 * The original passed n/sizeof(uint32_t)+1 to malloc — a word count
 * misused as a byte count; allocate whole words instead.  The cast on
 * malloc keeps this valid when compiled as C++. */
# define mp_bitset_alloc(bst, n) \
do {\
(bst)->content=(uint32_t *)malloc(((n)/ERAT_BITS + 1)*sizeof(uint32_t));\
if ((bst)->content == NULL) {\
fprintf(stderr, "memory allocation for bitset failed");\
exit(EXIT_FAILURE);\
}\
(bst)->size = (n);\
} while (0)
# define mp_bitset_size(bst) ((bst)->size)
/* setall/clearall sizes must match the allocation above (whole words) */
# define mp_bitset_setall(bst) memset((bst)->content,~(uint32_t)(0),\
((bst)->size /ERAT_BITS + 1)*sizeof(uint32_t))
# define mp_bitset_clearall(bst) memset((bst)->content,0,\
((bst)->size /ERAT_BITS + 1)*sizeof(uint32_t))
# define mp_bitset_clear(bst,n) CLEAR_BIT((bst)->content, n)
# define mp_bitset_set(bst,n) SET_BIT((bst)->content, n)
# define mp_bitset_get(bst,n) GET_BIT((bst)->content, n)
/* Frees the words AND the struct — bst itself must be heap-allocated. */
# define mp_bitset_free(bst) \
do {\
free((bst)->content);\
free(bst);\
} while (0)
uint32_t mp_bitset_nextset(mp_bitset_t * bst, uint32_t n);
uint32_t mp_bitset_prevset(mp_bitset_t * bst, uint32_t n);
void mp_eratosthenes(mp_bitset_t * bst);
/* It's called Hallek's method but it has many inventors*/
/* Integer square root: returns floor(sqrt(n)), resolving one result bit
 * per iteration from the highest power of four downward. */
static uint32_t isqrt(uint32_t n)
{
    uint32_t bit, remainder, root;

    if (n < 1)
        return 0;

    remainder = n;
    root = 0;
    /* 1<<30 is the largest power of four in 32 bits; it shrinks by a
     * factor of four each pass */
    for (bit = UINT32_C(1) << 30; bit > 0; bit >>= 2) {
        if (remainder >= (bit | root)) {
            remainder -= (bit | root);
            root = (root >> 1) | bit;
        } else {
            root >>= 1;
        }
    }
    return root;
}
/*
 * Scan forward from bit n to the first set bit; returns the bitset size
 * when no set bit remains at or after n.
 */
uint32_t mp_bitset_nextset(mp_bitset_t *bst, uint32_t n)
{
    for (; n < mp_bitset_size(bst) && !mp_bitset_get(bst, n); n++)
        ;
    return n;
}
/*
 * Standard method, quite antique now, but good enough for the handful
 * of primes needed here.
 *
 * Sieve of Eratosthenes over the whole bitset: on return, bit k is set
 * iff k is prime.  Bits 0 and 1 are cleared explicitly, even numbers
 * above 2 are cleared up front, and each remaining prime k clears its
 * odd multiples starting at k*k.
 */
void mp_eratosthenes(mp_bitset_t *bst)
{
uint32_t n, k, r, j;
mp_bitset_setall(bst);
mp_bitset_clear(bst, 0);
mp_bitset_clear(bst, 1);
n = mp_bitset_size(bst);
/* sieving only needs primes up to sqrt(n) */
r = isqrt(n);
/* clear the evens above 2 in one sweep */
for (k = 4; k < n; k += 2)
mp_bitset_clear(bst, k);
k = 0;
/* each set bit found here is prime: all smaller primes already sieved */
while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
if (k > r) {
break;
}
/* start at k*k (smaller multiples were cleared by smaller primes);
   stepping by 2k skips the even multiples */
for (j = k * k; j < n; j += k * 2) {
mp_bitset_clear(bst, j);
}
}
}
#define UPPER_LIMIT 1000000 /* one million */
/* Sieve and print in one pass: every k returned by mp_bitset_nextset is
 * prime (all smaller primes' multiples were already cleared), so it is
 * printed immediately and its multiples cleared from k*k upward. */
int main(void) {
mp_bitset_t *bst;
uint32_t n, k, j;
bst = malloc(sizeof(mp_bitset_t));
if(bst == NULL) {
fprintf(stderr, "failed to allocate %zu bytes\n",sizeof(mp_bitset_t));
exit(EXIT_FAILURE);
}
mp_bitset_alloc(bst, UPPER_LIMIT);
mp_bitset_setall(bst);
mp_bitset_clear(bst, 0); // 0 is not prime b.d.
mp_bitset_clear(bst, 1); // 1 is not prime b.d.
n = mp_bitset_size(bst);
/* clear the even numbers above 2 up front */
for (k = 4; k < n; k += 2) {
mp_bitset_clear(bst, k);
}
k = 0;
while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
printf("%" PRIu32 "\n", k);
/* NOTE(review): for k > 65535, k*k overflows uint32_t; the wrapped j
   may land below n and clear unintended bits — verify before reuse */
for (j = k * k; j < n; j += k * 2) {
mp_bitset_clear(bst, j);
}
}
mp_bitset_free(bst);
return EXIT_SUCCESS;
}
Compiled with
gcc-4.9 -O3 -g3 -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing -pedantic -std=c11 tests.c -o tests
(GCC is gcc-4.9.real (Ubuntu 4.9.4-2ubuntu1~14.04.1) 4.9.4)
By default, console output is line buffered, which is the reason for the increased time.
You can use the setvbuf function to allow printing to console/stdout only in chunks rather than for each iteration.
E.g.
char buffer[256];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
You can alter the size of buffer according to your needs.
The _IOFBF option selects full buffering, i.e. output is written out only once the buffer is full.
See setvbuf for more details
I have an ansi C function to sum up values from an array, based on patterns. Something like:
/* Sum the entries of values[0..n) whose corresponding pattern entry is
 * non-zero; returns 0 for n <= 0. */
long sum_all_according_to_pattern(int n, int *values, int *pattern)
{
    long total = 0;
    for (int k = 0; k < n; k++) {
        if (pattern[k] != 0) {
            total += values[k];
        }
    }
    return total;
}
Let's say I've a set of patterns such as:
Pattern 1: 1,1,1,1
Pattern 2: 1,1,0,0
Pattern 3: 1,0,0,1
I need to generate a specific code for each pattern, without the loop and the if. For the previous patterns, it would be:
/* Pattern 1 = 1,1,1,1: sum all four entries. */
long sum_according_to_pattern_1(int *values)
{
return values[0]+values[1]+values[2]+values[3];
}
/* Pattern 2 = 1,1,0,0: sum the first two entries. */
long sum_according_to_pattern_2(int *values)
{
return values[0]+values[1];
}
/* Pattern 3 = 1,0,0,1: sum the first and last entries. */
long sum_according_to_pattern_3(int *values)
{
return values[0]+values[3];
}
or even
/* Pattern 1 = 1,1,1,1: accumulate all four entries. */
long sum_according_to_pattern_1(int *values)
{
long sum = 0;
sum+=values[0];
sum+=values[1];
sum+=values[2];
sum+=values[3];
return sum;
}
/* Pattern 2 = 1,1,0,0: accumulate the first two entries. */
long sum_according_to_pattern_2(int *values)
{
long sum = 0;
sum+=values[0];
sum+=values[1];
return sum;
}
/* Pattern 3 = 1,0,0,1: accumulate the first and last entries. */
long sum_according_to_pattern_3(int *values)
{
long sum = 0;
sum+=values[0];
sum+=values[3];
return sum;
}
Now, suppose that such patterns can be much larger than only 4 elements. Also, suppose I've much more than only these 3 patterns.
My question is: there is some way to achieve that using only ansi C constructions? As I'm trying to keep everything contained, I don't want to write a script to generate the code for me. What I need is to specify the pattern using something like a bitmap macro and than generate the function during compile time.
The way I would do it would be with a macro that defined all the patterns you want, combined with other macros that defined the functions or other info you need about them. So you would have something like:
/* X-macro master list: M(function-number, bitmask of summed indices). */
#define FUNCTION_PATTERNS(M) \
M(1, 0xf) \
M(2, 0x3) \
M(3, 0x9)
/* Defines long sum_according_to_pattern_<NUM>(int *values).
 * The original macro body computed sum but never returned it — undefined
 * behaviour whenever the caller uses the result. */
#define DEFINE_SUM_FUNCTION(NUM, PATTERN) \
long sum_according_to_pattern_##NUM(int *values) { \
long sum = 0; \
for (int i = 0; 1UL << i <= PATTERN; i++) \
if (PATTERN & (1UL << i)) sum += values[i]; \
return sum; \
}
/* Expands to the bare function name, for building pointer tables. */
#define SUM_FUNCTION_NAME(NUM, PATTERN) sum_according_to_pattern_##NUM
now you can easily declare all the functions and build a table of pointers to them:
FUNCTION_PATTERNS(DEFINE_SUM_FUNCTION)
long (*sum_functions[])(int *) = { FUNCTION_PATTERNS(SUM_FUNCTION_NAME) };
if you want, you can manually unroll the loop in the DEFINE_SUM_FUNCTION macro, or you can rely on your C compiler to do it for you, possibly with an appropriate pragma or compile-time flag.
Note that the above will only work up to 32 or 64 elements (depending on architecture). If you want more, you'll have to split the patterns into multiple values.
Extending Chris Dodd's approach.
I think you can generate exactly what you describe by using a list of symbols for the patterns. So, starting with the same X-macro setup.
/* X-macro: each pattern is a sequence of the A-D statement tokens below. */
#define PATTERNS(_) \
_(1, A B C D) \
_(2, A B) \
_(3, A D) \
/**/
/* Each symbol expands to one accumulation statement. */
#define A sum += values[0];
#define B sum += values[1];
#define C sum += values[2];
#define D sum += values[3];
/* NOTE(review): "accoring" (sic) is kept — it matches the generated
   output shown by the answer. */
#define GEN_FUNC(num, pattern) \
long sum_accoring_to_pattern ## num (int *values) { \
long sum = 0; \
pattern \
return sum; \
}
PATTERNS(GEN_FUNC)
Running through cpp -P genpat.c | indent -gnu -i4 -br -ce -cdw -nbc -brf -brs -l100 -bbo yields
long
sum_accoring_to_pattern1 (int *values) {
long sum = 0;
sum += values[0];
sum += values[1];
sum += values[2];
sum += values[3];
return sum;
}
long
sum_accoring_to_pattern2 (int *values) {
long sum = 0;
sum += values[0];
sum += values[1];
return sum;
}
long
sum_accoring_to_pattern3 (int *values) {
long sum = 0;
sum += values[0];
sum += values[3];
return sum;
}
You could also generate the shorter form.
/* Same X-macro list, reused for the single-expression form. */
#define PATTERNS(_) \
_(1, A B C D) \
_(2, A B) \
_(3, A D) \
/**/
/* Here each symbol is one term of the return expression. */
#define A + values[0]
#define B + values[1]
#define C + values[2]
#define D + values[3]
#define GEN_FUNC(num, pattern) \
long sum_accoring_to_pattern ## num (int *values) { \
return pattern ;\
}
PATTERNS(GEN_FUNC)
You almost certainly want to #undef A .. D afterward. :)
I'm writing a function which just make the sum of two values,
but the C type of these values is specified in parameter,
it can be int, unsigned int, char, unsigned char, float ...
I search a solution to do that without making a lot of C code to process to all different mixed cases.
To be clear the code is :
/* Dynamically-typed addition: stores *value_data_1_ptr + *value_data_2_ptr
 * into *value_result_ptr, interpreting each pointer according to its type
 * code.  Only the combinations written out below are handled; anything
 * else is silently ignored. */
void addition(unsigned char type_data_1_uc, void *value_data_1_ptr,
              unsigned char type_data_2_uc, void *value_data_2_ptr,
              unsigned char type_result_uc, void *value_result_ptr)
{
    /* types :
    0 : TYPE_BIT
    1 : TYPE_CHAR
    2 : TYPE_UNSIGNED_CHAR
    3 : TYPE_INT
    4 : TYPE_UNSIGNED_INT
    5 : TYPE_SHORT_INT
    6 : TYPE_UNSIGNED_SHORT_INT
    7 : TYPE_LONG
    8 : TYPE_UNSIGNED_LONG
    9 : TYPE_FLOAT */
    /* INT + INT = INT */
    if ((type_data_1_uc == 3)
        && (type_data_2_uc == 3)
        && (type_result_uc == 3))
    {
        *((int *) value_result_ptr) = *((int *) value_data_1_ptr) + *((int *) value_data_2_ptr);
        return;
    }
    /* FLOAT + FLOAT = FLOAT */
    if ((type_data_1_uc == 9)
        && (type_data_2_uc == 9)
        && (type_result_uc == 9))
    {
        /* the original read the operands through (int *) here, which
           reinterprets the float bit patterns as integers */
        *((float *) value_result_ptr) = *((float *) value_data_1_ptr) + *((float *) value_data_2_ptr);
        return;
    }
    /* UNSIGNED CHAR + INT = INT */
    if ((type_data_1_uc == 2)
        && (type_data_2_uc == 3)
        && (type_result_uc == 3))
    {
        *((int *) value_result_ptr) = *((unsigned char *) value_data_1_ptr) + *((int *) value_data_2_ptr);
        return;
    }
    /* ......... */
}
/* Demo: 26 + 32 via type code 3 (INT + INT = INT). */
int main(int argc, char **argv)
{
int data_1;
int data_2;
int result;
data_1 = 26;
data_2 = 32;
/* 3 = TYPE_INT for both operands and the result */
addition(3, &data_1, 3, &data_2, 3, &result);
printf("result = %d\n", result);
return 0;
}
I have think to use the union structure but it doesn't solve the problem, because union also need to be statically casted :
/* UNION DATA */
/* One storage slot viewable as any of the supported types; the active
   member is selected by the caller-supplied type code. */
union data_union
{
    char tab_c[4];
    unsigned char tab_uc[4];
    int i;
    unsigned int ui;
    short int si;
    unsigned short int usi;
    long l;
    unsigned long ul;
    float f;
};
/* Add *value_data_1_ptr and *value_data_2_ptr into *value_result_ptr,
 * reading each union through the member named by its type code:
 *   0:BIT 1:CHAR 2:UCHAR 3:INT 4:UINT 5:SHORT 6:USHORT 7:LONG 8:ULONG 9:FLOAT
 * Only the combinations written out below are handled. */
void addition(unsigned char type_data_1_uc, union data_union *value_data_1_ptr,
              unsigned char type_data_2_uc, union data_union *value_data_2_ptr,
              unsigned char type_result_uc, union data_union *value_result_ptr)
{
    /* INT + INT = INT */
    if ((type_data_1_uc == 3)
        && (type_data_2_uc == 3)
        && (type_result_uc == 3))
    {
        value_result_ptr->i = value_data_1_ptr->i + value_data_2_ptr->i;
        return;
    }
    /* FLOAT + FLOAT = FLOAT */
    if ((type_data_1_uc == 9)
        && (type_data_2_uc == 9)
        && (type_result_uc == 9))
    {
        value_result_ptr->f = value_data_1_ptr->f + value_data_2_ptr->f;
        return;
    }
    /* UNSIGNED CHAR + INT = INT
     * (the union has no .uc member — the original's ".uc" did not
     * compile; the first element of tab_uc is the intended operand) */
    if ((type_data_1_uc == 2)
        && (type_data_2_uc == 3)
        && (type_result_uc == 3))
    {
        value_result_ptr->i = value_data_1_ptr->tab_uc[0] + value_data_2_ptr->i;
        return;
    }
}
/* Demo: 26 + 32 through the union interface (type code 3 = int). */
int main(int argc, char **argv)
{
static union data_union data_1_union;
static union data_union data_2_union;
static union data_union result_union;
memset(&data_1_union, 0, sizeof(union data_union));
memset(&data_2_union, 0, sizeof(union data_union));
data_1_union.i = 26;
data_2_union.i = 32;
addition(3, &data_1_union, 3, &data_2_union, 3, &result_union);
printf("result_union.i = %d\n", result_union.i);
return 0;
}
Any idea to solve this ?
You cannot do that without some "pain", since C is a statically typed language. The compiler needs to know the types of variables in order to generate the proper instructions. Most CPUs have distinct instructions for adding 8-bit integers, 32-bit integers, floats, and so on.
That said, you can certainly improve on the interface: I would use variable arguments to make the prototype:
/* Type codes for the variadic addition() below; the values mirror the
   numeric codes used in the question (3 = int, 9 = float, ...). */
typedef enum {
TYPE_BIT = 0,
TYPE_CHAR,
TYPE_UNSIGNED_CHAR,
TYPE_INT,
TYPE_UNSIGNED_INT,
TYPE_SHORT_INT,
TYPE_UNSIGNED_SHORT_INT,
TYPE_LONG,
TYPE_UNSIGNED_LONG,
TYPE_FLOAT,
} Type;
/* Expects two further arguments of the type named by `type`; the sum is
   stored through `result`, which must point to that same type. */
void addition(Type type, void *result, ...);
This expects to be called with four arguments, the two latter of which should have the type indicated by the type argument. The result is stored at result, which should be a pointer to the same type as the arguments.
Not sure how to represent single-bit values, probably as unsigned char but it's kind of pointless: single bits is not a type that you can do arithmetic with in C so you're just going to end up doing the add with more bits, then masking some of them off. You also can't have a pointer to a single bit in memory on most machines.
The idiomatic C approach to this problem is to use macros to eliminate code duplication as much as possible. The rest will unfortunately have to be done manually. For example:
enum typecode {
    TC_UCHAR = 2,   /* was missing, but is used by the UCHAR + INT case */
    TC_INT = 3,
    TC_FLOAT = 9,
    /* ... */
};

/* Dynamically-typed addition dispatched through the DO_ADD macro: each
 * DO_ADD line handles one (type1, type2, result) combination and returns
 * from addition() as soon as it matches. */
void addition(enum typecode type_data_1_uc, void *value_data_1_ptr,
              enum typecode type_data_2_uc, void *value_data_2_ptr,
              enum typecode type_result_uc, void *value_result_ptr)
{
/* The original macro was missing the ';' after the addition expression
 * and the '}' that closes the do-block. */
#define DO_ADD(tc1, tc2, tcret, type1, type2, typeret) do { \
        if (type_data_1_uc == tc1 && type_data_2_uc == tc2 \
            && type_result_uc == tcret) { \
            *(typeret *)value_result_ptr = \
                *(type1 *)(value_data_1_ptr) + *(type2 *)(value_data_2_ptr); \
            return; \
        } \
    } while (0)
    /* INT + INT = INT */
    DO_ADD(TC_INT, TC_INT, TC_INT, int, int, int);
    /* FLOAT + FLOAT = FLOAT */
    DO_ADD(TC_FLOAT, TC_FLOAT, TC_FLOAT, float, float, float);
    /* UCHAR + INT = INT */
    DO_ADD(TC_UCHAR, TC_INT, TC_INT, unsigned char, int, int);
    /* ... */
#undef DO_ADD
    /* none matched: abort() or set an error code, as appropriate */
}
From time to time I use the following code for generating a matrix style datastructure
typedef double myType;

/* 2-D matrix stored as an array of row pointers plus its dimensions.
 * (The original text had editor-gutter artifacts pasted into the code;
 * they are removed here.) */
typedef struct matrix_t {
    myType **matrix;   /* x row pointers, each to y elements */
    size_t x;          /* rows */
    size_t y;          /* columns */
} matrix;

/* Allocate an x-by-y matrix; entries are left uninitialised.
 * The original sized the allocations with sizeof(myType **) and
 * sizeof(myType *) — wrong element types that only worked because
 * double and pointers happen to share a size on common 64-bit ABIs. */
matrix alloc_matrix(size_t x, size_t y) {
    myType **m = (myType **)malloc(x * sizeof(myType *));
    for (size_t i = 0; i < x; i++)
        m[i] = (myType *)malloc(y * sizeof(myType));

    matrix ret;
    ret.x = x;
    ret.y = y;
    ret.matrix = m;
    return ret;
}
And then I would change my typedef accordingly if I needed a different kind of type for the entries in my matrix.
Now I need 2 matrices with different types, an easy solution would be to copy/paste the code, but is there some way to do a more generic implementation.
Thanks
edit:
I should clarify that its in c not c++.
Sorry for not making that clear.
In C? Messy, but possible with macro magic. (You're getting to the point where C++ is a better choice, BTW).
/* Declares the matrix struct for `type` under `name`, plus the prototype
 * of its allocator alloc_<name>(x, y); terminate each use with ';'. */
#define DECL_MATRIX(type,name) \
typedef struct matrix_##type##_t { \
type **matrix; \
size_t x; \
size_t y; \
} name; \
name alloc_##name(size_t x,size_t y)
/* Defines alloc_<name>: x row pointers, each to a row of y elements.
 * (The original also declared an outer `size_t i;` that was immediately
 * shadowed by the for-loop's own i — removed.) */
#define DEFINE_MATRIX_OPS(type,name) \
struct matrix_##type##_t \
alloc_##name(size_t x, size_t y) { \
struct matrix_##type##_t ret; \
type **m; \
\
m = (type **)malloc(x*sizeof(type *)); \
for(size_t i=0;i<x;i++) \
m[i] =(type *) malloc(y*sizeof(type)); \
ret.x=x; \
ret.y=y; \
ret.matrix=m; \
return ret; \
}
You'd then use these like this:
// At the top level of the file
DECL_MATRIX(double, dmat);
DECL_MATRIX(int, imat);
DEFINE_MATRIX_OPS(double, dmat);
DEFINE_MATRIX_OPS(int, imat);
// In a function
dmat d = alloc_dmat(3,3);
imat i = alloc_imat(2,6);
As a design note, it's better for matrices of a fixed size to allocate the memory for the elements as a single block and to use a little math to index into them. Thus instead of ary[a][b] you use ary[a*x_size+b]. You can wrap this all up in more macros if you want, but it is much more efficient, both in terms of memory management and access.
I needed a very simple matrix for a one-off project and knocked this one up. It's not what I would call production quality, but it may give you some ideas:
// Simple dense 2-D matrix backed by a vector of columns (mData[x][y]).
// Throws std::out_of_range for a zero-sized construction and
// std::range_error for an out-of-bounds element access.
template <typename T>
class Matrix2D {
public:
    Matrix2D( unsigned int width, unsigned int height,
              const T & v = T() ) {
        if ( width == 0 || height == 0 ) {
            throw std::out_of_range( "Invalid Matrix2D size ");
        }
        mData.reserve( width );  // one allocation instead of log(width)
        for ( unsigned int x = 0; x < width; x++ ) {
            mData.push_back( std::vector<T>( height, v ) );
        }
    }
    // Checked element access.
    T & operator()( unsigned int x, unsigned int y ) {
        if ( x >= Width() || y >= Height() ) {
            throw std::range_error( "Invalid Matrix2D index" );
        }
        return mData[x][y];
    }
    const T & operator()( unsigned int x, unsigned int y ) const {
        if ( x >= Width() || y >= Height() ) {
            throw std::range_error( "Invalid Matrix2D index" );
        }
        return mData[x][y];
    }
    // Reset every element to v.
    void Clear( const T & v = T() ) {
        for ( unsigned int x = 0; x < Width(); x++ ) {
            for ( unsigned int y = 0; y < Height(); y++ ) {
                mData[x][y] = v;
            }
        }
    }
    unsigned int Width() const {
        return mData.size();
    }
    unsigned int Height() const {
        return mData[0].size();
    }
    // Row-major dump, one text row per matrix row.
    // Now const so it can be called on const matrices.
    void DumpOn( std::ostream & os ) const {
        for ( unsigned int y = 0; y < Height(); y++ ) {
            for ( unsigned int x = 0; x < Width(); x++ ) {
                os << '[' << mData[x][y] << ']';
            }
            os << "\n";
        }
    }
private:
    std::vector <std::vector <T> > mData;
};
As suggested on previous comments, a row-major matrix using linear memory:
// Fixed-size DIM x DIM matrix in row-major linear storage.
template<typename T, unsigned int DIM>
class matrix {
public:
    // Default: zero-initialise every element.
    // (The original wrote "matrix(0);" in the body, which constructs and
    // discards a temporary instead of initialising *this.)
    matrix() {
        for (unsigned int i = 0; i < DIM * DIM; ++i)
            value[i] = T(0);
    }
    // Copy DIM*DIM elements from v.
    matrix(const T* v) {
        for (unsigned int i = 0; i < DIM * DIM; ++i)
            value[i] = v[i];
    }
    // Fill every element with v.
    matrix(T v) {
        for (unsigned int i = 0; i < DIM * DIM; ++i)
            value[i] = v;
    }
    // Linear element access; bounds checked by assert in debug builds.
    T& operator[](int index) {
        assert(index >= 0 && index < (int)(DIM * DIM));
        return value[index];
    }
    // and so on...
private:
    T value[DIM * DIM];
};