For my cuda project I want to give my device function a single integer.
My function looks like
__device__ void PBKDF2_CUDA(const uint8_t password[], const int pass_len, const uint8_t Essid[], const int Essid_len, const int c, const int dkLen, uint32_t T_ptr[], int *PW_len_test)
{
uint32_t Hash_ptr[5] = {0};
uint32_t L[5]={0,0,0,0,0};
uint32_t T[8] = {0};
//Maybe working
/*uint8_t * password_shrinked = (uint8_t*)malloc(8 + 1);
for(int i = 0; i < 8; i++)
password_shrinked[i] = password[i];
password_shrinked[8 + 1] = 0;*/
int password_len = pass_len;
if (pass_len != 8)
{
*PW_len_test = pass_len;
password_len = 8;
}
uint8_t * password_shrinked = (uint8_t*)malloc(sizeof(uint8_t)*(password_len + 1));
for (int i = 0; i < password_len; i++)
password_shrinked[i] = password[i];
password_shrinked[password_len + 1] = 0;
//Some other stuff
free(password_shrinked);
};
and I'm calling it from a kernel like this:
__global__ void kernel(uint8_t Password_list[], const int *Password_len, uint8_t Essid[], int *Essid_len, int *rounds,int *dkLen, uint32_t T[], int pmk_size, int *PW_len_test)
{
int idx= threadIdx.x + blockDim.x*blockIdx.x;
printf("Password_len is: %d\n", Password_len);
PBKDF2_CUDA(Password_list+idx*(8), 8, Essid, *Essid_len, *rounds, *dkLen, T+idx*pmk_size, PW_len_test + idx*sizeof(int));
}
Calling kernel in main function:
kernel<<<BLOCKS, THREADS>>>(Pass_d, Pass_len_d, Essid_d, Essid_len_d, rounds_d, key_len_d, PMK_d, PMK_size, PW_len_test_d);
Now, regardless if I set Pass_len_d to 8, or if I'm calling the kernel with 8 instead of Pass_len_d, my device function creates garbage (returning wrong values, explanation below). It only works if I set the value manually in the kernel function (as seen above) or in the device function.
With garbage I mean that some returned values are not calculated correctly from the password list (uint8_t array), but others are correctly calculated. Which words are correctly calculated changes with every run, so I assume there is a race condition somewhere, but I can not find it.
There's at least one buffer overflow.
password_shrinked[password_len + 1] = 0; writes to a slot one byte above what was allocated.
Remember that if you allocate password_len + 1 bytes, the last location in the array is password_len.
Related
I want to use C language to implement the recognition of mnist datasets, using a backpropagation algorithm, but when loading the input layer neural units, the Segmentation fault (core dumped) is displayed, here's the code snippet, why, and how to solve it.
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#define PATH_TRAIN_IMAGES "../../train-images-idx3-ubyte"
#define PATH_TRAIN_LABELS "../../train-labels-idx1-ubyte"
#define PATH_WEIGHT_DATA2 "../data/data2.weight"
#define PATH_WEIGHT_DATA3 "../data/data3.weight"
#define PATH_BIAS_DATA2 "../data/data2.bias"
#define PATH_BIAS_DATA3 "../data/data3.bias"
#define TRAIN_IMAGES_NUMBER 60000
#define PIXEL 784
#define HIDDEN_UNITS_NUMBER 300
#define OUT_UNITS_NUMBER 10
#define TRAIN_TEST 0
struct Unit
{
// input with weight
float z;
// bias
float b;
// output
float a;
};
float sigmod(float z)
{
return (1 / (1 + exp(-z)));
}
struct Unit* create_unit(float uz, float ub, float ua)
{
struct Unit* unit = (struct Unit*)malloc(sizeof(struct Unit));
unit->z = uz;
unit->b = ub;
unit->a = ua;
return unit;
}
int load_train_labels(char* path_train_labels, unsigned char* ar_label)
{
FILE *fp_label;
int size_label = 0;
fp_label = fopen(path_train_labels, "rb");
fseek(fp_label, 0, SEEK_END);
size_label = ftell(fp_label);
printf("%s size:%d byte\n", path_train_labels, size_label);
rewind(fp_label);
// Starting with the 9th byte
fseek(fp_label,8,SEEK_SET);
unsigned char train_labels_buffer[size_label];
ar_label = (unsigned char*)malloc(sizeof(unsigned char) * size_label - 8);
fread(ar_label, 1, size_label - 8, fp_label);
fclose(fp_label);
return size_label;
}
int load_train_images(char* path_train_images, unsigned char* ar_img)
{
FILE *fp_img;
int size_img = 0;
fp_img = fopen(path_train_images, "rb");
fseek(fp_img, 0, SEEK_END);
size_img = ftell(fp_img);
printf("%s size:%d byte\n", path_train_images, size_img);
rewind(fp_img);
// Starting with the 17th byte, each byte stores the value of one pixel in a picture
fseek(fp_img, 16, SEEK_SET);
ar_img = (unsigned char*)malloc(sizeof(char) * size_img - 16);
fread(ar_img, 1, size_img - 16, fp_img);
fclose(fp_img);
return size_img;
}
int load_data(char* path_data, unsigned char* ar_data)
{
FILE *fp_data;
int size_data;
fp_data = fopen(path_data, "rb");
fseek(fp_data, 0, SEEK_END);
size_data = ftell(fp_data);
fseek(fp_data, 0, SEEK_SET);
ar_data = (unsigned char*)malloc(sizeof(char) * size_data);
printf("%s size:%d byte\n", path_data, size_data);
return size_data;
}
int main(int argc, char *argv[])
{
printf("Loading train labels file.\n");
unsigned char* ar_label;
int size_label;
size_label = load_train_labels(PATH_TRAIN_LABELS, ar_label);
printf("Loading train images file.\n");
unsigned char* ar_img;
int size_img;
size_img = load_train_images(PATH_TRAIN_IMAGES, ar_img);
printf("Loading random weight file.\n");
unsigned char* ar_weight2;
int size_weight2;
size_weight2 = load_data(PATH_WEIGHT_DATA2, ar_weight2);
unsigned char* ar_weight3;
int size_weight3;
size_weight3 = load_data(PATH_WEIGHT_DATA3, ar_weight3);
printf("Loading random bias file.\n");
unsigned char* ar_bias2;
int size_bias2;
size_bias2 = load_data(PATH_BIAS_DATA2, ar_bias2);
unsigned char* ar_bias3;
int size_bias3;
size_bias3 = load_data(PATH_BIAS_DATA3, ar_bias3);
float uz = 0;
float ub = 0;
float ua = 0;
struct Unit* out_units[OUT_UNITS_NUMBER];
for (int t = 0; t < OUT_UNITS_NUMBER; t++)
{
out_units[t] = create_unit(uz, ub, ua);
}
struct Unit* hid_units[HIDDEN_UNITS_NUMBER];
for(int i = 0; i < HIDDEN_UNITS_NUMBER; i++)
{
hid_units[i] = create_unit(uz, ub, ua);
}
struct Unit* in_units[PIXEL] = {NULL};
for(int i = 0; i < PIXEL; i++)
{
in_units[i] = create_unit(uz, ub, ua);
}
/*******************
* load C1 *
*******************/
printf("Loading train...\n");
float C[TRAIN_IMAGES_NUMBER];
for(int i = 0; i < PIXEL; i++)
{
in_units[i]->a = (float)*((ar_img+i*sizeof(char))); //segmentation fault(core dumped)
printf("in_unit[%d] = %f\n", i, in_units[i]->a);
}
for(int i = 0; i < HIDDEN_UNITS_NUMBER; i++)
{
for(int j = 0; j < PIXEL; j++)
{
hid_units[i]->z += in_units[j]->a * ((float)*(ar_weight2+((i*PIXEL+j)*sizeof(float))));
}
hid_units[i]->z += ((float)*(ar_bias2+(i*sizeof(float))));
hid_units[i]->a = sigmod(hid_units[i]->z);
}
for(int i = 0; i < OUT_UNITS_NUMBER; i++)
{
for(int j = 0; j < HIDDEN_UNITS_NUMBER; j++)
{
out_units[i]->z += hid_units[j]->a * ((float)*(ar_weight3+((i*HIDDEN_UNITS_NUMBER+j)*sizeof(float))));
}
out_units[i]->z += ((float)*(ar_bias3 + (i*sizeof(float))));
out_units[i]->a = sigmod(out_units[i]->z);
}
// free(in_units)
free(ar_label);
free(ar_img);
free(ar_weight2);
free(ar_bias2);
free(ar_weight3);
free(ar_bias3);
return 0;
}
Almost all of the source code was uploaded. I used the gdb debugger, but it only showed "Program terminated with signal SIGSEGV, Segmentation fault." I also raised the core-file size with ulimit, but could not find a core file.
Your pointer-passing is flawed. When you pass a pointer to a function, the function can alter the data the pointer points to, but it cannot change the pointer variable itself in a way that is visible to the caller — any reassignment is visible only in the callee's local copy.
For example, one of your functions signature reads:
int load_data(char* path_data, unsigned char* ar_data)
In that function, you do a
ar_data = (unsigned char*)malloc(sizeof(char) * size_data);
This is fine in itself, but it does not mean that the caller of load_data can access this allocated memory. Instead, the address of that memory is lost as soon as the function returns, because only the callee's local copy of the pointer was assigned.
This means, that when you write
unsigned char* ar_label;
int size_label;
size_label = load_train_labels(PATH_TRAIN_LABELS, ar_label);
then after calling the function, ar_label still has its original (uninitialized) value. What you probably meant to do was to write the function signature as (notice the extra asterisks/ampersands in the following):
int load_data(char* path_data, unsigned char** ar_data)
Then, allocate the memory as:
*ar_data = (unsigned char*)malloc(sizeof(char) * size_data);
and use the function as:
unsigned char* ar_label;
int size_label;
size_label = load_train_labels(PATH_TRAIN_LABELS, &ar_label);
This way, you are passing a pointer to a pointer and therefore can alter the address the pointer ar_label points to in the caller. This means, that this way you store the address of the mallocated memory block in the callers pointer variable instead of in a copy of the pointer variable supplied as parameter. And you therefore are allowed to access the memory this pointer points to in the caller afterwards.
I am doing a GHASH for the AES-GCM implementation.
and i need to implement this
where v is the bit length of the final block of A, u is the bit length of the final block of C, and || denotes concatenation of bit strings.
How can I do the concatenation of A block to fill in the zeros padding from v to 128 bit, as I do not know the length of the whole block of A.
So I just take the A block and XOR it with an array of 128 bits
void GHASH(uint8_t H[16], uint8_t len_A, uint8_t A_i[len_A], uint8_t len_C,
uint8_t C_i[len_C], uint8_t X_i[16]) {
uint8_t m;
uint8_t n;
uint8_t i;
uint8_t j;
uint8_t zeros[16] = {0};
if (i == m + n) {
for(j=16; j>=0; j--){
C_i[j] = C_i[j] ^ zeros[j]; //XOR with zero array to fill in 0 of length 128-u
tmp[j] = X_i[j] ^ C_i[j]; // X[m+n+1] XOR C[i] left shift by (128bit-u) and store into tmp
gmul(tmp, H, X_i); //Do Multiplication of tmp to H and store into X
}
}
I am pretty sure that I am not correct. But I have no idea how to do it.
It seems to me that you've got several issues here, and conflating them is a big part of the problem. It'll be much easier when you separate them.
First: passing in a parameter of the form uint8_t len_A, uint8_t A_i[len_A] is not proper syntax and won't give you what you want. You're actually getting uint8_t len_A, uint8_t * A_i, and the length of A_i is determined by how it was declared on the level above, not how you tried to pass it in. (Note that uint8_t * A and uint8_t A[] are functionally identical here; the difference is mostly syntactic sugar for the programmer.)
On the level above, since I don't know if it was declared by malloc() or on the stack, I'm not going to get fancy with memory management issues. I'm going to use local storage for my suggestion.
Unit clarity: You've got a bad case of unit confusion going on here: bit vs. byte vs. block length. Without knowing the core algorithm, it appears to me that the undeclared m & n are block lengths of A & C; i.e., A is m blocks long, and C is n blocks long, and in both cases the last block is not required to be full length. You're passing in len_A & len_C without telling us (or using them in code so we can see) whether they're the bit length u/v, the byte length of A_i/C_i, or the total length of A/C, in bits or bytes or blocks. Based on the (incorrect) declaration, I'm assuming they're the length of A_i/C_i in bytes, but it's not obvious... nor is it the obvious thing to pass. By the name, I would have guessed it to be the length of A/C in bits. Hint: if your units are in the names, it becomes obvious when you try to add bitLenA to byteLenB.
Iteration control: You appear to be passing in 16-byte blocks for the i'th iteration, but not passing in i. Either pass in i, or pass in the full A & C instead of A_i & C_i. You're also using m & n without setting them or passing them in; the same issue applied. I'll just pretend they're all correct at the moment of use and let you fix that.
Finally, I don't understand the summation notation for the i=m+n+1 case, in particular how len(A) & len(C) are treated, but you're not asking about that case so I'll ignore it.
Given all that, let's look at your function:
void GHASH(uint8_t H[], uint8_t len_A, uint8_t A_i[], uint8_t len_C, uint8_t C_i[], uint8_t X_i[]) {
uint8_t tmpAC[16] = {0};
uint8_t tmp[16];
uint8_t * pAC = tmpAC;
if (i == 0) { // Initialization case
for (j=0; j<16; ++j) { // X_i is a fixed 16-byte block, so zero all 16 bytes
X_i[j] = 0;
}
return;
} else if (i < m) { // Use the input memory for A
pAC = A_i;
} else if (i == m) { // Use temp memory init'ed to 0; copy in A as far as it goes
for (j=0; j<len_A; ++j) {
pAC[j] = A_i[j];
}
} else if (i < m+n) { // Use the input memory for C
pAC = C_i;
} else if (i == m+n) { // Use temp memory init'ed to 0; copy in C as far as it goes
for (j=0; j<len_C; ++j) {
pAC[j] = C_i[j];
}
} else if (i == m+n+1) { // Do something unclear to me. Maybe this?
// Use temp memory init'ed to 0; copy in len(A) & len(C)
pAC[0] = len_A; // in blocks? bits? bytes?
pAC[1] = len_C; // in blocks? bits? bytes?
}
for(j=15; j>=0; j--){ // 15, not 16: the arrays are 16 bytes, valid indices 0..15
tmp[j] = X_i[j] ^ pAC[j]; // XOR the current A or C block into X and store into tmp
}
gmul(tmp, H, X_i); // Multiply tmp by H once per block (not per byte) and store into X
}
We only copy memory in the last block of A or C, and use local memory for the copy. Most blocks are handled with a single pointer copy to point to the correct bit of input memory.
if you don't care about every little bit of efficiency (i assume this is to experiment, and not for real use?) just reallocate and pad (in practice, you could round up and calloc when you first declare these):
size_t round16(size_t n) {
// if n isn't a multiple of 16, round up to next multiple
if (n % 16) return 16 * (1 + n / 16);
return n;
}
size_t realloc16(uint8_t **data, size_t len) {
// if len isn't a multiple of 16, extend with 0s to next multiple
size_t n = round16(len);
*data = realloc(*data, n);
for (size_t i = len; i < n; ++i) (*data)[i] = 0;
return n;
}
void xor16(uint8_t *result, uint8_t *a, uint8_t *b) {
// 16 byte xor
for (size_t i = 0; i < 16; ++i) result[i] = a[i] ^ b[i];
}
void xorandmult(uint8_t *x, uint8_t *data, size_t n, uint8_t *h) {
// run along the length of the (extended) data, xoring and mutliplying
uint8_t tmp[16];
for (size_t i = 0; i < n / 16; ++i) {
xor16(tmp, x, data+i*16);
multgcm(x, h, tmp);
}
}
void ghash(uint8_t *x, uint8_t **a, size_t len_a, uint8_t **c, size_t len_c, uint8_t *h) {
size_t m = realloc16(a, len_a);
xorandmult(x, *a, m, h);
size_t n = realloc16(c, len_c);
xorandmult(x, *c, n, h);
// then handle lengths
}
uint8_t x[16] = {0};
ghash(x, &a, len_a, &c, len_c, h);
disclaimer - no expert, just skimmed the spec. code uncompiled, unchecked, and not intended for "real" use. also, the spec supports arbitrary (bit) lengths, but i assume you're working in bytes.
also, i am still not sure i am answering the right question.
I had a short interview where a question is like this: set an integer value to be 0xaa55 at address 0x*****9.
The only thing I noticed is that the address given is not aligned on word boundary. So setting an int *p to the address should not work. Then is it just using a unsigned char *p to assign the value byte-wise? Is it the point of this interview question? There is no point of doing this in real life, is there?
You need to get back to the interviewer with a number of subsidiary questions:
What is the size in bytes of an int?
Is the machine little-endian or big-endian?
Does the machine handle non-aligned access automatically?
What is the performance penalty for handling non-aligned access automatically?
What is the point of this?
The chances are that someone is thinking of marshalling data the quick and dirty way.
You're right that one basic process is to write the bytes via a char * or unsigned char * that is initialized to the relevant address. The answers to my subsidiary questions 1 and 2 determine the exact mechanism to use, but for a 2-byte int in little-endian format, you might use:
unsigned char *p = 0x*****9; // Copied from question!
unsigned int v = 0xAA55;
*p++ = v & 0xFF;
v >>= 8;
*p = v & 0xFF;
You can generalize to 4-byte or 8-byte integers easily; handling big-endian integers is a bit more fiddly.
I assembled some timing code to see what the relative costs were. Tested on a MacBook Pro (2.3 GHz Intel Core i7, 16 GiB 1333 MHz DDR3 RAM, Mac OS X 10.7.5, home-built GCC 4.7.1), I got the following times for the non-optimized code:
Aligned: 0.238420
Marshalled: 0.931727
Unaligned: 0.243081
Memcopy: 1.047383
Aligned: 0.239070
Marshalled: 0.931718
Unaligned: 0.242505
Memcopy: 1.060336
Aligned: 0.239915
Marshalled: 0.934913
Unaligned: 0.242374
Memcopy: 1.049218
When compiled with optimization, I got segmentation faults, even without -DUSE_UNALIGNED — which puzzles me a bit. Debugging was not easy; there seemed to be a lot of aggressive inline optimization which meant that variables could not be printed by the debugger.
The code is below. The Clock type and the time.h header (and timer.c source) are not shown, but can be provided on request (see my profile). They provide high resolution timing across most platforms (Windows is shakiest).
#include <string.h>
#include <stdio.h>
#include "timer.h"
static int array[100000];
enum { ARRAY_SIZE = sizeof(array) / sizeof(array[0]) };
static int repcount = 1000;
static void uac_aligned(int value)
{
int *base = array;
for (int i = 0; i < repcount; i++)
{
for (int j = 0; j < ARRAY_SIZE - 2; j++)
base[j] = value;
}
}
static void uac_marshalled(int value)
{
for (int i = 0; i < repcount; i++)
{
char *base = (char *)array + 1;
for (int j = 0; j < ARRAY_SIZE - 2; j++)
{
*base++ = value & 0xFF;
value >>= 8;
*base++ = value & 0xFF;
value >>= 8;
*base++ = value & 0xFF;
value >>= 8;
*base = value & 0xFF;
value >>= 8;
}
}
}
#ifdef USE_UNALIGNED
static void uac_unaligned(int value)
{
int *base = (int *)((char *)array + 1);
for (int i = 0; i < repcount; i++)
{
for (int j = 0; j < ARRAY_SIZE - 2; j++)
base[j] = value;
}
}
#endif /* USE_UNALIGNED */
static void uac_memcpy(int value)
{
for (int i = 0; i < repcount; i++)
{
char *base = (char *)array + 1;
for (int j = 0; j < ARRAY_SIZE - 2; j++)
{
memcpy(base, &value, sizeof(int));
base += sizeof(int);
}
}
}
static void time_it(int value, const char *tag, void (*function)(int value))
{
Clock c;
char buffer[32];
clk_init(&c);
clk_start(&c);
(*function)(value);
clk_stop(&c);
printf("%-12s %12s\n", tag, clk_elapsed_us(&c, buffer, sizeof(buffer)));
}
int main(void)
{
int value = 0xAA55;
for (int i = 0; i < 3; i++)
{
time_it(value, "Aligned:", uac_aligned);
time_it(value, "Marshalled:", uac_marshalled);
#ifdef USE_UNALIGNED
time_it(value, "Unaligned:", uac_unaligned);
#endif /* USE_UNALIGNED */
time_it(value, "Memcopy:", uac_memcpy);
}
return(0);
}
memcpy((void *)0x23456789, &(int){0xaa55}, sizeof(int));
Yes, you may need to deal with unaligned multi-byte values in real life. Imagine your device exchanges data with another device. For example, this data may be a message structure sent over a network or a file structure saved to disk. The format of that data may be predefined and not under your control. And the definiton of the data structure may not account for alignement (or even endianness) restrictions of your device. In these situations you'll need to take care when accessing these unaligned multi-byte values.
I want to convert an array of uint8_t to a uint32_t in NesC.
Does anyone know how I can do this?
The solution that i were found is the use of the function :
void * memcpy ( void * destination, const void * source, size_t num );
There is also the function :
void * memset ( void * ptr, int value, size_t num );
In my code i use memcpy and it works fine. Thanks to all people that answer my question
If you want to convert a single uint8_t in the source to a single uint32_t in the destination, it's actually very simple. Just create the destination array, and copy the values in a loop:
uint8_t *source;
size_t source_count; /* Number of entries in the source */
uint32_t *dest = malloc(sizeof(*dest) * source_count);
for (int i = 0; i < source_count; i++)
dest[i] = source[i];
Your title mentions strings, but your question text doesn't. This is confusing.
If you have four 8-bit integers, you can join them into a single 32-bit like so:
const uint8_t a = 1, b = 2, c = 3, d = 4;
const uint32_t big = (a << 24) | (b << 16) | (c << 8) | d;
This orders them like so, where each pair of hex digits denotes the byte taken from the correspondingly named variable above:
0xaabbccdd
In other words, a is taken to be the most significant byte, and d the least.
If you have an array, you can of course do this in a loop:
uint32_t bytes_to_word(const uint8_t *bytes)
{
size_t i;
uint32_t out = 0;
for(i = 0; i < 4; ++i)
{
out <<= 8;
out |= bytes[i];
}
return out;
}
The above assumes that bytes has four values.
Let me preface this with.. I have extremely limited experience with ASM, and even less with SIMD.
But it happens that I have the following MMX/SSE optimised code, that I would like to port across to AltiVec instructions for use on PPC/Cell processors.
This is probably a big ask.. Even though it's only a few lines of code, I've had no end of trouble trying to work out what's going on here.
The original function:
static inline int convolve(const short *a, const short *b, int n)
{
int out = 0;
union {
__m64 m64;
int i32[2];
} tmp;
tmp.i32[0] = 0;
tmp.i32[1] = 0;
while (n >= 4) {
tmp.m64 = _mm_add_pi32(tmp.m64,
_mm_madd_pi16(*((__m64 *)a),
*((__m64 *)b)));
a += 4;
b += 4;
n -= 4;
}
out = tmp.i32[0] + tmp.i32[1];
_mm_empty();
while (n --)
out += (*(a++)) * (*(b++));
return out;
}
Any tips on how I might rewrite this to use AltiVec instructions?
My first attempt (a very wrong attempt) looks something like this.. But it's not entirely (or even remotely) correct.
static inline int convolve_altivec(const short *a, const short *b, int n)
{
int out = 0;
union {
vector unsigned int m128;
int i64[2];
} tmp;
vector unsigned int zero = {0, 0, 0, 0};
tmp.i64[0] = 0;
tmp.i64[1] = 0;
while (n >= 8) {
tmp.m128 = vec_add(tmp.m128,
vec_msum(*((vector unsigned short *)a),
*((vector unsigned short *)b), zero));
a += 8;
b += 8;
n -= 8;
}
out = tmp.i64[0] + tmp.i64[1];
#endif
while (n --)
out += (*(a++)) * (*(b++));
return out;
}
You're not far off - I fixed a few minor problems, cleaned up the code a little, added a test harness, and it seems to work OK now:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>
static int convolve_ref(const short *a, const short *b, int n)
{
int out = 0;
int i;
for (i = 0; i < n; ++i)
{
out += a[i] * b[i];
}
return out;
}
static inline int convolve_altivec(const short *a, const short *b, int n)
{
int out = 0;
union {
vector signed int m128;
int i32[4];
} tmp;
const vector signed int zero = {0, 0, 0, 0};
assert(((unsigned long)a & 15) == 0);
assert(((unsigned long)b & 15) == 0);
tmp.m128 = zero;
while (n >= 8)
{
tmp.m128 = vec_msum(*((vector signed short *)a),
*((vector signed short *)b), tmp.m128);
a += 8;
b += 8;
n -= 8;
}
out = tmp.i32[0] + tmp.i32[1] + tmp.i32[2] + tmp.i32[3];
while (n --)
out += (*(a++)) * (*(b++));
return out;
}
int main(void)
{
const int n = 100;
vector signed short _a[n / 8 + 1];
vector signed short _b[n / 8 + 1];
short *a = (short *)_a;
short *b = (short *)_b;
int sum_ref, sum_test;
int i;
for (i = 0; i < n; ++i)
{
a[i] = rand();
b[i] = rand();
}
sum_ref = convolve_ref(a, b, n);
sum_test = convolve_altivec(a, b, n);
printf("sum_ref = %d\n", sum_ref);
printf("sum_test = %d\n", sum_test);
printf("%s\n", sum_ref == sum_test ? "PASS" : "FAIL");
return 0;
}
(Warning: all of my Altivec experience comes from working on Xbox360/PS3 - I'm not sure how different they are from other Altivec platforms).
First off, you should check your pointer alignment. Most vector loads (and stores) operations are expected to be from 16-byte aligned addresses. If they aren't, things will usually carry on without warning, but you won't get the data you were expecting.
It's possible (but slower) to do unaligned loads, but you basically have to read a bit before and after your data and combine them. See Apple's Altivec page. I've also done it before using an lvlx and lvrx load instructions, and then ORing them together.
Next up, I'm not sure your multiplies and adds are the same. I've never used either _mm_madd_pi16 or vec_msum, so I'm not positive they're equivalent. You should step through in a debugger and make sure they give you the same output for the same input data. Another possible difference is that they may treat overflow differently (e.g. modular vs. saturate).
Last but not least, you're computing 4 ints at a time instead of 2. So your union should hold 4 ints, and you should sum all 4 of them at the end.