I have a multi-threaded C program in which 4 threads do some arithmetic computations using some global arrays. Here is an example of the code.
/* Global coefficient tables, one __m256 vector per element.
 * Init_arrays() allocates 32, 4 and 2 elements for them respectively. */
__m256 *array_1;
__m256 *array_2;
__m256 *array_3;
/* Alignment in bytes passed to posix_memalign for the tables above. */
#define ALIGNMENT 32
/* Stride of the fill loops in Init_arrays(). */
#define SIMD_STEP 8
/*
 * Allocate the three global coefficient tables (32, 4 and 2 __m256 elements,
 * ALIGNMENT-byte aligned) and run the per-stage fill loops.
 *
 * posix_memalign reports failure through its return value and leaves the
 * pointer unspecified, so a failed allocation terminates the program here
 * instead of letting the worker threads read through a garbage pointer.
 */
void Init_arrays()
{
    int i;
    int rc;

    rc = posix_memalign((void**) &array_1, ALIGNMENT, 32*sizeof(__m256));
    if (rc == 0)
        rc = posix_memalign((void**) &array_2, ALIGNMENT, 4 *sizeof(__m256));
    if (rc == 0)
        rc = posix_memalign((void**) &array_3, ALIGNMENT, 2 *sizeof(__m256));
    if (rc != 0)
    {
        fprintf(stderr, "Init_arrays: posix_memalign failed - %s\n", strerror(rc));
        exit(EXIT_FAILURE);
    }

    for(i=0;i < 256; i+= SIMD_STEP)
    {
        // Filling array for the 1st stage
    }
    /* NOTE(review): if each step stores one __m256, 64/SIMD_STEP = 8 elements
     * are written but array_2 only has 4 - confirm against the fill code. */
    for(i=0;i < 64; i+= SIMD_STEP)
    {
        // Filling array for the 2nd stage
    }
    for(i=0;i < 16; i+= SIMD_STEP)
    {
        // Filling array for the 3rd stage
    }
}
void *routine(void *thread_info)
{
int n;
unsigned t_start,t_stop;
unsigned ind1, ind2, ind3;
float *arr_in , *arr_out;
struct thread_data *mydata;
mydata = (struct thread_data*) thread_info;
t_start = mydata->start;
t_stop = mydata->stop;
arr_in = mydata->input;
arr_out = mydata->output;
for (n = t_start; n < t_stop; n += 8)
{
ind1 = 256 + n;
ind2 = 512 + n;
vec_a = _mm256_load_ps((float *) (&arr_in[n ]) );
vec_b = _mm256_load_ps((float *) (&arr_in[ind1]) );
vec_c = _mm256_load_ps((float *) (&arr_in[ind2]) );
T_fac1 = array_1[n];
T_fac2 = array_2[n];
T_fac3 = array_3[n];
// print data 'printf()'
// further computations
_mm256_store_ps((float *) (&arr_out[n ]), (vec_a) );
_mm256_store_ps((float *) (&arr_out[ind1]), (vec_b) );
_mm256_store_ps((float *) (&arr_out[ind2]), (vec_c) );
}
pthread_exit(NULL);
}
/*
 * Split the float range [0, 256) into chunks of 64, start one worker thread
 * (routine) per chunk on the in/out buffers, then wait for all of them.
 *
 * Only threads that were actually created are joined; a pthread_create
 * failure is reported and stops further launches. The launch loop is also
 * bounded by NUM_THREADS so it cannot run past the thread arrays.
 */
void foo(float* in,float* out)
{
    unsigned t, i = 0;
    unsigned created;

    for (t = 0; t < 256 && i < NUM_THREADS; t += 64)
    {
        int rc;

        thread_data_array[i].start = t;
        thread_data_array[i].stop = t+QUARTER;
        thread_data_array[i].input = in;
        thread_data_array[i].output = out;
        rc = pthread_create(&threads[i],NULL,routine,(void*)&thread_data_array[i]);
        if (rc)
        {
            fprintf(stderr, "failed to create thread #%u - %s\n",i, strerror(rc));
            break;
        }
        i++;
    }
    created = i;

    for (i = 0; i < created; i++)
    {
        int rc = pthread_join(threads[i], NULL);
        if (rc)
        {
            fprintf(stderr, "failed to join thread #%u - %s\n",i, strerror(rc));
        }
    }
}
/*
 * Allocate the 32-byte aligned input/output buffers (1024 floats each),
 * load the inputs, build the coefficient tables and run the threaded stage.
 * Allocation failures are reported and end the program; both buffers are
 * released before returning.
 */
int main()
{
    float *data1 = NULL;
    float *data2 = NULL;
    int rc;

    rc = posix_memalign((void**)&data1, 32, 1024 * sizeof(float));
    if (rc != 0)
    {
        fprintf(stderr, "posix_memalign(data1) failed - %s\n", strerror(rc));
        return EXIT_FAILURE;
    }
    rc = posix_memalign((void**)&data2, 32, 1024 * sizeof(float));
    if (rc != 0)
    {
        fprintf(stderr, "posix_memalign(data2) failed - %s\n", strerror(rc));
        free(data1);
        return EXIT_FAILURE;
    }

    /* NOTE(review): reals/imags are not declared in the visible code and
     * data1 is not filled here before foo() reads it - confirm the intended
     * arguments. */
    Load_inputs(reals,imags);//load data into the two arrays
    Init_arrays();
    // print data 'printf()'
    foo(data1,data2);

    free(data1);
    free(data2);
    return EXIT_SUCCESS;
}
For some reason, reading from array_1 (for example) inside the thread does not return the expected values, and I do not know why. Here is a display of array_1 as printed from main (the expected values) next to what the thread prints:
Display from the main Display from the thread
RE = 1.000000 IM = -0.000000 RE = 1.000000 IM = -0.000000
RE = 0.999981 IM = -0.006136 RE = 0.399624 IM = 0.671559
RE = 0.999925 IM = -0.012272 RE = 0.416430 IM = 0.634393
RE = 0.999831 IM = -0.018407 RE = 0.433094 IM = 0.595699
RE = 0.999699 IM = -0.024541 RE = 0.449612 IM = 0.555570
RE = 0.999529 IM = -0.030675 RE = 0.465977 IM = 0.514103
RE = 0.999322 IM = -0.036807 RE = 0.482184 IM = 0.471397
RE = 0.999078 IM = -0.042938 RE = 0.498228 IM = 0.427555
RE = 0.998795 IM = -0.049068 // the same
RE = 0.998476 IM = -0.055195 // the same
RE = 0.998118 IM = -0.061321 // the same
RE = 0.997723 IM = -0.067444 // the same
RE = 0.997290 IM = -0.073565 // the same
RE = 0.996820 IM = -0.079682 // the same
RE = 0.996313 IM = -0.085797 // the same
RE = 0.995767 IM = -0.091909 // the same
Does anybody know the reason behind these erroneous results?
Given
__m256 *array_1;
__m256 *array_2;
__m256 *array_3;
#define ALIGNMENT 32
#define SIMD_STEP 8
void Init_arrays()
{
int i;
posix_memalign((void**) &array_1, ALIGNMENT, 32*sizeof(__m256));
posix_memalign((void**) &array_2, ALIGNMENT, 4 *sizeof(__m256));
posix_memalign((void**) &array_3, ALIGNMENT, 2 *sizeof(__m256));
.
.
.
This loop references array elements that are way out of range:
for (n = t_start; n < t_stop; n += 8)
{
.
.
.
T_fac1 = array_1[n];
T_fac2 = array_2[n];
T_fac3 = array_3[n];
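array_3 has only two elements (2 * sizeof(__m256) bytes were allocated), yet the index n is incremented by eight each iteration and runs up to t_stop, so every access after the first is out of bounds. The same applies to array_1 (32 elements) and array_2 (4 elements).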
Related
I'm wondering which intrinsics make my SIMD version slower than the plain matrix multiplication, and what I should do to make multiplication of large matrices faster using SIMD. Here we have matrixA[8][8], matrixB[8][8] and the result matrixC[8][8]. Because a NEON quad register holds at most four float32_t elements, I did two vmul and vadd operations per row, which does not seem well optimized. I work on an ARMv7-A Cortex-A8.
/*
 * matrixC_neon = matrixA * matrixB for the global 8x8 matrices.
 * Every row of B is loaded once with vld2q_f32 (val[0] = even columns,
 * val[1] = odd columns); vst2q_f32 re-interleaves on store, so the column
 * order of each result row is restored. Row i of C is accumulated as
 * A[i][0]*B[0] + A[i][1]*B[1] + ... in that order.
 */
void matrix_mult_neon (void)
{
    float32x4x2_t rowB[8];
    int i, k;

    for (k = 0; k < 8; k++)
    {
        rowB[k] = vld2q_f32(matrixB[k]);
    }

    for (i = 0; i < 8; i++)
    {
        float32x4x2_t acc;

        acc.val[0] = vmulq_n_f32(rowB[0].val[0], matrixA[i][0]);
        acc.val[1] = vmulq_n_f32(rowB[0].val[1], matrixA[i][0]);
        for (k = 1; k < 8; k++)
        {
            acc.val[0] = vaddq_f32(acc.val[0], vmulq_n_f32(rowB[k].val[0], matrixA[i][k]));
            acc.val[1] = vaddq_f32(acc.val[1], vmulq_n_f32(rowB[k].val[1], matrixA[i][k]));
        }
        vst2q_f32(matrixC_neon[i], acc);
    }
}
My normal matrix multiplication function:
/*
 * Scalar reference: matrixC = matrixA * matrixB for the global 8x8 matrices,
 * accumulating each element in a float over k = 0..7.
 */
void matrix_mult (void)
{
    int row, col, k;

    for (row = 0; row < 8; row++)
    {
        for (col = 0; col < 8; col++)
        {
            float acc = 0;

            for (k = 0; k < 8; k++)
            {
                acc += matrixA[row][k] * matrixB[k][col];
            }
            matrixC[row][col] = acc;
        }
    }
}
I use gettimeofday() function in the library <sys/time.h> to calculate time in nanoseconds.
The Problem:
aarch32 has a NEON register bank of the size 256bytes total
A 8x8 float matrix is already 256bytes large, and you need three of them. (768)
You have to read the matrix B "vertically", which means it's physically impossible to do it the "streaming" way for maximum data locality.
You do vector-scalar multiplication, which takes four times as long in total as vector-vector multiplication.
You load Mat A via VFP. And VFP on the Cortex-A8 in particular is unbelievably slow, in addition to the NEON<->VFP switching overhead. Unlike auto-vectorization, intrinsics do pretty much everything the way you tell them to. And you gave the wrong instruction.
The Solution:
We transpose matrix B and do dot-product math line by line.
I hope the code below works for you, and if performance is crucial, consider writing in assembly since compilers aren't very trustworthy when it comes to NEON performance, even in intrinsics.
/*
 * 8-element dot product of two vectors held as pairs of float32x4_t.
 * Lane 0 of the result is the full sum; lane 1 only holds the sum of the
 * two upper lanes of the intermediate 4-lane vector.
 */
static __always_inline float32x2_t dotProduct(float32x4x2_t input1, float32x4x2_t input2)
{
    /* element-wise products of both halves, then combined lane by lane */
    float32x4_t prodLo = vmulq_f32(input1.val[0], input2.val[0]);
    float32x4_t prodHi = vmulq_f32(input1.val[1], input2.val[1]);
    float32x4_t sum4 = vaddq_f32(prodLo, prodHi);
    float32x2_t upper = vget_high_f32(sum4);
    /* first pairwise add: {s0+s1, s2+s3}; second: {s0+s1+s2+s3, s2+s3} */
    float32x2_t folded = vpadd_f32(vget_low_f32(sum4), upper);

    return vpadd_f32(folded, upper);
}
/*
 * pDst = pMatA * pMatB, each pointer addressing a row-major 8x8 float matrix
 * (64 contiguous floats).
 *
 * The columns of B are gathered into vectors first, then every output
 * element is the dot product of one row of A with one column of B.
 *
 * Fixes over the previous version:
 *  - pMatB++ / pDst++ advanced by a single float although each vld4q_f32
 *    consumes 16 floats and each vst4_lane_f32 writes 4;
 *  - the vuzpq_f32 results were discarded, so B was never transposed
 *    (NEON intrinsics return their result instead of modifying arguments).
 * All of B is read before anything is stored, and each row of A is read
 * before the matching row of pDst is written.
 */
void matMulF_neon(float *pDst, float *pMatA, float *pMatB)
{
    float32x4x2_t col[8];
    float32x4x2_t row;
    float gather[8];
    uint32_t i, j, k;

    /* col[j] = column j of B: rows 0..3 in val[0], rows 4..7 in val[1] */
    for (j = 0; j < 8; j++)
    {
        for (k = 0; k < 8; k++)
        {
            gather[k] = pMatB[k * 8 + j];
        }
        col[j].val[0] = vld1q_f32(gather);
        col[j].val[1] = vld1q_f32(gather + 4);
    }

    for (i = 0; i < 8; i++)
    {
        row.val[0] = vld1q_f32(pMatA + i * 8);
        row.val[1] = vld1q_f32(pMatA + i * 8 + 4);
        for (j = 0; j < 8; j++)
        {
            /* lane 0 of dotProduct() carries the complete sum */
            vst1_lane_f32(pDst + i * 8 + j, dotProduct(col[j], row), 0);
        }
    }
}
/////////////////////////// EDIT
I checked the disassembly and the generated code is FUBAR. (Linaro GCC 7.1.1)
I'd go the assembly route. Writing NEON codes in intrinsics is pure waste of time IMO.
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 5 years ago.
Improve this question
I was hoping to create a twiddle table with a specific ordering for a specific purpose, here is the initial code:
#define TWIDDLE_LIMIT 64
#define PI 3.1415927
float *twiddle_real;
float *twiddle_imag;
/*
 * Build the first TWIDDLE_LIMIT twiddle factors
 * (cos, -sin of i * 2*PI / N with N = 256) and print them.
 * main returns int, the allocations are checked and released.
 */
int main()
{
    int N = 256;
    int TW_size = TWIDDLE_LIMIT + (TWIDDLE_LIMIT>>2);
    int i;

    twiddle_real = malloc(TW_size * sizeof *twiddle_real);
    twiddle_imag = malloc(TW_size * sizeof *twiddle_imag);
    if (twiddle_real == NULL || twiddle_imag == NULL)
    {
        fprintf(stderr, "out of memory\n");
        free(twiddle_real);
        free(twiddle_imag);
        return 1;
    }

    for(i=0; i<TWIDDLE_LIMIT; i++)
    {
        /* angle is evaluated in double, results are narrowed to float */
        double angle = i * 2.0 * PI / N;

        twiddle_real[i] = (float) cos(angle);
        twiddle_imag[i] = (float) -sin(angle);
    }
    for(i=0; i<TWIDDLE_LIMIT; i++)
        printf("RE = %f \t IM = %f \n",twiddle_real[i],twiddle_imag[i]);

    free(twiddle_real);
    free(twiddle_imag);
    return 0;
}
And i get this kind of result:
RE = 1.000000 IM = -0.000000 //64 lines
RE = 0.999699 IM = -0.024541
RE = 0.998795 IM = -0.049068
RE = 0.997290 IM = -0.073565
RE = 0.995185 IM = -0.098017
RE = 0.992480 IM = -0.122411
RE = 0.989177 IM = -0.146730
RE = 0.985278 IM = -0.170962
RE = 0.980785 IM = -0.195090
RE = 0.975702 IM = -0.219101
RE = 0.970031 IM = -0.242980
RE = 0.963776 IM = -0.266713
RE = 0.956940 IM = -0.290285
RE = 0.949528 IM = -0.313682
RE = 0.941544 IM = -0.336890
RE = 0.932993 IM = -0.359895
RE = 0.923880 IM = -0.382683
RE = 0.914210 IM = -0.405241
RE = 0.903989 IM = -0.427555
RE = 0.893224 IM = -0.449611
RE = 0.881921 IM = -0.471397
RE = 0.870087 IM = -0.492898
RE = 0.857729 IM = -0.514103
RE = 0.844854 IM = -0.534998
RE = 0.831470 IM = -0.555570
RE = 0.817585 IM = -0.575808
RE = 0.803208 IM = -0.595699
RE = 0.788346 IM = -0.615232
RE = 0.773010 IM = -0.634393
RE = 0.757209 IM = -0.653173
RE = 0.740951 IM = -0.671559
RE = 0.724247 IM = -0.689541
RE = 0.707107 IM = -0.707107
RE = 0.689541 IM = -0.724247
RE = 0.671559 IM = -0.740951
RE = 0.653173 IM = -0.757209
RE = 0.634393 IM = -0.773010
RE = 0.615232 IM = -0.788346
RE = 0.595699 IM = -0.803208
RE = 0.575808 IM = -0.817585
RE = 0.555570 IM = -0.831470
RE = 0.534998 IM = -0.844854
RE = 0.514103 IM = -0.857729
RE = 0.492898 IM = -0.870087
RE = 0.471397 IM = -0.881921
RE = 0.449611 IM = -0.893224
RE = 0.427555 IM = -0.903989
RE = 0.405241 IM = -0.914210
RE = 0.382683 IM = -0.923880
RE = 0.359895 IM = -0.932993
RE = 0.336890 IM = -0.941544
RE = 0.313682 IM = -0.949528
RE = 0.290285 IM = -0.956940
RE = 0.266713 IM = -0.963776
RE = 0.242980 IM = -0.970031
RE = 0.219101 IM = -0.975702
RE = 0.195090 IM = -0.980785
RE = 0.170962 IM = -0.985278
RE = 0.146730 IM = -0.989177
RE = 0.122411 IM = -0.992480
RE = 0.098017 IM = -0.995185
RE = 0.073565 IM = -0.997290
RE = 0.049068 IM = -0.998795
RE = 0.024541 IM = -0.999699
This is only a minimal example, to explain the problem as clearly as possible.
What I want (and have tried) to create is a table that begins with the set of lines above, with the rest laid out as follows:
(earlier set of lines) expressed as following(idx:0 --- > 64)
re[idx] = (float) cos((float)i * (2*pi)/(float)N);
im[idx] = (float)-sin((float)i * (2*pi)/(float)N);
(2nd set of lines) expressed as following repeated 4 times consequently
re[idx] = (float) cos(4 * (float)i * (2*pi)/(float)N);
im[idx] = (float)-sin(4 * (float)i * (2*pi)/(float)N);
(3nd set of lines) expressed as following repeated 16 times consequently
re[idx] = (float) cos(16 * (float)i * (2*pi)/(float)N);
im[idx] = (float)-sin(16 * (float)i * (2*pi)/(float)N);
The result is expected to be as following:
//set 1 as above repeated just 1 time
// ....
//set 2 repeated 4 times
RE = 1.000000 IM = -0.000000 //1st ligne of set 1
RE = 0.995185 IM = -0.098017 //4th ligne of set 1
RE = 0.980785 IM = -0.195090 //8th ligne of set 1
RE = 0.956940 IM = -0.290285 //12th ligne of set 1 ...
RE = 0.923880 IM = -0.382683
RE = 0.881921 IM = -0.471397
RE = 0.831470 IM = -0.555570
RE = 0.773010 IM = -0.634393
RE = 0.707107 IM = -0.707107
RE = 0.634393 IM = -0.773010
RE = 0.555570 IM = -0.831470
RE = 0.471397 IM = -0.881921
RE = 0.382683 IM = -0.923880
RE = 0.290285 IM = -0.956940
RE = 0.195090 IM = -0.980785
RE = 0.098017 IM = -0.995185
// set 3 repeated 16 times
RE = 1.000000 IM = -0.000000 //1st ligne of set 1
RE = 0.923880 IM = -0.382683 //16th ligne of set 1
RE = 0.707107 IM = -0.707107 //38th ligne of set 1
RE = 0.382683 IM = -0.923880 //64th ligne of set 1
I've tried several times but I keep getting wrong results; I don't know if it's a precision issue or something else.
You might maintain factor and set size in an additional (outer) loop:
// Fragment: fills twiddle_real/twiddle_imag segment by segment, 64 entries
// per outer pass; the angle factor is multiplied by 4 after every pass.
// With TWIDDLE_LIMIT == 64 the outer loop runs for size = 64, 16 and 4.
// you will need more than you calculated previously!
float twiddle_real[TWIDDLE_LIMIT * 3];
float twiddle_imag[TWIDDLE_LIMIT * 3];
// pointer arithmetic...
float* real = twiddle_real;
float* imag = twiddle_imag;
double factor = 1.0;
for(int size = TWIDDLE_LIMIT; size > 1; size /= 4)
{
for(int j = 0; j < 64; ++j)
{
// j % size restarts the angle index every `size` entries
*real++ = (float) cos((j % size) * factor * 2.0 * PI / N);
*imag++ = (float) -sin((j % size) * factor * 2.0 * PI / N);
}
factor *= 4.0;
}
You do not need all those casts by the way - since factor is a double, (j % size) is converted to implicitly, as well as N afterwards.
Recommendation: As re and img belong together, I would represent them as such:
/* One complex value, keeping the real and imaginary parts together. */
struct Complex
{
double re; /* real part */
double im; /* imaginary part */
};
You could then have an array of these:
struct Complex twiddle[TW_size]; // no need for malloc, by the way...
You might have noticed: I changed to double, too. There is no reason to use float (with more limited precision) unless you have limited memory available (micro controllers)...
Alternative (why re-inventing the wheel?): use complex.h.
It doesn't work because your math is wrong. The first table covers only the first quadrant of the complex plane. When you multiply by 4 for the second segment of your desired output, you won't get 4 repeats of the first quadrant, you'll get all four quadrants.
Here is something closer to what you specified. And it fixes some problems with your use of C:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N_ROOTS_OF_UNITY 64
#define PI 3.14159265358979323846264338327950
float *twiddle_real;
float *twiddle_imag;
/*
 * Fill a table of 3 * N_ROOTS_OF_UNITY twiddle factors and print it.
 * For incr = 1, 4, 16 the first-quadrant angles are sampled with step incr
 * and that sub-sequence is written incr times, so every pass emits
 * N_ROOTS_OF_UNITY entries.
 * The allocations are checked and freed, and the print loop uses a size_t
 * index to match table_size.
 */
int main()
{
    size_t table_size = N_ROOTS_OF_UNITY * 3;
    size_t i, incr, repeat, k = 0;

    twiddle_real = malloc(table_size * sizeof *twiddle_real);
    twiddle_imag = malloc(table_size * sizeof *twiddle_imag);
    if (twiddle_real == NULL || twiddle_imag == NULL) {
        fprintf(stderr, "out of memory\n");
        free(twiddle_real);
        free(twiddle_imag);
        return 1;
    }

    for (incr = 1; incr <= 16; incr *= 4) {
        for (repeat = 0; repeat < incr; ++repeat) {
            for (i = 0; i < N_ROOTS_OF_UNITY; i += incr) {
                twiddle_real[k] = (float) cos(i * (PI / 2) / N_ROOTS_OF_UNITY);
                twiddle_imag[k] = (float) -sin(i * (PI / 2) / N_ROOTS_OF_UNITY);
                ++k;
            }
        }
    }

    for (i = 0; i < table_size; i++)
        printf("%zu: RE = %f\tIM = %f\n", i, twiddle_real[i], twiddle_imag[i]);

    free(twiddle_real);
    free(twiddle_imag);
    return 0;
}
I want to use 2D texture memory for double-precision data. I want to read from the texture into shared memory, convert each int2 to a double, and then transfer the result back to host memory. But only the first row comes back as desired; every value in all the other rows is 2.00000000.
#include<stdio.h>
#include<cuda.h>
#define Xdim 8
#define Ydim 8
texture<int2,2>me_texture;
// Reassembles a double from an int2 texel: p.y supplies the high 32 bits
// and p.x the low 32 bits passed to __hiloint2double.
static __inline__ __device__ double fetch_double(int2 p){
    const int hi = p.y;
    const int lo = p.x;
    return __hiloint2double(hi, lo);
}
// Copies the Xdim x Ydim texture me_texture through shared memory into o.
//   o     - output buffer allocated with cudaMallocPitch
//   pitch - row stride of o in doubles (the host passes pitch_bytes / sizeof(double))
// Rows of o are addressed with `pitch`; using Xdim as the stride did not
// match the pitched layout the host copies back from.
// A is indexed by threadIdx, so the block must be at most Xdim x Ydim threads.
__global__ void kern(double *o, int pitch){
    __shared__ double A[Xdim][Ydim];
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
    const bool inside = (i < Xdim && j < Ydim);

    if(inside){
        int2 jj = tex2D(me_texture, i, j);
        A[threadIdx.x][threadIdx.y] = fetch_double(jj);
    }
    // every thread of the block reaches the barrier, including those outside the matrix
    __syncthreads();
    if(inside){
        o[j*pitch + i] = A[threadIdx.x][threadIdx.y];
    }
}
// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t err, const char *what){
    if(err == cudaSuccess){
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Uploads an 8x8 double matrix, reads it back through an int2 texture via
// kern, and prints the input and the result.
// Changes from the previous version:
//  - the kernel is launched with an Xdim x Ydim (2-D) block, since kern
//    derives the row from threadIdx.y;
//  - the texture uses cudaFilterModePoint (linear filtering is only valid
//    for floating-point textures);
//  - every runtime call is checked, and each pitched allocation keeps its
//    own pitch.
// Returns 0 on success, 1 if any CUDA call failed.
int main(int argc, char *argv[]){
    // Same values as before, written as an initializer table; the integral
    // entries carry ".0" so they are double literals.
    double hbuf[Xdim][Ydim] = {
        {1.234567891234567, 12.34567891234567, 123.4567891234567, 1234.567891234567,
         12345.67891234567, 123456.7891234567, 1234567.891234567, 12345678.91234567},
        {123456789.1234567, 1234567891.234567, 12345678912.34567, 123456789123.4567,
         1234567891234.567, 12345678912345.67, 123456789123456.7, 1234567891234567.0},
        {123456789.7654321, 1234567897.654321, 12345678976.54321, 123456789765.4321,
         1234567897654.321, 12345678976543.21, 123456789765432.1, 1234567897654321.0},
        {9.876543211234567, 98.76543211234567, 987.6543211234567, 9876.543211234567,
         98765.43211234567, 987654.3211234567, 9876543.211234567, 98765432.11234567},
        {987654321.1234567, 9876543211.234567, 98765432112.34567, 987654321123.4567,
         9876543211234.567, 98765432112345.67, 987654321123456.7, 9876543211234567.0},
        {987654321.7654321, 9876543217.654321, 98765432176.54321, 987654321765.4321,
         9876543217654.321, 98765432176543.21, 987654321765432.1, 9876543217654321.0},
        {1234567891234567.0, 123456789123456.7, 12345678912345.67, 1234567891234.567,
         123456789123.4567, 12345678912.34567, 1234567891.234567, 123456789.1234567},
        {12345678.91234567, 1234567.891234567, 123456.7891234567, 12345.67891234567,
         1234.567891234567, 123.4567891234567, 12.34567891234567, 1.234567891234567}
    };
    double hout[Xdim][Ydim];
    double *dob = NULL;
    double *dbuf = NULL;
    size_t in_pitch_bytes = 0;
    size_t out_pitch_bytes = 0;
    bool ok;

    for (int i=0; i<Xdim; i++){
        for(int j=0; j<Ydim; j++){
            printf("%.16f\t", hbuf[i][j]);
        }
        printf("\n");
    }

    ok = cudaOk(cudaMallocPitch((void**)&dbuf, &in_pitch_bytes, sizeof(double)*Xdim, Ydim), "cudaMallocPitch(dbuf)")
      && cudaOk(cudaMallocPitch((void**)&dob, &out_pitch_bytes, sizeof(double)*Xdim, Ydim), "cudaMallocPitch(dob)")
      && cudaOk(cudaMemcpy2D(dbuf, in_pitch_bytes, hbuf, Xdim*sizeof(double), Xdim*sizeof(double), Ydim, cudaMemcpyHostToDevice), "cudaMemcpy2D(host to device)");

    if(ok){
        me_texture.addressMode[0] = cudaAddressModeClamp;
        me_texture.addressMode[1] = cudaAddressModeClamp;
        me_texture.filterMode = cudaFilterModePoint;
        me_texture.normalized = false;
        ok = cudaOk(cudaBindTexture2D(0, me_texture, dbuf, cudaCreateChannelDesc(32,32,0,0, cudaChannelFormatKindSigned), Xdim, Ydim, in_pitch_bytes), "cudaBindTexture2D");
    }

    if(ok){
        // kern expects the output row stride in doubles
        int pitch = (int)(out_pitch_bytes/sizeof(double));
        dim3 threadsPerBlock(Xdim, Ydim);
        kern<<<1, threadsPerBlock>>>(dob, pitch);
        ok = cudaOk(cudaGetLastError(), "kern launch")
          && cudaOk(cudaDeviceSynchronize(), "kern execution")
          && cudaOk(cudaMemcpy2D(hout,Xdim*sizeof(double), dob, out_pitch_bytes, Xdim*sizeof(double),Ydim, cudaMemcpyDeviceToHost), "cudaMemcpy2D(device to host)");
    }

    if(ok){
        printf("\nI am Fine\n");
        for(int i = 0 ; i < Xdim ; i++){
            for(int j=0; j<Ydim; j++){
                printf("%.16f\t", hout[i][j]);
            }
            printf("\n");
        }
    }

    cudaUnbindTexture(me_texture);
    cudaFree(dbuf);
    cudaFree(dob);
    return ok ? 0 : 1;
}
Above code work fine if you change the following things.
Replace
kern<<<1, 64>>>(..., ..)
to
dim3 blockPerGrid(1, 1)
dim3 threadPerBlock(8, 8)
kern<<<blockPerGrid, threadPerBlock>>>(....)
here in place of Xdim change it to pitch
o[j*pitch + i] = A[threadIdx.x][threadIdx.y];
And change cudaFilterModeLinear to cudaFilterModePoint .
For the compilation you need to specify the computing capability, suppose your compute capability ie 3.0 then it would be
nvcc -arch=sm_30 file.cu
If your code contained error checking, you would realise that your kernel launch is failing with an invalid filter mode. It isn't legal in CUDA to use a cudaFilterModeLinear with non-float types, so nothing is actually running. If you change the filter mode to cudaFilterModePoint, you might find things start working.
I have been trying to get KissFFT to work on a dsPIC; however, after trying various approaches, the output is not what it should be. I was hoping to get some help to see whether there are any configuration settings I may be overlooking, or whether it's just something I haven't thought of.
I am using a dsPIC33EP256MC202 with the XC16 compiler within MPLABX.
Declarations and memory assignment.
/* Buffers and FFT configuration for a `size`-point complex transform. */
int readings[3] = {0, 0, 0};
kiss_fft_scalar zero;
memset(&zero,0,sizeof(zero));
int size = 128 * 2;
float fin[256];
kiss_fft_cpx in[size];
kiss_fft_cpx out[size];
for (i = 0; i < size; i++) {
    in[i].r = zero;
    in[i].i = zero;
    out[i].r = zero;
    out[i].i = zero;
}
/* The first argument is the transform length and must equal the number of
 * elements in in[]/out[]. Allocating for size*2 points made kiss_fft()
 * process twice as many elements as the buffers hold.
 * NOTE(review): kiss_fft_alloc returns NULL when its allocation fails;
 * check mycfg before calling kiss_fft. */
kiss_fft_cfg mycfg = kiss_fft_alloc(size ,0 ,NULL,NULL);
Get readings from an accelerometer on the breadboard and populate the float array (using Pythagoras to consolidate the 3 axes into one signal). The input XYZ values are scaled down, as they come in anywhere between -2400 and 2400 on average.
while(1)
{
if(iii <= 1){
UART_Write_Text("Collecting...");
}
getOutput(readings);
X = (double)readings[0];
Y = (double)readings[1];
Z = (double)readings[2];
X = X / 50;
Y = Y / 50;
Z = Z / 50;
if(ii <= 256){
fin[ii] = sqrt(X*X + Y*Y + Z*Z);
ii++;
}
else{
i=0;
while(i<255){
fin[i] = fin[i+1];
i++;
}
fin[255] = sqrt(X*X + Y*Y + Z*Z);
}
Once the float array is full of values, populate the real component of the input complex array with the values in the float array. Then perform the Kiss FFT and populate a float array (arrayDFTOUT) with the absolute value of each real and imaginary value of the out array of Kiss FFT, the final loop makes any negative value positive.
if(iii == 255){
iii = 0;
UART_Write_Text("Processing...");
for (i = 0; i < size; i++) {
// samples are type of short
in[i].r = fin[i];
in[i].i = zero;
out[i].r = zero;
out[i].i = zero;
}
kiss_fft(mycfg, in, out);
for(i=0;i<128;i++){
arrayDFTOUT[i] = sqrt((out[i].r*out[i].r) + (out[i].i*out[i].i));
}
arrayDFTOUT[0] = 1;
for(i = 0; i<128; i++){
if(arrayDFTOUT[i] < 0){
arrayDFTOUT[i] = arrayDFTOUT[i] - (arrayDFTOUT[i]*2);
}
}
Finally display the output values through serial using the UART on the breadboard.
for(i = 0; i < 128; i++){
sprintf(temp, "%f,", arrayDFTOUT[i]);
UART_Write_Text(temp);
}
And here are the results: all zeros apart from the first value, which was set to 1 after KissFFT had been performed. Any ideas?
To solve Project Euler problem 20 (find the sum of the digits of 100!) I am running the following program. It works for factorials of small numbers but not for 100. Which data type should I use, or is it necessary to use an array for storing the digits?
int rec(int);
/*
 * Project Euler 20: print the sum of the decimal digits of 100!.
 *
 * An int cannot hold 100! (158 decimal digits), so the factorial is kept
 * as an array of decimal digits, least significant first, and multiplied
 * by 2..100 with schoolbook carry propagation.
 */
int main(void)
{
    unsigned char digit[160] = {1};   /* value 1; room for 158 digits */
    int len = 1;                      /* digits currently in use */
    int i, j, carry, s = 0;

    for (i = 2; i <= 100; i++)
    {
        carry = 0;
        for (j = 0; j < len; j++)
        {
            int v = digit[j] * i + carry;

            digit[j] = (unsigned char)(v % 10);
            carry = v / 10;
        }
        /* append whatever is left of the carry as new leading digits */
        while (carry != 0)
        {
            digit[len++] = (unsigned char)(carry % 10);
            carry /= 10;
        }
    }

    for (j = 0; j < len; j++)
    {
        s = s + digit[j];
    }
    printf("\n%d",s);
    return 0;
}
/*
 * Strip all trailing decimal zeros from t and return the result.
 * The previous version discarded the result of its recursive call, so at
 * most one zero was removed, and it recursed without end for t == 0.
 * Zero is returned unchanged.
 */
int rec(int t)
{
    while (t != 0 && t % 10 == 0)
    {
        t = t / 10;
    }
    return t;
}
Approximate factorial of 100 can be calculated using the double type. You can also use the Stirling's formula, stating that
n! ≈ sqrt(2*M_PI*n) * pow(n/exp(1), n)
If you plug in n = 100, you'll get 100! ≈ 9*10^157. That means your type needs to be able to hold 158 decimal digits or, equivalently, about log2(9*10^157) ≈ 525 bits, i.e. 66 8-bit bytes.
No fundamental numeric type in C is big enough. The largest you are guaranteed to get is 64 bits (if you use unsigned long long).
So, if you want to calculate n! in C, you either need to construct long arithmetic multiplication by hand or use a special library that can do that for you.
For this relatively simple task you can actually implement long multiplication and use it to get the factorial value by repeated multiplication.
In the following program I've used an in-place multiplication algorithm, which modifies one of the multiplicands in the process and eventually replaces it with the product. The algorithm can be derived directly from the long multiplication known from school.
This program calculates factorials of integers from 1 up to and including 100:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
typedef unsigned char uint8;
typedef unsigned short uint16;
#if UINT_MAX >= 0xFFFFFFFF
typedef unsigned uint32;
#else
typedef unsigned long uint32;
#endif
typedef unsigned uint;
/*
 * In-place long multiplication on n-byte numbers: dst = dst * src, keeping
 * only the low n bytes of the product. Byte 0 is the least significant
 * byte (main() stores the value 1 as factor[0] = 1).
 * Aborts when n >= 0xFFFF.
 */
void MulInPlace(uint8* dst/* n bytes */,
const uint8* src/* n bytes */,
uint n)
{
uint c1, c2;
if (n >= 0xFFFF) abort();
/* Walk result bytes from the most significant down; the loop ends when
   c1 wraps around to ~0u after reaching 0. Bytes below c1 still hold the
   original dst when they are read. */
for (c1 = n - 1; c1 != ~0u; c1--)
{
uint16 s = 0;
uint32 p = 0; // p must be able to store ceil(log2(n))+2*8 bits
/* p = sum of dst[c2] * src[c1 - c2] over c2 = c1 .. 0 */
for (c2 = c1; c2 != ~0u; c2--)
{
p += dst[c2] * src[c1 - c2];
}
dst[c1] = (uint8)(p & 0xFF);
/* add the remaining bytes of p into the higher result bytes, with s
   carrying between them */
for (c2 = c1 + 1; c2 < n; c2++)
{
p >>= 8;
s += dst[c2] + (uint8)(p & 0xFF);
dst[c2] = (uint8)(s & 0xFF);
s >>= 8;
}
}
}
/*
 * Divide the n-byte number in dst (byte 0 least significant) by a one-byte
 * divisor in place, walking from the most significant byte down.
 * If remainder is not NULL it receives the remainder of the division.
 * Returns non-zero if the quotient is non-zero, 0 otherwise.
 */
int ByteDivInPlace(uint8* dst/* n bytes */,
                   uint n,
                   uint8 divisor,
                   uint8* remainder)
{
    uint carry = 0;   /* running remainder, already shifted up one byte */
    int any = 0;
    uint idx;

    for (idx = n; idx-- > 0; )
    {
        uint cur = carry + dst[idx];
        uint8 q = (uint8)(cur / divisor);

        dst[idx] = q;
        any |= q;
        carry = (cur % divisor) << 8;
    }

    if (remainder)
    {
        /* undo the shift applied after the last byte */
        *remainder = (uint8)(carry >> 8);
    }
    return any;
}
/*
 * Add 1 to the n-byte number in dst (byte 0 least significant), stopping as
 * soon as no carry remains. A carry out of the last byte is dropped.
 */
void IncInPlace(uint8* dst/* n bytes */,
                uint n)
{
    uint carry = 1;
    uint idx;

    for (idx = 0; idx < n && carry != 0; idx++)
    {
        carry += dst[idx];
        dst[idx] = carry & 0xFF;
        carry >>= 8;
    }
}
/*
 * Print the n-byte number in dst in decimal. The number is consumed: each
 * call divides dst by 10 in place, recurses while the quotient is non-zero,
 * and prints its own remainder afterwards, so digits come out most
 * significant first.
 */
void DestroyingDecimalPrint(uint8* dst, uint n)
{
    uint8 digit;
    int more = ByteDivInPlace(dst, n, 10, &digit);

    if (more != 0)
    {
        DestroyingDecimalPrint(dst, n);
    }
    printf("%d", digit);
}
/*
 * Print i! for i = 1..100 using 66-byte numbers (byte 0 least significant).
 * `factor` counts 1, 2, 3, ... and `factorial` accumulates the product;
 * printing works on a scratch copy because it destroys its argument.
 */
int main(void)
{
    uint8 factorial[66] = {1};              /* running product, starts at 1 */
    uint8 factor[sizeof(factorial)] = {1};  /* next multiplier, starts at 1 */
    uint8 tmp[sizeof(factorial)];
    int i = 1;

    while (i <= 100)
    {
        MulInPlace(factorial, factor, sizeof(factorial));

        memcpy(tmp, factorial, sizeof(factorial));
        printf("%i! = ", i);
        DestroyingDecimalPrint(tmp, sizeof(tmp));
        printf("\n");

        IncInPlace(factor, sizeof(factor));
        i++;
    }
    return 0;
}
Output (ideone):
1! = 1
2! = 2
3! = 6
4! = 24
5! = 120
6! = 720
7! = 5040
8! = 40320
9! = 362880
10! = 3628800
11! = 39916800
12! = 479001600
13! = 6227020800
14! = 87178291200
15! = 1307674368000
16! = 20922789888000
17! = 355687428096000
18! = 6402373705728000
19! = 121645100408832000
20! = 2432902008176640000
21! = 51090942171709440000
22! = 1124000727777607680000
23! = 25852016738884976640000
24! = 620448401733239439360000
25! = 15511210043330985984000000
26! = 403291461126605635584000000
27! = 10888869450418352160768000000
28! = 304888344611713860501504000000
29! = 8841761993739701954543616000000
30! = 265252859812191058636308480000000
31! = 8222838654177922817725562880000000
32! = 263130836933693530167218012160000000
33! = 8683317618811886495518194401280000000
34! = 295232799039604140847618609643520000000
35! = 10333147966386144929666651337523200000000
36! = 371993326789901217467999448150835200000000
37! = 13763753091226345046315979581580902400000000
38! = 523022617466601111760007224100074291200000000
39! = 20397882081197443358640281739902897356800000000
40! = 815915283247897734345611269596115894272000000000
41! = 33452526613163807108170062053440751665152000000000
42! = 1405006117752879898543142606244511569936384000000000
43! = 60415263063373835637355132068513997507264512000000000
44! = 2658271574788448768043625811014615890319638528000000000
45! = 119622220865480194561963161495657715064383733760000000000
46! = 5502622159812088949850305428800254892961651752960000000000
47! = 258623241511168180642964355153611979969197632389120000000000
48! = 12413915592536072670862289047373375038521486354677760000000000
49! = 608281864034267560872252163321295376887552831379210240000000000
50! = 30414093201713378043612608166064768844377641568960512000000000000
51! = 1551118753287382280224243016469303211063259720016986112000000000000
52! = 80658175170943878571660636856403766975289505440883277824000000000000
53! = 4274883284060025564298013753389399649690343788366813724672000000000000
54! = 230843697339241380472092742683027581083278564571807941132288000000000000
55! = 12696403353658275925965100847566516959580321051449436762275840000000000000
56! = 710998587804863451854045647463724949736497978881168458687447040000000000000
57! = 40526919504877216755680601905432322134980384796226602145184481280000000000000
58! = 2350561331282878571829474910515074683828862318181142924420699914240000000000000
59! = 138683118545689835737939019720389406345902876772687432540821294940160000000000000
60! = 8320987112741390144276341183223364380754172606361245952449277696409600000000000000
61! = 507580213877224798800856812176625227226004528988036003099405939480985600000000000000
62! = 31469973260387937525653122354950764088012280797258232192163168247821107200000000000000
63! = 1982608315404440064116146708361898137544773690227268628106279599612729753600000000000000
64! = 126886932185884164103433389335161480802865516174545192198801894375214704230400000000000000
65! = 8247650592082470666723170306785496252186258551345437492922123134388955774976000000000000000
66! = 544344939077443064003729240247842752644293064388798874532860126869671081148416000000000000000
67! = 36471110918188685288249859096605464427167635314049524593701628500267962436943872000000000000000
68! = 2480035542436830599600990418569171581047399201355367672371710738018221445712183296000000000000000
69! = 171122452428141311372468338881272839092270544893520369393648040923257279754140647424000000000000000
70! = 11978571669969891796072783721689098736458938142546425857555362864628009582789845319680000000000000000
71! = 850478588567862317521167644239926010288584608120796235886430763388588680378079017697280000000000000000
72! = 61234458376886086861524070385274672740778091784697328983823014963978384987221689274204160000000000000000
73! = 4470115461512684340891257138125051110076800700282905015819080092370422104067183317016903680000000000000000
74! = 330788544151938641225953028221253782145683251820934971170611926835411235700971565459250872320000000000000000
75! = 24809140811395398091946477116594033660926243886570122837795894512655842677572867409443815424000000000000000000
76! = 1885494701666050254987932260861146558230394535379329335672487982961844043495537923117729972224000000000000000000
77! = 145183092028285869634070784086308284983740379224208358846781574688061991349156420080065207861248000000000000000000
78! = 11324281178206297831457521158732046228731749579488251990048962825668835325234200766245086213177344000000000000000000
79! = 894618213078297528685144171539831652069808216779571907213868063227837990693501860533361810841010176000000000000000000
80! = 71569457046263802294811533723186532165584657342365752577109445058227039255480148842668944867280814080000000000000000000
81! = 5797126020747367985879734231578109105412357244731625958745865049716390179693892056256184534249745940480000000000000000000
82! = 475364333701284174842138206989404946643813294067993328617160934076743994734899148613007131808479167119360000000000000000000
83! = 39455239697206586511897471180120610571436503407643446275224357528369751562996629334879591940103770870906880000000000000000000
84! = 3314240134565353266999387579130131288000666286242049487118846032383059131291716864129885722968716753156177920000000000000000000
85! = 281710411438055027694947944226061159480056634330574206405101912752560026159795933451040286452340924018275123200000000000000000000
86! = 24227095383672732381765523203441259715284870552429381750838764496720162249742450276789464634901319465571660595200000000000000000000
87! = 2107757298379527717213600518699389595229783738061356212322972511214654115727593174080683423236414793504734471782400000000000000000000
88! = 185482642257398439114796845645546284380220968949399346684421580986889562184028199319100141244804501828416633516851200000000000000000000
89! = 16507955160908461081216919262453619309839666236496541854913520707833171034378509739399912570787600662729080382999756800000000000000000000
90! = 1485715964481761497309522733620825737885569961284688766942216863704985393094065876545992131370884059645617234469978112000000000000000000000
91! = 135200152767840296255166568759495142147586866476906677791741734597153670771559994765685283954750449427751168336768008192000000000000000000000
92! = 12438414054641307255475324325873553077577991715875414356840239582938137710983519518443046123837041347353107486982656753664000000000000000000000
93! = 1156772507081641574759205162306240436214753229576413535186142281213246807121467315215203289516844845303838996289387078090752000000000000000000000
94! = 108736615665674308027365285256786601004186803580182872307497374434045199869417927630229109214583415458560865651202385340530688000000000000000000000
95! = 10329978488239059262599702099394727095397746340117372869212250571234293987594703124871765375385424468563282236864226607350415360000000000000000000000
96! = 991677934870949689209571401541893801158183648651267795444376054838492222809091499987689476037000748982075094738965754305639874560000000000000000000000
97! = 96192759682482119853328425949563698712343813919172976158104477319333745612481875498805879175589072651261284189679678167647067832320000000000000000000000
98! = 9426890448883247745626185743057242473809693764078951663494238777294707070023223798882976159207729119823605850588608460429412647567360000000000000000000000
99! = 933262154439441526816992388562667004907159682643816214685929638952175999932299156089414639761565182862536979208272237582511852109168640000000000000000000000
100! = 93326215443944152681699238856266700490715968264381621468592963895217599993229915608941463976156518286253697920827223758251185210916864000000000000000000000000
You should look for overflow, print the value after each iteration.
Note that rec(t); doesn't do anything as it doesn't use the returned value... you want t = rec(t);.
int is definitely too short, try long long... if that's still overflowing, you need another data structure.. eg: GMP Library.
Note: using some "proper" language for the job might give you some insight to the range you have to support... e.g. with python:
>>> import math
>>> math.factorial(100)
93326215443944152681699238856266700490715968264381621468592963895217599993229915608941463976156518286253697920827223758251185210916864000000000000000000000000L
// Computes 100! as a decimal string by multiplying "100" by 99, 98, ..., 2,
// then prints the sum of its digits and waits for a line of input.
private static void problem20()
{
    string running = "100";
    int factor = 99;
    while (factor > 1)
    {
        running = getproduct(running, factor);
        factor--;
    }

    int sum = 0;
    foreach (char ch in running)
    {
        sum += ch - '0';
    }

    Console.WriteLine("sum is {0}", sum);
    Console.ReadLine();
}
// Multiplies the decimal string `multiplent` by `multiplier` and returns the
// product as a decimal string. Digits are processed right to left; every
// position except the leftmost contributes one digit, and the leftmost
// position contributes its whole value including the carry.
private static string getproduct(string multiplent, int multiplier)
{
    var digits = new StringBuilder();
    int carry = 0;
    int current = 0;

    for (int pos = multiplent.Length - 1; pos >= 0; pos--)
    {
        current = (multiplent[pos] - '0') * multiplier + carry;
        carry = current / 10;
        if (pos > 0)
        {
            digits.Insert(0, current % 10);
        }
    }

    digits.Insert(0, current);
    return digits.ToString();
}