I need to implement a transfer function from direct form II. Here's a description of what direct form II is. I'm allowed to choose what coefficents are to be included.
So far my code looks like this:
int main(void)
{
int l, m;
l = dirii2(2); // l = -82;
//m = directii(2);
return l;
}
int dirii2(int x)
{
int y = 0; // output // 3 is constant of array size.
static int v[3] = {6,4,0};
int b[3] = {3,5,2}, a[2] = {3,6};
int q0, q1, q2;
for (int i = 0; i < 3 ; i++) {
q0 = x; // q0 = x = 2
q1 = a[2-2]*v[2-1]; // q1 = 3 * 6 = 18
q2 = a[2-1]*v[2-2]; // q2 = 6 * 4 = 24
v[2] = q0 - q1 - q2;// v(n) = x(n) - a1*v(n-1) - a2v(n-2); // v = 2 - 18 - 24 = -40
y =+ b[i]*v[2-i];// + b[1]*v[2-1]+b[2]*v[2-2];
v[0] = v[1];
v[1] = v[2];
}
return y;
}
When I run the code the result y gains or loses values rapidly like it's getting unstable:
First iteration y becomes -138.
Second iteration y becomes 230.
Third iteration y becomes -92.
Did I write the code right?
Related
I'm rearranging an array in my project on ARMv7. Now I get the elements' address d[] in the order I expect. To make the code more efficient, I want to use neon intrinstics in C++. Now my problem is, I can load the address array d[] by using vld1q_s32(), but I do not know how to read the elements of this vector as addresses.The instructions I know can only simply duplicate one vector.
This problem has been confusing me for several days. Or neon cannot do certain thing?
Thanks for your answering.
Here is my code:
void InputRearrange(int8_t* din, int8_t* dout, const int x, const int y){
int8_t* dout_array[16];
int out = 0;
dout_array[0] = din;
for(int n = 1; n < 16; n++) {//get the address of the first line in z-axis
dout_array[n] = dout_array[n - 1] + x*y;
}
for(int y_count = 0; y_count < y; y_count++) {
for(int x_count = 0; x_count < x; x_count++) {
for(int z_count = 0; z_count < 16; z_count++) {
dout[out++] = *(dout_array[k]++);//dout_array[k]++ let dout_array[k] moves in x-axis and I want to change this loop into neon intrinsics.
}
}
}
}
}
din[ ] is the original array and is like a 3-D array as a cube but stored as a 1-D one. The cube has three axis: x, y , z(=16). The original way array din[ ] stores the elements from x-axis first and then y-axis and last z-axis. But my code changed the order to z-axis first and then x-axis and last y-axis. I would like to use neon intrinsics in the final for loop, but it seems that it cannot be realized.
Your code rearranges a three-dimensional array int8_t (&output)[y][x][16] to int8_t (&input)[16][y][x], which is also equivalent to transposing a 2d array of int8_t (&out)[x*y][16] to int8_t (&in)[16][x*y].
This can definitely benefit from arm neon intrinsics that can interleave/deinterleave either registers (vzip,vuzp) or memory content (vldN, vstN).
// planarizes next 128 bytes to 16 planes
void planarize(int8_t *in, int8_t *out, int xy) {
int8_t * o_1 = out + 4*xy;
int8_t * o_2 = out + 8*xy;
int8_t * o_3 = out + 12*xy;
int8x16x4_t a = vld4q_s8(in); in+=64;
int8x16x4_t b = vld4q_s8(in); in+=64;
int8x16x2_t c = vuzpq_s8(a.val[0], b.val[0]);
int8x16x2_t d = vuzpq_s8(a.val[1], b.val[1]);
int8x16x2_t e = vuzpq_s8(a.val[2], b.val[2]);
int8x16x2_t f = vuzpq_s8(a.val[3], b.val[3]);
c = vuzpq_s8(c.val[0], c.val[1]);
d = vuzpq_s8(d.val[0], d.val[1]);
e = vuzpq_s8(e.val[0], e.val[1]);
f = vuzpq_s8(f.val[0], f.val[1]);
// now c = 0 16 32 48 64 80 96 112 4 20 36 52 68 84 100 116
// 8 24 40 56 72 88 104 120 12 28 44 60 76 92 108 124
// d = c + 1, e = d + 1, f = e + 1
vst1_s8(out + 0 * xy, vget_low_s8(c.val[0]);
vst1_s8(out + 1 * xy, vget_low_s8(d.val[0]);
vst1_s8(out + 2 * xy, vget_low_s8(e.val[0]);
vst1_s8(out + 3 * xy, vget_low_s8(f.val[0]);
vst1_s8(o_1 + 4 * xy, vget_high_s8(c.val[0]);
vst1_s8(o_1 + 5 * xy, vget_high_s8(d.val[0]);
vst1_s8(o_1 + 6 * xy, vget_high_s8(e.val[0]);
vst1_s8(o_1 + 7 * xy, vget_high_s8(f.val[0]);
vst1_s8(o_2 + 0 * xy, vget_low_s8(c.val[1]);
vst1_s8(o_2 + 1 * xy, vget_low_s8(d.val[1]);
vst1_s8(o_2 + 2 * xy, vget_low_s8(e.val[1]);
vst1_s8(o_2 + 3 * xy, vget_low_s8(f.val[1]);
vst1_s8(o_3 + 4 * xy, vget_high_s8(c.val[1]);
vst1_s8(o_3 + 5 * xy, vget_high_s8(d.val[1]);
vst1_s8(o_3 + 6 * xy, vget_high_s8(e.val[1]);
vst1_s8(o_3 + 7 * xy, vget_high_s8(f.val[1]);
}
The opposite would interleave from 16 independent planes
int8x16x2_t load4(int8_t *in, int xy) {
int8x8_t a0 = vld1_s8(in);
int8x8_t a1 = vld1_s8(in + xy);
int8x8_t a2 = vld1_s8(in + 2 * xy);
int8x8_t a3 = vld1_s8(in + 3 * xy);
auto a = vzipq_s8(vcombine_s8(a0, a0), vcombine_s8(a1, a1)).val[0];
auto b = vzipq_s8(vcombine_s8(a2, a2), vcombine_s8(a3, a3)).val[0];
return vzipq_s8(a,b);
}
int8_t *store4(int8x16x2_t a, int8x16x2_t b, int8x16x2_t c, int8x16x2_t d, int8_t *out) {
int32x4x4_t A{
vreinterpretq_s32_s8(a.val[0]),
vreinterpretq_s32_s8(b.val[0]),
vreinterpretq_s32_s8(c.val[0]),
vreinterpretq_s32_s8(d.val[0])};
int32x4x4_t B{
vreinterpretq_s32_s8(a.val[1]),
vreinterpretq_s32_s8(b.val[1]),
vreinterpretq_s32_s8(c.val[1]),
vreinterpretq_s32_s8(d.val[1])};
vst4q_s32((int32_t*)out, A); out += 64;
vst4q_s32((int32_t*)out, B); out += 64;
return out;
}
void interleave(int8_t *in, int8_t *out, int xy) {
int w = xy;
while (w >= 8) {
auto a = load4(in, xy);
auto b = load4(in + 4*xy, xy);
auto c = load4(in + 8*xy, xy);
auto d = load4(in + 12*xy, xy);
in += 8;
out = store4(a,b,c,d, out);
w -= 8;
}
}
Handling the excess (xy & 7 != 0) can be done by processing one full block aligned at in_ptr + xy - 8 and out_ptr + xy * 16 - 8*16.
I just have a problem with C dynamic scoping in the following code. I am aware that C only uses static (lexical) scoping, but the problem asks to pretend that we are using dynamic scoping.
#include <stdio.h>
int x = 25;
int y = 15;
int z = 5;
static void func1(int z) {
x += 20;
y += 30;
z += 40;
printf("func1: x=%4d y=%4d z=%4d x+y+z=%d\n",x,y,z,x+y+z);
}
static void func2(void) {
int z = 100;
y += 30;
z += 10;
func1(20);
printf("func2: x=%4d y=%4d z=%4d x+y+z=%d\n",x,y,z,x+y+z);
}
static void func3(int x) {
x += 40;
func2();
printf("func3: x=%4d y=%4d z=%4d x+y+z=%d\n",x,y,z,x+y+z);
}
static int func4() {
int y = 90;
func3(80);
printf("func4: x=%4d y=%4d z=%4d x+y+z=%d\n",x,y,z,x+y+z);
return 1000;
}
int main(int arc, char *argv[]) {
int y = func4();
z += 2000;
printf("main: x=%4d y=%4d z=%4d x+y+z=%d\n",x,y,z,x+y+z);
}
The following is the result of all 5:
func1: x = 140 y=? z =? x+y+z = 350
func2: x = ? y=? z =? x+y+z = 400
func3: x = ? y=? z =? x+y+z = 295
func4: x = ? y=? z =? x+y+z = 180
main: x = ? y=? z =? x+y+z = 3030
From my understanding, we go from main first and then to the next function called. So, in main, we have:
x = 25 (from global variable),
y = 1000 (from return 1000 from func4), and
z = 2005 (from global variable 5 + 2000)
x + y +z = 25 + 1000 + 2005 = 3030 -> correct.
But, in the other functions, such as in func4:
variable x = 25 (from main from global)
y = 90 (from local variable)
z = 5 (from global variable)
x + y + z = 25 + 90 + 5 = 120 -> incorrect.
This problem persists on the other functions, where I think I find the correct for 1 or 2 variables but have no idea for the last one.
I am pretty sure I got this right just want to make sure though... In the following code, we have omitted the definitions of constants M and N
here are the two functions in question arith and optarith:
#define M /* Mystery number 1 */
#define N /* Mystery number 2 */
unsigned int arith(unsigned int x, unsigned int y){
unsigned int result = 0;
result = x * M + y/N;
return result;
}
We compiled this code for particular values of M and N. The compiler optimized the multiplication and
division. The following is a translation of the generated machine code back into C:
unsigned int optarith(unsigned int x, unsigned int y){
unsigned int t = 3 * x;
x <<= 6;
x -= 2 * t;
y >>= 4;
y = y/3;
return x + y;
}
so here's my work
so to find value M for the operation M * x
x <<= 6; // same as x = x * 64
x-= 2 * t; //same as x = x - 2 * (3 * x)
so basically x = 64x - 6x which is just 58x therefore M = 58
and to find the value N for operation y/N
y >>= 4; same as y = y/16;
y = y/3; same as y = (y/16)/3;
that makes y/48 so N = 48
Did I do this correctly?
My task is
Show on the screen n-element of the progression {xi}.
Xi = Xi-1 - 3Xi-2
X0 = 0
X1 = 2
i = [2,n]
Here is done, but I didn't understand this theme very well, so i need some help with it.
My code(doesn't work):
void __fastcall TForm1::Button1Click(TObject *Sender)
{
int n = Edit1->Text.ToInt();
int i, x;
if(n==0){
i=0;
Label1->Caption = IntToStr(i);
}
if(n==1){
i=2;
Label1->Caption = IntToStr(i);
}
else {
for(i=2;i<=n;i++){
x=(i-1)-3*(i-2);
Label1->Caption = IntToStr(x);
}
}
}
It's not very nessesary to write code in C++ Builder
You misunderstood the progression formula. Xi-1 and Xi-2 refer to previous elements calculated in your progression.
So you need two variables, which will be carrying previous values that you have just calculated. At any given loop, you calculate the current Xi value using the general progression formula, then copy the value of Xi-1 into Xi-2, throwing the previous value of Xi-2. Then you copy the value of Xi (the up to now current value) into Xi-1.
void __fastcall TForm1::Button1Click(TObject *Sender)
{
int n = Edit1->Text.ToInt();
int i, x;
int xim1, xim2
if(n==0){
i=0;
Label1->Caption = IntToStr(i);
}
if(n==1){
i=2;
Label1->Caption = IntToStr(i);
}
else {
xim1 = 2;
xim2 = 0;
for(i=2;i<=n;i++){
x = xim1-3*xim2;
xim2 = xim1;
xim1 = x;
}
Label1->Caption = IntToStr(x);
}
}
Given this generating function:
X_0 = 0
X_1 = 2
X_i = X_{i-1} + 3*X_{i-2} i = [2,n]
How would you calculate x_4? We know that X_4 = X_3 + 3*X_2; which means that we need to be able to calculate X_3 and X_2. We can write these as:
X_2 = X_1 + 3*X_0 = 2 + 3*0 = 2
X_3 = X_2 + 3*X_1 = 2 + 3*2 = 8
X_4 = X_3 + 3*X_2 = 8 + 3*2 = 14
This can normally be written as a recursive function:
int calcSeries(int n)
{
if(0 == n)
return 0;
if(1 == n)
return 2;
return calcSeries(n-1) + 3*calcSeries(n-2)
}
BTW, this is a very naive implementation for this series, the main problem is that we have two recursive trees; if you look at the hand expansion of X_4 above notice that X_2 appears twice (in the calculation of X_3 and X_4), but we don't store this value so we need to calculate it twice.
Can anyone help me out with the pollard rho implementation? I have implemented this in C. It's working fine for numbers upto 10 digits but it's not able to handle greater numbers.
Please help me out to improve it to carry out factorization of numbers upto 18 digits . My code is this:
#include<stdio.h>
#include<math.h>
int gcd(int a, int b)
{
if(b==0) return a ;
else
return(gcd(b,a%b)) ;
}
long long int mod(long long int a , long long int b , long long int n )
{
long long int x=1 , y=a ;
while(b>0)
{
if(b%2==1) x = ((x%n)*(y%n))%n ;
y = ((y%n)*(y%n))%n ;
b/=2 ;
}
return x%n ;
}
int isprimes(long long int u)
{
if(u==3)
return 1 ;
int a = 2 , i ;
long long int k , t = 0 , r , p ;
k = u-1 ;
while(k%2==0)
{ k/=2 ; t++ ; }
while(a<=3) /*der are no strong pseudoprimes common in base 2 and base 3*/
{
r = mod(a,k,u) ;
for(i = 1 ; i<=t ; i++)
{
p = ((r%u)*(r%u))%u ;
if((p==1)&&(r!=1)&&(r!=(u-1)))
{ return 0 ; }
r = p ;
}
if(p!=1)
return 0 ;
else
a++ ;
}
if(a==4)
return 1 ;
}
long long int pol(long long int u)
{
long long int x = 2 , k , i , a , y , c , s;
int d = 1 ;
k = 2 ;
i = 1 ;
y = x ;
a = u ;
if(isprimes(u)==1)
{
return 1;
}
c=-1 ;
s = 2 ;
while(1)
{
i++;
x=((x%u)*(x%u)-1)% u ;
d = gcd(abs(y-x),u) ;
if(d!=1&&d!=u)
{ printf("%d ",d);
while(a%d==0) { a=a/d; }
x = 2 ;
k = 2 ;
i = 1 ;
y = x ;
if(a==1)
{ return 0 ; }
if(isprimes(a)!=0)
{ return a ; }
u=a ;
}
if(i==k)
{y = x ; k*=2 ; c = x ;} /*floyd cycle detection*/
if(c==x)
{ x = ++s ; }
}
return ;
}
int main()
{
long long int t ;
long long int i , n , j , k , a , b , u ;
while(scanf("%lld",&n)&&n!=0)
{ u = n ; k = 0 ;
while(u%2==0)
{ u/=2 ; k = 1 ; }
if(k==1) printf("2 ") ;
if(u!=1)
t = pol(u) ;
if(u!=1)
{
if(t==1)
{ printf("%lld",u) ; }
else
if(t!=0)
{ printf("%lld",t) ; }
}
printf("\n");
}
return 0;
}
sorry for the long code ..... I am a new coder.
When you're multiplying two numbers modulo m, the intermediate product can become nearly m^2. So if you use a 64-bit unsigned integer type, the maximal modulus it can handle is 2^32, if the modulus is larger, overflow may happen. It will be rare when the modulus is only slightly larger, but that makes it only less obvious, you cannot rely on being lucky if the modulus allows the possibility of overflow.
You can gain a larger range by a factor of two if you choose a representative of the residue class modulo m of absolute value at most m/2 or something equivalent:
uint64_t mod_mul(uint64_t x, uint64_t y, uint64_t m)
{
int neg = 0;
// if x is too large, choose m-x and note that we need one negation for that at the end
if (x > m/2) {
x = m - x;
neg = !neg;
}
// if y is too large, choose m-y and note that we need one negation for that at the end
if (y > m/2) {
y = m - y;
neg = !neg;
}
uint64_t prod = (x * y) % m;
// if we had negated _one_ factor, and the product isn't 0 (mod m), negate
if (neg && prod) {
prod = m - prod;
}
return prod;
}
So that would allow moduli of up to 2^33 with a 64-bit unsigned type. Not a big step.
The recommended solution to the problem is the use of a big-integer library, for example GMP is available as a distribution package on most if not all Linux distros, and also (relatively) easily installable on Windows.
If that is not an option (really, are you sure?), you can get it to work for larger moduli (up to 2^63 for an unsigned 64-bit integer type) using Russian peasant multiplication:
x * y = 2 * (x * (y/2)) + (x * (y % 2))
so for the calculation, you only need that 2*(m-1) doesn't overflow.
uint64_t mod_mult(uint64_t x, uint64_t y, uint64_t m)
{
if (y == 0) return 0;
if (y == 1) return x % m;
uint64_t temp = mod_mult(x,y/2,m);
temp = (2*temp) % m;
if (y % 2 == 1) {
temp = (temp + x) % m;
}
return temp;
}
Note however that this algorithm needs O(log y) steps, so it's rather slow in practice. For smaller m you can speed it up, if 2^k*(m-1) doesn't overflow, you can proceed in steps of k bits instead of single bits (x*y = ((x * (y >> k)) << k) + (x * (y & ((1 << k)-1)))), which is a good improvement if your moduli are never larger than 48 or 56 bits, say.
Using that variant of modular multiplication, your algorithm will work for larger numbers (but it will be significantly slower). You can also try test for the size of the modulus and/or the factors to determine which method to use, if m < 2^32 or x < (2^64-1)/y, the simple (x * y) % m will do.
You can try this C implementation of Pollard Rho :
unsigned long long pollard_rho(const unsigned long long N) {
// Require : a composite number N, not a square.
// Ensure : res is a non-trivial factor of N.
// Option : define a timeout, define a rand function.
static const int timeout = 18;
static unsigned long long rand_val = 2994439072U;
rand_val = (rand_val * 1025416097U + 286824428U) % 4294967291LLU;
unsigned long long res = 1, a, b, c, i = 0, j = 1, x = 1, y = 1 + rand_val % (N - 1);
for (; res == 1; ++i) {
if (i == j) {
if (j >> timeout)
break;
j <<= 1;
x = y;
}
a = y, b = y;
for (y = 0; a; a & 1 ? b >= N - y ? y -= N : 0, y += b : 0, a >>= 1, (c = b) >= N - b ? c -= N : 0, b += c);
y = (1 + y) % N;
for (a = N, b = y > x ? y - x : x - y; (a %= b) && (b %= a););
res = a | b;
}
return res;
}
Otherwise there is a pure C quadratic sieve which factors numbers from 0 to 300-bit.