How to implement nested loops in cuda thrust - loops

I currently have to run a nested loop as follows:
for(int i = 0; i < N; i++){
for(int j = i+1; j <= N; j++){
compute(...)//some calculation here
I've tried keeping the outer loop on the CPU and running the inner loop on the GPU, but this results in too many memory accesses. Is there any other way to do it, for example with thrust::reduce_by_key?
The whole program is here:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <cmath>
#include <iostream>
#include <iomanip>
#define N 1000000
// define a 2d point pair
typedef thrust::tuple<float, float> Point;
// return a random Point in [0,1)^2
Point make_point(void)
static thrust::default_random_engine rng(12345);
static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
float x = dist(rng);
float y = dist(rng);
return Point(x,y);
struct sqrt_dis: public thrust::unary_function<Point, double>
float x, y;
double tmp;
sqrt_dis(float _x, float _y): x(_x), y(_y){}
__host__ __device__
float operator()(Point a)
tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\
tmp = -1.0*(sqrt(tmp));
return (1.0/tmp);
int main(void) {
clock_t t1, t2;
double result;
t1 = clock();
// allocate some random points in the unit square on the host
thrust::host_vector<Point> h_points(N);
thrust::generate(h_points.begin(), h_points.end(), make_point);
// transfer to device
thrust::device_vector<Point> points = h_points;
thrust::plus<double> binary_op;
float init = 0;
for(int i = 0; i < N; i++){
Point tmp_i = points[i];
float x = thrust::get<0>(tmp_i);
float y = thrust::get<1>(tmp_i);
result += thrust::transform_reduce(points.begin()+i,\
std::cout<<"result"<<i<<": "<<result<<std::endl;
t2 = clock()-t1;
std::cout<<"result: ";
std::cout<< result <<std::endl;
std::cout<<"run time: "<<t2/CLOCKS_PER_SEC<<"s"<<std::endl;
return 0;

EDIT: Now that you have posted an example, here is how you could solve it:
You have n 2D points stored in a linear array like this (here n=4)
points = [p0 p1 p2 p3]
Based on your code I assume you want to calculate:
result = f(p0, p1) + f(p0, p2) + f(p0, p3) +
f(p1, p2) + f(p1, p3) +
f(p2, p3)
Where f() is your distance function which needs to be executed m times in total:
m = (n-1)*n/2
in this example: m=6
You can look at this problem as a triangular matrix:
[ p0 p1 p2 p3 ]
[ p1 p2 p3 ]
[ p2 p3 ]
[ p3 ]
Transforming this matrix into a linear vector with m elements while leaving out the diagonal elements results in:
[p1 p2 p3 p2 p3 p3]
The index k of an element in this vector lies in the range [0, m-1].
Index k can be mapped back to a row and column (i,j) of the triangular matrix, k -> (i,j):
i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2
i is the row and j is the column.
In our example:
0 -> (0, 1)
1 -> (0, 2)
2 -> (0, 3)
3 -> (1, 2)
4 -> (1, 3)
5 -> (2, 3)
Now you can put all this together and execute a modified distance functor m times which applies the aforementioned mapping to get the corresponding pairs based on the index and then sum up everything.
I modified your code accordingly:
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/random.h>
#include <math.h>
#include <iostream>
#include <stdio.h>
#include <stdint.h>
typedef float Float;
// define a 2d point pair
typedef thrust::tuple<Float, Float> Point;
// return a random Point in [0,1)^2
Point make_point(void)
static thrust::default_random_engine rng(12345);
static thrust::uniform_real_distribution<Float> dist(0.0, 1.0);
Float x = dist(rng);
Float y = dist(rng);
return Point(x,y);
struct sqrt_dis_new
typedef thrust::device_ptr<Point> DevPtr;
DevPtr points;
const uint64_t n;
sqrt_dis_new(uint64_t n, DevPtr p) : n(n), points(p)
Float operator()(uint64_t k) const
// calculate indices in triangular matrix
const uint64_t i = n - 2 - floor(sqrt((double)(-8*k + 4*n*(n-1)-7))/2.0 - 0.5);
const uint64_t j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2;
printf("%llu -> (%llu, %llu)\n", k,i,j);
const Point& p1 = *(points.get()+j);
const Point& p2 = *(points.get()+i);
const Float xm = thrust::get<0>(p1)-thrust::get<0>(p2);
const Float ym = thrust::get<1>(p1)-thrust::get<1>(p2);
return 1.0/(-1.0 * sqrt(xm*xm + ym*ym));
int main()
const uint64_t N = 4;
// allocate some random points in the unit square on the host
thrust::host_vector<Point> h_points(N);
thrust::generate(h_points.begin(), h_points.end(), make_point);
// transfer to device
thrust::device_vector<Point> d_points = h_points;
const uint64_t count = (N-1)*N/2;
std::cout << count << std::endl;
thrust::plus<Float> binary_op;
const Float init = 0.0;
Float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0),
std::cout<<"result: " << result << std::endl;
return 0;
It depends on your compute function which you do not specify.
Usually you unroll the loops and launch the kernel in a 2D manner for every combination of i and j if the computations are independent.
Have a look at the Thrust examples and identify similar use cases to your problem.


Sparse matrix multiplication in Eigen giving wrong result?

I am using Eigen in a project of mine, and I am running into a strange issue. I have complex sparse matrices A and B (1500x1500 or larger), and am multiplying them together with coefficients.
When A = B, and taking vector x of ones, I expect that
(A-B)*x = 0, (A*B-B*A)*x = 0,
(A*A*B*B - B*B*A*A)*x = 0,
etc. and I do get this result for all these cases. (A.isApprox(B) evaluates to 1 and (A-B).norm() = 0).
However, when I multiply the matrices by doubles, as in
(c1*A*c2*A*d1*B*d2*B - d1*B*d2*B*c1*A*c2*A)*x,
I get a nonzero result, which doesn't make sense to me, as scalars should commute with the matrices. In fact, if I do,
(c1*c2*d1*d2*A*A*B*B - d1*d2*c1*c2*B*B*A*A)*x
I get zero. Any time the coefficients are interspersed in the matrix manipulation, I get a nonzero result.
I am not using any compiler optimizations, etc.
What am I doing wrong here?
I have worked up a simple example. Maybe I'm missing something dumb, but here it is. This gives me an error of 10^20.
#include <iostream>
#include <cmath>
#include <vector>
#include <Eigen/Sparse>
#include <complex>
typedef std::complex<double> Scalar;
typedef Eigen::SparseMatrix<Scalar, Eigen::RowMajor> SpMat;
typedef Eigen::Triplet<Scalar> trip;
int main(int argc, const char * argv[]) {
double k0 = M_PI;
double dz = 0.01;
double nz = 1500;
std::vector<double> rhos(nz), atten(nz), cp(nz);
for(int i = 0; i < nz; ++i){
if(i < 750){
rhos[i] = 1.5;
cp[i] = 2500;
atten[i] = 0.5;
rhos[i] = 1;
cp[i] = 1500;
atten[i] = 0;
Scalar ci, eta, n, rho, drhodz;
Scalar t1, t2, t3, t4;
ci = Scalar(0,1);
eta = 1.0/(40.0*M_PI*std::log10(std::exp(1.0)));
int Mp = 6;
std::vector<std::vector<trip> > mat_entries_N(Mp), mat_entries_D(Mp);
for(int i = 0; i < nz; ++i){
n = 1500./cp[i] * (1.+ ci * eta * atten[i]);
rho = rhos[i];
if(i > 0 && i < nz-1){
drhodz = (rhos[i+1]-rhos[i-1])/(2*dz);
else if(i == 0){
drhodz = (rhos[i+1]-rhos[i])/(dz);
else if(i == nz-1){
drhodz = (rhos[i]-rhos[i-1])/(dz);
t1 = (n*n - 1.);
t2 = 1./(k0*k0)*(-2./(dz * dz));
t3 = 1./(k0*k0)*(drhodz/rho*2.*dz);
t4 = 1./(k0*k0)*(1/(dz * dz));
double c,d;
for(int mp = 0; mp < Mp; ++mp){
c = std::pow(std::sin((mp+1)*M_PI/(2*Mp+1)),2);
d = std::pow(std::cos((mp+1)*M_PI/(2*Mp+1)),2);
mat_entries_N[mp].push_back(trip(i,i,(c*(t1 + t2))));
mat_entries_D[mp].push_back(trip(i,i,(d*(t1 + t2))));
if(i < nz - 1){
mat_entries_N[mp].push_back(trip(i,i+1,(c*(-t3 + t4))));
mat_entries_D[mp].push_back(trip(i,i+1,(d*(-t3 + t4))));
if(i > 0){
mat_entries_N[mp].push_back(trip(i,i-1,(c*(t3 + t4))));
mat_entries_D[mp].push_back(trip(i,i-1,(d*(t3 + t4))));
SpMat N(nz,nz), D(nz,nz);
SpMat identity(nz, nz);
std::vector<trip> idcoeffs;
for(int i = 0; i < nz; ++i){
identity.setFromTriplets(idcoeffs.begin(), idcoeffs.end());
SpMat temp(nz,nz);
N = identity;
D = identity;
for(int mp = 0; mp < Mp; ++mp){
temp.setFromTriplets(mat_entries_N[mp].begin(), mat_entries_N[mp].end());
N = (temp*N).eval();
temp.setFromTriplets(mat_entries_D[mp].begin(), mat_entries_D[mp].end());
D = (temp*D).eval();
std::cout << (N*D - D*N).norm() << std::endl;
return 0;
The problem is that without a meaningful reference value defining what is the expected order of magnitude of a non-zero value, it is impossible to conclude whether 1e20 is a huge or a tiny value.
In your case, the norms of the matrices N and D are about 1e20 and 1e18 respectively, and the norm of N*D is about 1e38. Given that the relative precision of double is about 1e-16, an error of 1e20 can be considered as 0 compared to 1e38.
To summarize, it is most of the time meaningless to look at the absolute error. Instead, you have to look at the relative error:
std::cout << (N*D - D*N).norm()/(N*D).norm() << std::endl;
which gives you about 1e-17. This is indeed smaller than the numerical precision of double.

How to convert an index into N coordinates?

In a 1 dimensional space:
x = i
In a 2 dimensional space (of size sx, sy):
x = i / sy
y = i % sy
In a 3 dimensional space (of size sx, sy, sz):
x = i / (sy*sz)
y = (i/sz) % sy
z = i % sz
How to deal with an N dimensional space? How can these formulas be generalized?
What about the inverse conversion?
(x1, x2, ..., xn) --> i
Note: all variables are integer.
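I guess the general formulas look like this (with x1 varying fastest and the empty product for k = 1 taken as 1):
x_k = (i / (s_1 * s_2 * ... * s_(k-1))) % s_k, for k = 1..n
i = x_1 + s_1*x_2 + s_1*s_2*x_3 + ... + s_1*s_2*...*s_(n-1)*x_n
To check these formulas I used the following program, and it appears to work.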
#include <iostream>
#include <string>
#include <string.h>
#define Sz1 2
#define Sz2 3
#define Sz3 4
#define Sz4 5
using namespace std;
int main()
int a[Sz4][Sz3][Sz2][Sz1];
int i,j,k,l,n,x1,x2,x3,x4,s1,s2,s3,s4;
for (i=0;i<s1*s2*s3*s4;i++){
x4= i/(s1*s2*s3);
x3 = i / (s1*s2) % s3;
x2 = (i/s1) % s2;
x1 = i % s1;
for (l=0;l<Sz4;l++) {
for (i=0;i<Sz3;i++) {
for(j=0;j<Sz2;j++) {
for(k=0;k<Sz1;k++) {
n=s1*s2*s3*l+s1*s2*i+s1*j+k;//inverse convertion
cout<<a[l][i][j][k]<<"{"<<n<<"} ";
return 0;

Reading from a textfile and taking those values into a 7 by 7 matrix

Hey guys so I am kind of stuck on what to do. I know what I want in pseudo code but having some difficulty actually writing it.
I have 2 arrays of 7 numbers. In the first array the numbers are
Angstroms [0.4000, 0.5000, 0.6000, 0.7000, 0.8000, 0.9000, 1.0000]
Energy [-0.9767, -1.1000, -1.1535, -1.1710, -1.1704, -1.1604, -1.145]
Input File
0.400000000000000 -0.976798125297645
0.500000000000000 -1.10086977056789
0.600000000000000 -1.153517976992553
0.700000000000000 -1.171014611491842
0.800000000000000 -1.170406254407191
0.900000000000000 -1.160412786280990
1.000000000000000 -1.145758813825982
Now I want to fit them to a sixth order polynomial so I can do more stuff with that data which is the problem I am having.
My output for this stage would be this
A = Angstroms
[1 1 1 1 1 1 1
-0.4 -0.5 -0.6 -0.7 -0.8 -0.9 -1.0
A^2 A^2 A^2 A^2 A^2 A^2 A^2 So the above value squared
A^3 A^3 A^3 A^3 A^3 A^3 A^3 and so on
A^4 A^4 A^4 A^4 A^4 A^4 A^4
A^5 A^5 A^5 A^5 A^5 A^5 A^5
A^6 A^6 A^6 A^6 A^6 A^6 A^6]
How do I read the list from the file and fill this matrix in a short, efficient manner, with each successive row being a^2, a^3, and so on?
I have an idea of just inputting every value like
A(0,0) = 1 ; A(0,1) = -0.4 and so on but that would be super tedious.
My code so far
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define ELEMENTS 6
#define n 7
// First read the data from the file
// Fit the data to a sixth order polynomial
// Fit the data to sixth order polynomial in terms of exp(-r)
// Plot the fitted functions and the data points to a set between 0.2 and 5 A
void * file_input ();
void * polynomial ();
int main () {
return 0;
void * file_input () {
static double Angstroms[ELEMENTS];
static double Energy[ELEMENTS];
float a, b;
int i;
FILE * in_file = fopen("H2Data.txt", "r");
for (i = 0; i <= ELEMENTS; i++) {
fscanf(in_file,"%f %f\n", &a, &b);
Angstroms[i] = a;
Energy[i] = b;
// printf ("Angstroms[%f], Energy[%f]\n", Angstroms[i], Energy[i]);
return in_file, Angstroms, Energy;
// Sextic equation ax^6 + bx^5 + cx^4 + dx^3 + ex^2 +fx + g = 0
void * polynomial(Angstroms, Energy) {
int i, j;
double * ax = malloc (n * n * sizeof(double)); // 7 by 7 matrix
double * b = malloc (n * sizeof(double)); // 0 by 1 matrix for the b values
You can try following code
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define ELEMENTS 7
#define N 7
double Angstroms[ELEMENTS];
double Energy[ELEMENTS];
double ax[ELEMENTS][N];
void file_input();
void polynomial();
int main () {
return 0;
/*
 * Reads ELEMENTS rows of "<angstrom> <energy>" from H2Data.txt into the
 * global arrays Angstroms[] and Energy[].
 *
 * Values are scanned straight into the double arrays with %lf, so they are
 * not first rounded to float. The program exits with an error message if the
 * file cannot be opened or a row cannot be parsed.
 */
void file_input () {
    int i;
    FILE * in_file = fopen("H2Data.txt", "r");

    if (in_file == NULL) {
        perror("H2Data.txt");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < ELEMENTS; i++) {
        if (fscanf(in_file, "%lf %lf", &Angstroms[i], &Energy[i]) != 2) {
            fprintf(stderr, "H2Data.txt: could not read row %d of %d\n", i + 1, ELEMENTS);
            fclose(in_file);
            exit(EXIT_FAILURE);
        }
    }

    fclose(in_file);
}
/*
 * Fills the global matrix ax so that row j holds the successive powers of
 * Angstroms[j]:  ax[j][p] = Angstroms[j]^p  for p = 0 .. N-1.
 */
void polynomial() {
    int i, j;

    /* Columns 0 and 1: x^0 and x^1. */
    for (i = 0; i < ELEMENTS; i++) {
        ax[i][0] = 1;
        ax[i][1] = Angstroms[i];
    }

    /* Columns 2 .. N-1: each power is the previous power times the base of
     * the SAME row (ax[j][1]), not of row i. */
    for (i = 2; i < N; i++) {
        for (j = 0; j < ELEMENTS; j++) {
            ax[j][i] = ax[j][i-1] * ax[j][1];
        }
    }
}
Try this:
double * ax = malloc (n * n * sizeof(double)); // 7 by 7 matrix
double * bx = malloc (n * sizeof(double)); // 0 by 1 matrix for the b values
for (i = 0; i <= n; i++) {
fscanf(in_file,"%f %f\n", &a, &b);
bx[i] = b; // Energy
ax[0,i] = 1;
ax[1,i] = a; // Angstroms
ax[2,i] = a*a; // a^2
It allocates enough space for the vector and the matrix; then, for each line in the input file, it fills one element of the vector and one column of your matrix.

Pseudo-random number generator based on LCG

I want to implement a pseudo-random number generator in xv6. I am trying to implement the linear congruential generator algorithm, but I don't understand how to seed it. Here is a piece of my code. I know this code won't work because X is not changing globally, and I don't understand how to do that.
static int X = 1;
int random_g(int M)
int a = 1103515245, c = 12345;
X = (a * X + c) % M;
return X;
Incorrect code.
Do not use % on X, the random state variable, to update the state. Use % to form the return value.
Use unsigned types to avoid signed integer overflow (UB) - Perhaps unsigned, unsigned long, unsigned long long. Wider affords a longer sequence.
To match a = 1103515245, c = 12345, we want m = 2^31 (0x80000000).
static unsigned long X = 1; // generator state, seeded with 1

// Linear congruential generator: returns a value in [1 ... M] derived from
// the current state, then advances the state with
//   X = (a * X + c) mod 2^31.
// M is expected to be positive.
int random_g(int M) {
    const unsigned long a = 1103515245, c = 12345;
    // A local constant instead of "#define m": the macro would stay defined
    // for the rest of the file and rewrite any later identifier named m.
    const unsigned long modulus = 0x80000000UL; // 2^31
    int r = (int)(X % (unsigned long)M) + 1; // [1 ... M]
    X = (a * X + c) % modulus;
    return r;
}
Additional code is needed to remove the typical modulo-M bias; many SO posts cover that.
Ref: Why 1103515245 is used in rand? and
I don't know how much that helps you, but if you have an Intel Ivy Bridge or later generation processor, you could try to use the RDRAND instruction. Something along these lines:
static int X;
random_g (int M)
asm volatile("byte $0x48; byte $0x0F; byte $0xC7; byte $0xF0"); // RDRAND AX
asm volatile("mov %%ax, %0": "=r"(X)); // X = rdrand_val
int a = 1103515245, c = 12345;
X = (a * X + c) % M;
return X;
I haven't tested the above code, as I can't build xv6 right now, but it should give you a hint as to how you can work; utilising your processor's rng.
In the following code, random_g is a self-seeding random number generator that returns values between 1 and M. The main function tests the function for the specific case where M is 8.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
// Self-seeding linear congruential generator.
// Returns a value in [1 ... M]. The state is seeded from time(NULL) on the
// first call and advanced on every call with X = a*X + c (mod 2^32 through
// uint32_t wrap-around). M is expected to be positive.
int random_g( int M )
{
    static uint32_t X = 1;
    static bool ready = false;

    if ( !ready )
    {
        X = (uint32_t)time(NULL);
        ready = true;
    }

    static const uint32_t a = 1103515245;
    static const uint32_t c = 12345;

    X = (a * X + c);

    // Scale the 32-bit state to [0, M-1] with a multiply and shift rather
    // than a modulo, then shift the range up to [1, M].
    uint64_t temp = (uint64_t)X * (uint64_t)M;
    temp >>= 32;
    temp += 1;

    return (int)temp;
}
// Draws 1,000,000 values from random_g(M) with M = 8, prints the first ten,
// reports any value outside [1, M], and prints how often each value occurred.
int main(void)
{
    int i, r;
    int M = 8;
    int *histogram = calloc( M+1, sizeof(int) );

    if ( histogram == NULL )
    {
        printf( "out of memory\n" );
        return 1;
    }

    for ( i = 0; i < 1000000; i++ )
    {
        r = random_g( M );
        if ( i < 10 )
            printf( "%d\n", r );
        if ( r < 1 || r > M )
            printf( "bad number: %d\n", r );
        else
            histogram[r]++;   // count only in-range values; index 0 is unused
    }

    printf( "\n" );
    for ( i = 1; i <= M; i++ )
        printf( "%d %6d\n", i, histogram[i] );

    free( histogram );
    return 0;
}

Own asin() function (with Taylor series) not accurate

I need to write my own asin() function without math.h library with the use of Taylor series. It works fine for numbers between <-0.98;0.98> but when I am close to limits it stops with 1604 iterations and therefore is inaccurate.
I don't know how to make it more accurate. Any suggestions are much appreciated!
The code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define EPS 0.000000000001
double my_arcsin(double x)
long double a, an, b, bn;
a = an = 1.0;
b = bn = 2.0;
long double n = 3.0;
double xn;
double xs = x;
double xp = x;
int iterace = 0;
xn = xs + (a/b) * (my_pow(xp,n) / n);
while (my_abs(xn - xs) >= EPS)
n += 2.0;
an += 2.0;
bn += 2.0;
a = a * an;
b = b * bn;
xs = xn;
xn = xs + (a/b) * (my_pow(xp,n) / n);
//printf("%d\n", iterace);
return xn;
int main(int argc, char* argv[])
double x = 0.0;
if (argc > 2)
x = strtod(argv[2], NULL);
if (strcmp(argv[1], "--asin") == 0)
if (x < -1 || x > 1)
printf("%.10e\n", my_arcsin(x));
//printf("%.10e\n", asin(x));
return 0;
And also a short list of my values and expected ones:
My values Expected values my_asin(x)
5.2359877560e-01 5.2359877560e-01 0.5
1.5567132089e+00 1.5707963268e+00 1 //problem
1.4292568534e+00 1.4292568535e+00 0.99 //problem
1.1197695150e+00 1.1197695150e+00 0.9
1.2532358975e+00 1.2532358975e+00 0.95
Even though the convergence radius of the series expansion you are using is 1, therefore the series will eventually converge for -1 < x < 1, convergence is indeed painfully slow close to the limits of this interval. The solution is to somehow avoid these parts of the interval.
I suggest that you
use your original algorithm for |x| <= 1/sqrt(2),
use the identity arcsin(x) = pi/2 - arcsin(sqrt(1-x^2)) for 1/sqrt(2) < x <= 1.0,
use the identity arcsin(x) = -pi/2 + arcsin(sqrt(1-x^2)) for -1.0 <= x < -1/sqrt(2).
This way you can transform your input x into [-1/sqrt(2),1/sqrt(2)], where convergence is relatively fast.
PLEASE NOTE: In this case I strongly recommend @Bence's method, since you can't expect a slowly convergent method with low data accuracy to obtain arbitrary precision.
However I'm willing to show you how to improve the result using your current algorithm.
The main problem is that a and b grow too fast and soon become inf (after only about 150 iterations). A similar concern is that my_pow(xp,n) would grow quickly with n if |x| were greater than 1; however, this doesn't matter much in this case, since we can assume the input lies within the range [-1, 1].
So I've just changed the method you deal with a/b by introducing ab_ratio, see my edited code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define EPS 0.000000000001
#include <math.h>
#define my_pow powl
#define my_abs fabsl
double my_arcsin(double x)
#if 0
long double a, an, b, bn;
a = an = 1.0;
b = bn = 2.0;
unsigned long _n = 0;
long double ab_ratio = 0.5;
long double n = 3.0;
long double xn;
long double xs = x;
long double xp = x;
int iterace = 0;
xn = xs + ab_ratio * (my_pow(xp,n) / n);
long double step = EPS;
#if 0
while (my_abs(step) >= EPS)
while (1) /* manually stop it */
n += 2.0;
#if 0
an += 2.0;
bn += 2.0;
a = a * an;
b = b * bn;
_n += 1;
ab_ratio *= (1.0 + 2.0 * _n) / (2.0 + 2.0 * _n);
xs = xn;
step = ab_ratio * (my_pow(xp,n) / n);
xn = xs + step;
if (_n % 10000000 == 0)
printf("%lu %.10g %g %g %g %g\n", _n, (double)xn, (double)ab_ratio, (double)step, (double)xn, (double)my_pow(xp, n));
//printf("%d\n", iterace);
return xn;
int main(int argc, char* argv[])
double x = 0.0;
if (argc > 2)
x = strtod(argv[2], NULL);
if (strcmp(argv[1], "--asin") == 0)
if (x < -1 || x > 1)
printf("%.10e\n", my_arcsin(x));
//printf("%.10e\n", asin(x));
return 0;
For 0.99 (and even 0.9999999) it soon gives correct results with more than 10 significant digits. However it gets slow when getting near to 1.
Actually the process has been running for nearly 12 minutes on my laptop calculating --asin 1, and the current result is 1.570786871 after 3560000000 iterations.
UPDATED: It's been 1h51min now and the result 1.570792915 and iteration count is 27340000000.
