Alpha blending using table lookup is not as fast as expected

I thought a memory access would be faster than the multiplication and division (even compiler-optimized) done in alpha blending, but it wasn't as fast as expected.
The 16 megabytes used for the table is not an issue in this case. But it is a problem if the table lookup can end up even slower than doing all the calculations on the CPU.
Can anyone explain to me what is happening and why? Would the table lookup win out on a slower CPU?
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <time.h>

#define COLOR_MAX UCHAR_MAX

typedef unsigned char color;

color (*blending_table)[COLOR_MAX + 1][COLOR_MAX + 1];

static color blend(unsigned int destination, unsigned int source, unsigned int a) {
    return (source * a + destination * (COLOR_MAX - a)) / COLOR_MAX;
}

void initialize_blending_table(void) {
    int destination, source, a;

    blending_table = malloc((COLOR_MAX + 1) * sizeof *blending_table);
    for (destination = 0; destination <= COLOR_MAX; ++destination) {
        for (source = 0; source <= COLOR_MAX; ++source) {
            for (a = 0; a <= COLOR_MAX; ++a) {
                blending_table[destination][source][a] = blend(destination, source, a);
            }
        }
    }
}

struct timer {
    double start;
    double end;
};

void timer_start(struct timer *self) {
    self->start = clock();
}

void timer_end(struct timer *self) {
    self->end = clock();
}

double timer_measure_in_seconds(struct timer *self) {
    return (self->end - self->start) / CLOCKS_PER_SEC;
}

#define n 300

int main(void) {
    struct timer timer;
    volatile int i, j, k, l, m;

    timer_start(&timer);
    initialize_blending_table();
    timer_end(&timer);
    printf("init %f\n", timer_measure_in_seconds(&timer));

    timer_start(&timer);
    for (i = 0; i <= n; ++i) {
        for (j = 0; j <= COLOR_MAX; ++j) {
            for (k = 0; k <= COLOR_MAX; ++k) {
                for (l = 0; l <= COLOR_MAX; ++l) {
                    m = blending_table[j][k][l];
                }
            }
        }
    }
    timer_end(&timer);
    printf("table %f\n", timer_measure_in_seconds(&timer));

    timer_start(&timer);
    for (i = 0; i <= n; ++i) {
        for (j = 0; j <= COLOR_MAX; ++j) {
            for (k = 0; k <= COLOR_MAX; ++k) {
                for (l = 0; l <= COLOR_MAX; ++l) {
                    m = blend(j, k, l);
                }
            }
        }
    }
    timer_end(&timer);
    printf("function %f\n", timer_measure_in_seconds(&timer));

    return EXIT_SUCCESS;
}
result
$ gcc test.c -O3
$ ./a.out
init 0.034328
table 14.176643
function 14.183924

Table lookup is not a panacea. It helps when the table is small enough, but in your case the table is very big. You write
16 megabytes used for the table is not an issue in this case
which I think is very wrong, and is possibly the source of the problem you experience. 16 megabytes is too big for L1 cache, so reading data from random indices in the table will involve the slower caches (L2, L3, etc). The penalty for cache misses is typically large; your blending algorithm must be very complex if you want your LUT solution to be faster.
Read the Wikipedia article for more info.

Your benchmark is hopelessly broken: it makes the LUT look much better than it actually is, because it reads the table in order.
If the LUT cannot even beat the direct calculation under those ideal conditions, then with real-world random access patterns and the cache misses they cause, it is going to be much worse.
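To see that effect directly, you could rerun the table-lookup loop with pseudo-random indices so the hardware prefetcher cannot help. A minimal sketch, assuming the blending_table, color and COLOR_MAX definitions from the question (the LCG constants are just for illustration):
/* Same measurement as the "table" loop above, but with scattered indices. */
static void benchmark_random_lookups(unsigned long iterations) {
    volatile color m;
    unsigned int state = 12345u;                 /* simple LCG seed */
    unsigned long i;
    for (i = 0; i < iterations; ++i) {
        state = state * 1664525u + 1013904223u;  /* advance the LCG */
        m = blending_table[(state >> 8) & COLOR_MAX]
                          [(state >> 16) & COLOR_MAX]
                          [(state >> 24) & COLOR_MAX];
    }
}
With roughly the same number of iterations as the in-order loop (about 300 × 256³), the timings are directly comparable.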
Focus on improving the computation, and enabling vectorization. It's likely to pay off far better than a table-based approach.
(source * a + destination * (COLOR_MAX - a)) / COLOR_MAX
with rearrangement becomes
(source * a + destination * COLOR_MAX - destination * a) / COLOR_MAX
which simplifies to
destination + (source - destination) * a / COLOR_MAX
which has one multiply and one division by a constant, both of which are very efficient. And it is easily vectorized.
You should also mark your helper function as inline, although a good optimizing compiler is probably inlining it anyway.
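For illustration, the rearranged helper could be written like this (a sketch of the suggestion above, not code from the question; note that with signed integer division the result can differ from the original blend() by one least-significant step when source < destination):
/* One multiply and one division by the constant COLOR_MAX. */
static inline color blend_fast(unsigned int destination, unsigned int source,
                               unsigned int a) {
    int d = (int)destination;
    int s = (int)source;
    return (color)(d + (s - d) * (int)a / COLOR_MAX);
}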

Related

How come this very simple code produces a segmentation fault? [duplicate]

I'm having a problem with a program. I have read about segmentation faults, but I don't understand them very well; the only thing I know is that presumably I am trying to access memory I shouldn't. The problem is that I look at my code and don't understand what I am doing wrong.
#include<stdio.h>
#include<math.h>
#include<stdlib.h>
#define lambda 2.0
#define g 1.0
#define Lx 100
#define F0 1.0
#define Tf 10
#define h 0.1
#define e 0.00001
FILE *file;
double F[1000][1000000];
void Inicio(double D[1000][1000000]) {
int i;
for (i=399; i<600; i++) {
D[i][0]=F0;
}
}
void Iteration (double A[1000][1000000]) {
long int i,k;
for (i=1; i<1000000; i++) {
A[0][i]= A[0][i-1] + e/(h*h*h*h)*g*g*(A[2][i-1] - 4.0*A[1][i-1] + 6.0*A[0][i-1]-4.0*A[998][i-1] + A[997][i-1]) + 2.0*g*e/(h*h)*(A[1][i-1] - 2*A[0][i-1] + A[998][i-1]) + e*A[0][i-1]*(lambda-A[0][i-1]*A[0][i-1]);
A[1][i]= A[1][i-1] + e/(h*h*h*h)*g*g*(A[3][i-1] - 4.0*A[2][i-1] + 6.0*A[1][i-1]-4.0*A[0][i-1] + A[998][i-1]) + 2.0*g*e/(h*h)*(A[2][i-1] - 2*A[1][i-1] + A[0][i-1]) + e*A[1][i-1]*(lambda-A[1][i-1]*A[1][i-1]);
for (k=2; k<997; k++) {
A[k][i]= A[k][i-1] + e/(h*h*h*h)*g*g*(A[k+2][i-1] - 4.0*A[k+1][i-1] + 6.0*A[k][i-1]-4.0*A[k-1][i-1] + A[k-2][i-1]) + 2.0*g*e/(h*h)*(A[k+1][i-1] - 2*A[k][i-1] + A[k-1][i-1]) + e*A[k][i-1]*(lambda-A[k][i-1]*A[k][i-1]);
}
A[997][i] = A[997][i-1] + e/(h*h*h*h)*g*g*(A[0][i-1] - 4*A[998][i-1] + 6*A[997][i-1] - 4*A[996][i-1] + A[995][i-1]) + 2.0*g*e/(h*h)*(A[998][i-1] - 2*A[997][i-1] + A[996][i-1]) + e*A[997][i-1]*(lambda-A[997][i-1]*A[997][i-1]);
A[998][i] = A[998][i-1] + e/(h*h*h*h)*g*g*(A[1][i-1] - 4*A[0][i-1] + 6*A[998][i-1] - 4*A[997][i-1] + A[996][i-1]) + 2.0*g*e/(h*h)*(A[0][i-1] - 2*A[998][i-1] + A[997][i-1]) + e*A[998][i-1]*(lambda-A[998][i-1]*A[998][i-1]);
A[999][i]=A[0][i];
}
}
main() {
long int i,j;
Inicio(F);
Iteration(F);
file = fopen("P1.txt","wt");
for (i=0; i<1000000; i++) {
for (j=0; j<1000; j++) {
fprintf(file,"%lf \t %.4f \t %lf\n", 1.0*j/10.0, 1.0*i, F[j][i]);
}
}
fclose(file);
}
Thanks for your time.
This declaration:
double F[1000][1000000];
would occupy 8 * 1000 * 1000000 bytes on a typical x86 system. This is about 7.45 GB. Chances are your system is running out of memory when trying to execute your code, which results in a segmentation fault.
Your array is occupying roughly 8 GB of memory (1,000 x 1,000,000 x sizeof(double) bytes). That might be a factor in your problem. It is a global variable rather than a stack variable, so you may be OK, but you're pushing limits here.
Writing that much data to a file is going to take a while.
You don't check that the file was opened successfully, which could be a source of trouble, too (if it did fail, a segmentation fault is very likely).
You really should introduce some named constants for 1,000 and 1,000,000; what do they represent?
You should also write a function to do the calculation; you could use an inline function in C99 or later (or C++). The repetition in the code is excruciating to behold.
You should also use C99 notation for main(), with the explicit return type (and preferably void for the argument list when you are not using argc or argv):
int main(void)
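As a tiny illustration of the fopen() check suggested above (a sketch only; P1.txt is the filename from the question):
FILE *file = fopen("P1.txt", "wt");
if (file == NULL) {            /* writing through a NULL FILE* is undefined behaviour */
    perror("P1.txt");
    return 1;
}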
Out of idle curiosity, I took a copy of your code, changed all occurrences of 1000 to ROWS, all occurrences of 1000000 to COLS, and then created enum { ROWS = 1000, COLS = 10000 }; (thereby reducing the problem size by a factor of 100). I made a few minor changes so it would compile cleanly under my preferred set of compilation options (nothing serious: static in front of the functions, and the main array; file becomes a local to main; error check the fopen(), etc.).
I then created a second copy and created an inline function to do the repeated calculation (and a second one to do subscript calculations). This means that the monstrous expression is written out only once, which is highly desirable because it ensures consistency.
#include <stdio.h>
#define lambda 2.0
#define g 1.0
#define F0 1.0
#define h 0.1
#define e 0.00001
enum { ROWS = 1000, COLS = 10000 };
static double F[ROWS][COLS];
static void Inicio(double D[ROWS][COLS])
{
for (int i = 399; i < 600; i++) // Magic numbers!!
D[i][0] = F0;
}
enum { R = ROWS - 1 };
static inline int ko(int k, int n)
{
int rv = k + n;
if (rv >= R)
rv -= R;
else if (rv < 0)
rv += R;
return(rv);
}
static inline void calculate_value(int i, int k, double A[ROWS][COLS])
{
int ks2 = ko(k, -2);
int ks1 = ko(k, -1);
int kp1 = ko(k, +1);
int kp2 = ko(k, +2);
A[k][i] = A[k][i-1]
+ e/(h*h*h*h) * g*g * (A[kp2][i-1] - 4.0*A[kp1][i-1] + 6.0*A[k][i-1] - 4.0*A[ks1][i-1] + A[ks2][i-1])
+ 2.0*g*e/(h*h) * (A[kp1][i-1] - 2*A[k][i-1] + A[ks1][i-1])
+ e * A[k][i-1] * (lambda - A[k][i-1] * A[k][i-1]);
}
static void Iteration(double A[ROWS][COLS])
{
for (int i = 1; i < COLS; i++)
{
for (int k = 0; k < R; k++)
calculate_value(i, k, A);
A[999][i] = A[0][i];
}
}
int main(void)
{
FILE *file = fopen("P2.txt","wt");
if (file == 0)
return(1);
Inicio(F);
Iteration(F);
for (int i = 0; i < COLS; i++)
{
for (int j = 0; j < ROWS; j++)
{
fprintf(file,"%lf \t %.4f \t %lf\n", 1.0*j/10.0, 1.0*i, F[j][i]);
}
}
fclose(file);
return(0);
}
This program writes to P2.txt instead of P1.txt. I ran both programs and compared the output files; the output was identical. When I ran the programs on a mostly idle machine (MacBook Pro, 2.3 GHz Intel Core i7, 16 GiB 1333 MHz RAM, Mac OS X 10.7.5, GCC 4.7.1), I got reasonably but not wholly consistent timing:
Original Modified
6.334s 6.367s
6.241s 6.231s
6.315s 10.778s
6.378s 6.320s
6.388s 6.293s
6.285s 6.268s
6.387s 10.954s
6.377s 6.227s
8.888s 6.347s
6.304s 6.286s
6.258s 10.302s
6.975s 6.260s
6.663s 6.847s
6.359s 6.313s
6.344s 6.335s
7.762s 6.533s
6.310s 9.418s
8.972s 6.370s
6.383s 6.357s
However, almost all that time is spent on disk I/O. I reduced the disk I/O to just the very last row of data, so the outer I/O for loop became:
for (int i = COLS - 1; i < COLS; i++)
With that change, the timings were vastly reduced and much more consistent:
Original Modified
0.168s 0.165s
0.145s 0.165s
0.165s 0.166s
0.164s 0.163s
0.151s 0.151s
0.148s 0.153s
0.152s 0.171s
0.165s 0.165s
0.173s 0.176s
0.171s 0.165s
0.151s 0.169s
The simplification in the code from having the ghastly expression written out just once is very beneficial, it seems to me. I'd certainly far rather have to maintain that program than the original.
What system are you running on? Do you have access to some sort of debugger (gdb, visual studio's debugger, etc.)?
That would give us valuable information, like the line of code where the program crashes... Also, the amount of memory may be prohibitive.
Additionally, may I recommend that you replace the numeric limits by named definitions?
As such:
#define DIM1_SZ 1000
#define DIM2_SZ 1000000
Use those whenever you wish to refer to the array dimension limits. It will help avoid typing errors.
Run your program under valgrind, or link it with efence. That will tell you where the bad pointer is being dereferenced, and will most likely fix your problem if you fix all the errors they tell you about.

Jacobi method works with double, fails with float. What is wrong?

I coded a little program to solve a system of n equations with the Jacobi (iterational) method. Below is the code:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
int main() {
float *a, *b, *x, *xnew, *xorg, temp;
int i, j, k, maxiter=10000000, n=4;
a = malloc(n*n*sizeof(*a));
b = malloc(n*sizeof(*b));
x = malloc(n*sizeof(*x));
xnew = malloc(n*sizeof(*xnew));
xorg = malloc(n*sizeof(*xorg)); /* debugging copy of the initial x (see the note at the end) */
srand((unsigned) time(NULL));
// Filling the matrix
for (i=0;i<=n-1;i++) {
for (j=0;j<=n-1;j++) {
a[n*i+j] = rand()%60;
}
b[i] = rand();
x[i] = rand();
xorg[i]=x[i];
}
// Establishing diagonal dominance
for (i=0;i<=n-1;i++) {
temp=0;
for (j=0;j<=n-1;j++) {
if (j==i) {continue;}
temp = temp + a[n*i+j];
}
a[n*i+i] = temp+1;
}
// Solve the system. Break when residue is low
for (k=0;k<=maxiter-1;k++) {
for (i=0;i<=n-1;i++) {
temp=0;
for (j=0;j<=n-1;j++) {
if (j==i) {continue;}
temp = temp + a[n*i+j]*x[j];
}
xnew[i] = (b[i]-temp)/a[n*i+i];
}
temp=0;
for (i=0;i<=n-1;i++) {
temp = temp + fabs(x[i]-xnew[i]);
x[i]=xnew[i];
}
if (temp<0.0001) {
break;
}
}
printf("Iterations = %d\n",k-1);
return 0;
}
The break-out-of-the-loop criterion is ridiculously easy to satisfy. This program should never fail. Yet it apparently does not converge (it uses up all the iterations in the loop) unless I change the floats to doubles. Floats have much greater precision than this requires. What is wrong?
Compiled with CodeBlocks 16.01 under Windows 7, if that even matters.
The exit test if (temp < 0.0001) is too fine a requirement, given float precision and the magnitude of the values involved.
Tried a different limit approach by adding the ULP of the difference of x[i] and xnew[i].
#include <assert.h>
#include <stdint.h>
static uint32_t ULPf(float x) {
    union {
        float f;
        uint32_t u32;
    } u;
    assert(sizeof(float) == sizeof(uint32_t));
    u.f = x;
    if (u.u32 & 0x80000000) {
        u.u32 ^= 0x80000000;
        return 0x80000000 - u.u32;
    }
    return u.u32 + 0x80000000;
}

static uint32_t ULP_diff(float x, float y) {
    uint32_t ullx = ULPf(x);
    uint32_t ully = ULPf(y);
    if (x > y) return ullx - ully;
    return ully - ullx;
}
...
uint64_t sum0 = -1;
unsigned increase = 0;
for (k = 0; k <= maxiter - 1; k++) {
    ...
    uint64_t sum = 0;
    for (i = 0; i <= n - 1; i++) {
        uint32_t e = ULP_diff(x[i], xnew[i]);
        // printf("%u %e %e %llu\n", i, x[i], xnew[i], (unsigned long long) e);
        sum += e;
        x[i] = xnew[i];
    }
    if (sum < sum0) {
        // answer is converging
        sum0 = sum;
        increase = 0;
    } else {
        increase++;
        // If failed to find a better answer in `n` iterations and
        // code did at least n*N iterations, break.
        if (increase > n && k > n*n) break;
    }
It appears that the float datatype does not have the required precision for the above algorithm, given the way it was coded. The algorithm does converge, but the "residue" will never get low enough to exit the loop.
The way I understand this is that due to the way float variables are stored internally, you cannot do computation with extremely small (0.0001) and extremely large (RAND_MAX) numbers and expect reasonable accuracy, as in the above example (temp grows to a huge number in the innermost loop).
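A three-line experiment shows the absorption effect being described (a sketch; the exact magnitudes do not matter much):
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    float big = (float)RAND_MAX;   /* the kind of magnitude rand() feeds into the sums */
    float sum = big + 0.0001f;     /* 0.0001 is far below one ULP at this magnitude */
    printf("%d\n", sum == big);    /* prints 1: the small term was silently absorbed */
    return 0;
}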
Consequently, setting b[i] = rand()%60; and x[i] = rand()%60; will alleviate the problem.
Setting b[i] = rand()%6; x[i] = rand()%6; and a[n*i+j] = rand()%6 will allow an even tighter exit-loop condition to be eventually met.
Interestingly, establishing greater diagonal dominance (changing a[n*i+i] = temp+1; to a[n*i+i] = temp+10;) will also make the program converge when it previously would not.
I was not familiar with the ULP conditions described by others, but will invest some time into that.
If future readers of this have the time and energy, maybe they should read "What Every Computer Scientist Should Know About Floating-Point Arithmetic", even though I have not.
BTW, xorg was to store the original x vector, for debugging purposes, because I was having a terribly hard time getting CodeBlocks to debug
Thanks to all for contributing.

DTRMM & DTRSM hangs on certain matrix sizes

I'm testing the performance of ?GEMM, ?TRMM, and ?TRSM using MKL's automatic offload on the new Intel Xeon Phi coprocessors and am having some issues with DTRMM and DTRSM. I have code to test the performance for matrix sizes in steps of 1024 up to 10240, and performance seems to drop off significantly somewhere after N=M=K=8192. When I tried to pin down exactly where by using step sizes of 2, my script hung. I then checked step sizes of 512, which work fine; 256 works as well, but anything under 256 just stalls. I cannot find any known issues regarding this problem. All single-precision versions work, as do single and double precision ?GEMM. Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include "mkl.h"
#define DBG 0
int main(int argc, char **argv)
{
char transa = 'N', side = 'L', uplo = 'L', diag = 'U';
MKL_INT N, NP; // N = M, N, K, lda, ldb, ldc
double alpha = 1.0; // Scaling factors
double *A, *B; // Matrices
int matrix_bytes; // Matrix size in bytes
int matrix_elements; // Matrix size in elements
int i, j; // Counters
int msec;
clock_t start, diff;
N = atoi(argv[1]);
start = clock();
matrix_elements = N * N;
matrix_bytes = sizeof(double) * matrix_elements;
// Allocate the matrices
A = malloc(matrix_bytes);
if (A == NULL)
{
printf("Could not allocate matrix A\n");
return -1;
}
B = malloc(matrix_bytes);
if (B == NULL)
{
printf("Could not allocate matrix B\n");
return -1;
}
for (i = 0; i < matrix_elements; i++)
{
A[i] = 0.0;
B[i] = 0.0;
}
// Initialize the matrices
for (i = 0; i < N; i++)
for (j = 0; j <= i; j++)
{
A[i+N*j] = 1.0;
B[i+N*j] = 2.0;
}
// DTRMM call
dtrmm(&side, &uplo, &transa, &diag, &N, &N, &alpha, A, &N, B, &N);
diff = clock() - start;
msec = diff * 1000 / CLOCKS_PER_SEC;
printf("%f\n", (float)msec * 10e-4);
if (DBG == 1)
{
printf("\nMatrix dimension is set to %d \n\n", (int)N);
// Display the result
printf("\nResulting matrix B:\n");
if (N > 10)
{
printf("NOTE: B is too large, print only upper-left 10x10 block...\n");
NP = 10;
}
else
NP = N;
printf("\n");
for (i = 0; i < NP; i++)
{
for (j = 0; j < NP; j++)
printf("%7.3f ", B[i + j * N]);
printf("\n");
}
}
// Free the matrix memory
free(A);
free(B);
return 0;
}
Any help or insight would be greatly appreciated.
This phenomenon has been extensively discussed in other questions, and also in Intel's Software Optimization Manual and Agner Fog's notes.
Typically, you are experiencing a perfect storm of evictions in the memory hierarchy, such that suddenly (nearly) every single access misses cache and/or TLB (one can determine exactly which resource is missing by looking at the specific data access pattern or by using the PMCs; I can do the calculation later when I'm near a whiteboard, unless mystical gets to you first).
You can also search through some of my or Mystical's answers to find previous answers.
The issue was an older version of Intel's icc compiler (beta 10 update, I believe.. maybe). Gold update works like a charm.

1D Min-convolution in CUDA

I have two arrays, a and b, and I would like to compute the "min convolution" to produce result c. Simple pseudo code looks like the following:
for i = 0 to size(a)+size(b)
c[i] = inf
for j = 0 to size(a)
if (i - j >= 0) and (i - j < size(b))
c[i] = min(c[i], a[j] + b[i-j])
(edit: changed loops to start at 0 instead of 1)
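For concreteness, a direct C translation of that pseudocode looks like this (a sketch; handy as a CPU baseline for validating a GPU version, with FLT_MAX standing in for inf):
#include <float.h>

/* c must have room for size(a) + size(b) entries, matching the i range above. */
static void min_convolve(const float *a, int sizea,
                         const float *b, int sizeb, float *c)
{
    for (int i = 0; i < sizea + sizeb; i++) {
        float best = FLT_MAX;
        for (int j = 0; j < sizea; j++) {
            int t = i - j;
            if (t >= 0 && t < sizeb) {
                float v = a[j] + b[t];
                if (v < best) best = v;
            }
        }
        c[i] = best;
    }
}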
If the min were instead a sum, we could use a Fast Fourier Transform (FFT), but in the min case, there is no such analog. Instead, I'd like to make this simple algorithm as fast as possible by using a GPU (CUDA). I'd be happy to find existing code that does this (or code that implements the sum case without FFTs, so that I could adapt it for my purposes), but my search so far hasn't turned up any good results. My use case will involve a's and b's that are of size between 1,000 and 100,000.
Questions:
Does code to do this efficiently already exist?
If I am going to implement this myself, structurally, how should the CUDA kernel look so as to maximize efficiency? I've tried a simple solution where each c[i] is computed by a separate thread, but this doesn't seem like the best way. Any tips in terms of how to set up thread block structure and memory access patterns?
An alternative which might be useful for large a and b would be to use a block per output entry in c. Using a block allows for memory coalescing, which will be important in what is a memory bandwidth limited operation, and a fairly efficient shared memory reduction can be used to combine per thread partial results into a final per block result. Probably the best strategy is to launch as many blocks per MP as will run concurrently and have each block emit multiple output points. This eliminates some of the scheduling overheads associated with launching and retiring many blocks with relatively low total instruction counts.
An example of how this might be done:
#include <math.h>
template<int bsz>
__global__ __launch_bounds__(512)
void minconv(const float *a, int sizea, const float *b, int sizeb, float *c)
{
__shared__ volatile float buff[bsz];
for(int i = blockIdx.x; i<(sizea + sizeb); i+=(gridDim.x*blockDim.x)) {
float cval = INFINITY;
for(int j=threadIdx.x; j<sizea; j+= blockDim.x) {
int t = i - j;
if ((t>=0) && (t<sizeb))
cval = min(cval, a[j] + b[t]);
}
buff[threadIdx.x] = cval; __syncthreads();
if (bsz > 256) {
if (threadIdx.x < 256)
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+256]);
__syncthreads();
}
if (bsz > 128) {
if (threadIdx.x < 128)
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+128]);
__syncthreads();
}
if (bsz > 64) {
if (threadIdx.x < 64)
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+64]);
__syncthreads();
}
if (threadIdx.x < 32) {
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+32]);
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+16]);
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+8]);
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+4]);
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+2]);
buff[threadIdx.x] = min(buff[threadIdx.x], buff[threadIdx.x+1]);
if (threadIdx.x == 0) c[i] = buff[0];
}
}
}
// Instances for all valid block sizes.
template __global__ void minconv<64>(const float *, int, const float *, int, float *);
template __global__ void minconv<128>(const float *, int, const float *, int, float *);
template __global__ void minconv<256>(const float *, int, const float *, int, float *);
template __global__ void minconv<512>(const float *, int, const float *, int, float *);
[disclaimer: not tested or benchmarked, use at own risk]
This is single precision floating point, but the same idea should work for double precision floating point. For integer, you would need to replace the C99 INFINITY macro with something like INT_MAX or LONG_MAX, but the principle remains the same otherwise.
A faster version:
__global__ void convAgB(double *a, double *b, double *c, int sa, int sb)
{
int i = (threadIdx.x + blockIdx.x * blockDim.x);
int idT = threadIdx.x;
int out,j;
__shared__ double c_local [512];
c_local[idT] = c[i];
out = (i > sa) ? sa : i + 1;
j = (i > sb) ? i - sb + 1 : 1;
for(; j < out; j++)
{
if(c_local[idT] > a[j] + b[i-j])
c_local[idT] = a[j] + b[i-j];
}
c[i] = c_local[idT];
}
Benchmark:
Size A Size B Size C Time (s)
1000 1000 2000 0.0008
10k 10k 20k 0.0051
100k 100k 200k 0.3436
1M 1M 1M 43,327
Old Version,
For sizes between 1000 and 100000, I tested with this naive version:
__global__ void convAgB(double *a, double *b, double *c, int sa, int sb)
{
int size = sa+sb;
int idT = (threadIdx.x + blockIdx.x * blockDim.x);
int out,j;
for(int i = idT; i < size; i += blockDim.x * gridDim.x)
{
if(i > sa) out = sa;
else out = i + 1;
if(i > sb) j = i - sb + 1;
else j = 1;
for(; j < out; j++)
{
if(c[i] > a[j] + b[i-j])
c[i] = a[j] + b[i-j];
}
}
}
I populated the arrays a and b with some random double numbers, and c with 999999 (just for testing). I validated the c array (on the CPU) using your function (without any modifications).
I also removed the conditionals from inside the inner loop, so they are only tested once.
I am not 100% sure, but I think the following modification makes sense. Since you had i - j >= 0, which is the same as i >= j, once j > i the block 'X' below will never be entered again (since j only increases):
if(c[i] > a[j] + b[i-j])
c[i] = a[j] + b[i-j];
So I precomputed, in the variable out, where the loop has to stop: if i > sa the loop finishes when j == sa; otherwise it finishes (earlier) at i + 1, because of the condition i >= j.
The other condition, i - j < size(b), means the block 'X' only starts executing once j > i - size(b); since j always starts at 1, we can initialize j directly to the first useful value, thus
if(i > sb) j = i - sb + 1;
else j = 1;
See if you can test this version with real arrays of data, and give me feedback. Also, any improvements are welcome.
EDIT : A new optimization can be implemented, but this one does not make much of a difference.
if(c[i] > a[j] + b[i-j])
c[i] = a[j] + b[i-j];
we can eliminate the if, by:
double add;
...
for(; j < out; j++)
{
add = a[j] + b[i-j];
c[i] = (c[i] < add) * c[i] + (add <= c[i]) * add;
}
Having:
if(a > b) c = b;
else c = a;
is the same as having c = (a < b) * a + (b <= a) * b.
if a > b then c = 0 * a + 1 * b; => c = b;
if a <= b then c = 1*a + 0 *b; => c = a;
Benchmark:
Size A Size B Size C Time (s)
1000 1000 2000 0.0013
10k 10k 20k 0.0051
100k 100k 200k 0.4436
1M 1M 1M 47,327
I am measuring the time of copying from CPU to GPU, running the kernel and copying from GPU to CPU.
GPU Specifications
Device Tesla C2050
CUDA Capability Major/Minor 2.0
Global Memory 2687 MB
Cores 448 CUDA Cores
Warp size 32
I have used your algorithm. I think it'll help you.
const int Length=1000;
__global__ void OneD(float *Ad,float *Bd,float *Cd){
    int i=blockIdx.x;
    int j=threadIdx.x;
    Cd[i]=99999.99;
    for(int k=0;k<Length/500;k++){
        while(((i-j)>=0)&&(i-j<Length)&&Cd[i+k*Length]>Ad[j+k*Length]+Bd[i-j]){
            Cd[i+k*Length]=Ad[j+k*Length]+Bd[i-j];
        }
    }
}
I have taken 500 threads per block and 500 blocks per grid. As the number of threads per block on my device is restricted to 512, I used 500 threads. I have taken the size of all the arrays as Length (= 1000).
Working:
i stores the Block Index and j stores the Thread Index.
The for loop is used because the number of threads is less than the size of the arrays.
The while loop is used for iterating Cd[n].
I have not used shared memory because I have taken lots of blocks and threads, so the amount of shared memory required for each block is low.
PS: If your device supports more Threads and Blocks, replace k<Length/500 with k<Length/(supported number of threads)

LU Decomposition from Numerical Recipes not working; what am I doing wrong?

I've literally copied and pasted from the supplied source code of Numerical Recipes in C for in-place LU matrix decomposition; the problem is that it's not working.
I'm sure I'm doing something stupid, but would appreciate anyone being able to point me in the right direction on this; I've been working on it all day and can't see what I'm doing wrong.
POST-ANSWER UPDATE: The project is finished and working. Thanks to everyone for their guidance.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define MAT1 3
#define TINY 1e-20
int h_NR_LU_decomp(float *a, int *indx){
//Taken from Numerical Recipes in C
int i,imax,j,k;
float big,dum,sum,temp;
int n=MAT1;
float vv[MAT1];
int d=1.0;
//Loop over rows to get implicit scaling info
for (i=0;i<n;i++) {
big=0.0;
for (j=0;j<n;j++)
if ((temp=fabs(a[i*MAT1+j])) > big)
big=temp;
if (big == 0.0) return -1; //Singular Matrix
vv[i]=1.0/big;
}
//Outer kij loop
for (j=0;j<n;j++) {
for (i=0;i<j;i++) {
sum=a[i*MAT1+j];
for (k=0;k<i;k++)
sum -= a[i*MAT1+k]*a[k*MAT1+j];
a[i*MAT1+j]=sum;
}
big=0.0;
//search for largest pivot
for (i=j;i<n;i++) {
sum=a[i*MAT1+j];
for (k=0;k<j;k++) sum -= a[i*MAT1+k]*a[k*MAT1+j];
a[i*MAT1+j]=sum;
if ((dum=vv[i]*fabs(sum)) >= big) {
big=dum;
imax=i;
}
}
//Do we need to swap any rows?
if (j != imax) {
for (k=0;k<n;k++) {
dum=a[imax*MAT1+k];
a[imax*MAT1+k]=a[j*MAT1+k];
a[j*MAT1+k]=dum;
}
d = -d;
vv[imax]=vv[j];
}
indx[j]=imax;
if (a[j*MAT1+j] == 0.0) a[j*MAT1+j]=TINY;
for (k=j+1;k<n;k++) {
dum=1.0/(a[j*MAT1+j]);
for (i=j+1;i<n;i++) a[i*MAT1+j] *= dum;
}
}
return 0;
}
void main(){
//3x3 Matrix
float exampleA[]={1,3,-2,3,5,6,2,4,3};
//pivot array (not used currently)
int* h_pivot = (int *)malloc(sizeof(int)*MAT1);
int retval = h_NR_LU_decomp(&exampleA[0],h_pivot);
for (unsigned int i=0; i<3; i++){
printf("\n%d:",h_pivot[i]);
for (unsigned int j=0;j<3; j++){
printf("%.1lf,",exampleA[i*3+j]);
}
}
}
WolframAlpha says the answer should be
1,3,-2
2,-2,7
3,2,-2
I'm getting:
2,4,3
0.2,2,-2.8
0.8,1,6.5
And so far I have found at least 3 different versions of the 'same' algorithm, so I'm completely confused.
PS yes I know there are at least a dozen different libraries to do this, but I'm more interested in understanding what I'm doing wrong than the right answer.
PPS: since in LU decomposition the lower resultant matrix has a unit diagonal, and since with Crout's algorithm as (I think) implemented here array index access is still safe, both L and U can be superimposed on each other in place; hence the single resultant matrix.
I think there's something inherently wrong with your indices. They sometimes have unusual start and end values, and the outer loop over j instead of i makes me suspicious.
Before you ask anyone to examine your code, here are a few suggestions:
double-check your indices
get rid of those obfuscation attempts using sum
use a macro a(i,j) instead of a[i*MAT1+j]
write sub-functions instead of comments
remove unnecessary parts, isolating the erroneous code
Here's a version that follows these suggestions:
#define MAT1 3
#define a(i,j) a[(i)*MAT1+(j)]
int h_NR_LU_decomp(float *a, int *indx)
{
    int i, j, k;
    int n = MAT1;

    for (i = 0; i < n; i++) {
        // compute R
        for (j = i; j < n; j++)
            for (k = 0; k < i-2; k++)
                a(i,j) -= a(i,k) * a(k,j);
        // compute L
        for (j = i+1; j < n; j++)
            for (k = 0; k < i-2; k++)
                a(j,i) -= a(j,k) * a(k,i);
    }
    return 0;
}
Its main advantages are:
it's readable
it works
It lacks pivoting, though. Add sub-functions as needed.
My advice: don't copy someone else's code without understanding it.
Most programmers are bad programmers.
For the love of all that is holy, don't use Numerical Recipes code for anything except as a toy implementation for teaching purposes of the algorithms described in the text -- and, really, the text isn't that great. And, as you're learning, neither is the code.
Certainly don't put any Numerical Recipes routine in your own code -- the license is insanely restrictive, particularly given the code quality. You won't be able to distribute your own code if you have NR stuff in there.
See if your system already has a LAPACK library installed. It's the standard interface to linear algebra routines in computational science and engineering, and while it's not perfect, you'll be able to find lapack libraries for any machine you ever move your code to, and you can just compile, link, and run. If it's not already installed on your system, your package manager (rpm, apt-get, fink, port, whatever) probably knows about lapack and can install it for you. If not, as long as you have a Fortran compiler on your system, you can download and compile it from here, and the standard C bindings can be found just below on the same page.
The reason it's so handy to have a standard API to linear algebra routines is that they are so common, but their performance is very system-dependent. So, for instance, Goto BLAS is an insanely fast implementation for x86 systems of the low-level operations which are needed for linear algebra; once you have LAPACK working, you can install that library to make everything as fast as possible.
Once you have any sort of LAPACK installed, the routine for doing an LU factorization of a general matrix is SGETRF for floats, or DGETRF for doubles. There are other, faster routines if you know something about the structure of the matrix - that it's symmetric positive definite, say (SPOTRF), or that it's tridiagonal (SGTTRF). It's a big library, but once you learn your way around it you'll have a very powerful piece of gear in your numerical toolbox.
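For illustration, a single-call LU factorization of the question's 3x3 matrix through the LAPACKE C interface might look like this (a sketch; it assumes liblapacke is installed and that you link with -llapacke -llapack):
#include <stdio.h>
#include <lapacke.h>

int main(void)
{
    /* The 3x3 matrix from the question, stored row-major. */
    double a[9] = { 1, 3, -2,
                    3, 5,  6,
                    2, 4,  3 };
    lapack_int ipiv[3];

    /* DGETRF: PA = LU with partial pivoting; L and U overwrite a in place. */
    lapack_int info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, 3, 3, a, 3, ipiv);
    if (info != 0) {
        fprintf(stderr, "dgetrf failed, info = %d\n", (int)info);
        return 1;
    }
    for (int i = 0; i < 3; i++)
        printf("%8.4f %8.4f %8.4f   pivot %d\n",
               a[3*i], a[3*i+1], a[3*i+2], (int)ipiv[i]);
    return 0;
}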
The thing that looks most suspicious to me is the part marked "search for largest pivot". This not only searches but also changes the matrix A. I find it hard to believe that is correct.
The different versions of the LU algorithm differ in pivoting, so make sure you understand that. You cannot compare the results of different algorithms. A better check is to see whether L times U equals your original matrix, or a permutation thereof if your algorithm does pivoting. That being said, your result is wrong because the determinant is wrong (pivoting does not change the determinant, except for the sign).
Apart from that, @Philip has good advice. If you want to understand the code, start by understanding LU decomposition without pivoting.
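As a concrete version of that check, a hypothetical helper (not from the question) could rebuild L times U from the in-place result, undo the recorded row swaps, and compare against a saved copy of the original matrix; a sketch, assuming the packed convention of the question's routine (unit-diagonal L strictly below the diagonal, U on and above it) and its indx[] pivot record:
#include <math.h>

#define MAT1 3

/* Returns 1 if L*U (after undoing the swaps in indx[]) matches orig[]. */
static int check_lu(const float *lu, const int *indx, const float *orig)
{
    int n = MAT1, i, j, k;
    float m[MAT1 * MAT1];

    for (i = 0; i < n; i++)                     /* m = L * U */
        for (j = 0; j < n; j++) {
            float sum = (i <= j) ? lu[i * n + j] : 0.0f;   /* k = i term: L(i,i)=1 times U(i,j) */
            for (k = 0; k < i && k <= j; k++)
                sum += lu[i * n + k] * lu[k * n + j];      /* L(i,k) * U(k,j) */
            m[i * n + j] = sum;
        }
    for (j = n - 1; j >= 0; j--)                /* undo the recorded row swaps */
        if (indx[j] != j)
            for (k = 0; k < n; k++) {
                float t = m[j * n + k];
                m[j * n + k] = m[indx[j] * n + k];
                m[indx[j] * n + k] = t;
            }
    for (i = 0; i < n * n; i++)                 /* compare against the original */
        if (fabsf(m[i] - orig[i]) > 1e-4f) return 0;
    return 1;
}
You would call it right after h_NR_LU_decomp, having saved a copy of exampleA beforehand to pass as the third argument.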
To badly paraphrase Albert Einstein:
... a man with a watch always knows the exact time, but a man with two is never sure ...
Your code is definitely not producing the correct result, but even if it were, the result with pivoting will not directly correspond to the result without pivoting. In the context of a pivoting solution, what Alpha has really given you is probably the equivalent of this:
1 0 0 1 0 0 1 3 -2
P= 0 1 0 L= 2 1 0 U = 0 -2 7
0 0 1 3 2 1 0 0 -2
which will then satisfy the condition A = P.L.U (where . denotes the matrix product). If I compute the (notionally) same decomposition operation another way (using the LAPACK routine dgetrf via numpy in this case):
In [27]: A
Out[27]:
array([[ 1, 3, -2],
[ 3, 5, 6],
[ 2, 4, 3]])
In [28]: import scipy.linalg as la
In [29]: LU,ipivot = la.lu_factor(A)
In [30]: print LU
[[ 3. 5. 6. ]
[ 0.33333333 1.33333333 -4. ]
[ 0.66666667 0.5 1. ]]
In [31]: print ipivot
[1 1 2]
After a little bit of black magic with ipivot we get
0 1 0 1 0 0 3 5 6
P = 0 0 1 L = 0.33333 1 0 U = 0 1.3333 -4
1 0 0 0.66667 0.5 1 0 0 1
which also satisfies A = P.L.U . Both of these factorizations are correct, but they are different and they won't correspond to a correctly functioning version of the NR code.
So before you can go deciding whether you have the "right" answer, you really should spend a bit of time understanding the actual algorithm that the code you copied implements.
This thread has been viewed 6k times in the past 10 years. I had used NR Fortran and C for many years, and do not share the low opinions expressed here.
I explored the issue you encountered, and I believe the problem in your code is here:
for (k=j+1;k<n;k++) {
dum=1.0/(a[j*MAT1+j]);
for (i=j+1;i<n;i++) a[i*MAT1+j] *= dum;
}
whereas the original uses if (j != n-1) { ... } here. I think the two are not equivalent.
NR's lubksb() does have a small issue in the way they set up finding the first non-zero element, but this can be skipped at very low cost, even for a large matrix. With that, both ludcmp() and lubksb(), entered as published, work just fine, and as far as I can tell perform well.
Here's a complete test code, mostly preserving the notation of NR, with minor simplifications (tested under Ubuntu Linux/gcc):
/* A sample program to demonstrate matrix inversion using the
* Crout's algorithm from Teukolsky and Press (Numerical Recipes):
* LU decomposition + back-substitution, with partial pivoting
* 2022.06 edward.sternin at brocku.ca
*/
#define N 7
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define a(i,j) a[(i)*n+(j)]
/* implied 1D layout is a(0,0), a(0,1), ... a(0,n-1), a(1,0), a(1,1), ... */
void matrixPrint (double *M, int nrow, int ncol) {
int i,j;
for (i=0;i<nrow;i++) {
for (j=0;j<ncol;j++) { fprintf(stderr," %+.3f\t",M[i*ncol+j]); }
fprintf(stderr,"\n");
}
}
void die(char msg[]) {
fprintf(stderr,"ERROR in %s, aborting\n",msg);
exit(1);
}
void ludcmp(double *a, int n, int *indx) {
int i, imax, j, k;
double big, dum, sum, temp;
double *vv;
/* i=row index, i=0..(n-1); j=col index, j=0..(n-1) */
vv=(double *)malloc((size_t)(n * sizeof(double)));
if (!vv) die("ludcmp: allocation failure");
for (i = 0; i < n; i++) { /* loop over rows */
big = 0.0;
for (j = 0; j < n; j++) {
if ((temp=fabs(a(i,j))) > big) big=temp;
}
if (big == 0.0) die("ludcmp: a singular matrix provided");
vv[i] = 1.0 / big; /* vv stores the scaling factor for each row */
}
for (j = 0; j < n; j++) { /* Crout's method: loop over columns */
for (i = 0; i < j; i++) { /* except for i=j */
sum = a(i,j);
for (k = 0; k < i; k++) { sum -= a(i,k) * a(k,j); }
a(i,j) = sum; /* Eq. 2.3.12, in situ */
}
big = 0.0; /* searching for the largest pivot element */
for (i = j; i < n; i++) {
sum = a(i,j);
for (k = 0; k < j; k++) { sum -= a(i,k) * a(k,j); }
a(i,j) = sum;
if ((dum = vv[i] * fabs(sum)) >= big) {
big = dum;
imax = i;
}
}
if (j != imax) { /* if needed, interchange rows */
for (k = 0; k < n; k++){
dum = a(imax,k);
a(imax,k) = a(j,k);
a(j,k) = dum;
}
vv[imax] = vv[j]; /* keep the scale factor with the new row location */
}
indx[j] = imax;
if (j != n-1) { /* divide by the pivot element */
dum = 1.0 / a(j,j);
for (i = j + 1; i < n; i++) a(i,j) *= dum;
}
}
free(vv);
}
void lubksb(double *a, int n, int *indx, double *b) {
int i, ip, j;
double sum;
for (i = 0; i < n; i++) {
/* Forward substitution, Eq.2.3.6, unscrambling permutations from indx[] */
ip = indx[i];
sum = b[ip];
b[ip] = b[i];
for (j = 0; j < i; j++) sum -= a(i,j) * b[j];
b[i] = sum;
}
for (i = n-1; i >= 0; i--) { /* backsubstitution, Eq. 2.3.7 */
sum = b[i];
for (j = i + 1; j < n; j++) sum -= a(i,j) * b[j];
b[i] = sum / a(i,i);
}
}
int main() {
double *a,*y,*col,*aa,*res,sum;
int i,j,k,*indx;
a=(double *)malloc((size_t)(N*N * sizeof(double)));
y=(double *)malloc((size_t)(N*N * sizeof(double)));
col=(double *)malloc((size_t)(N * sizeof(double)));
indx=(int *)malloc((size_t)(N * sizeof(int)));
aa=(double *)malloc((size_t)(N*N * sizeof(double)));
res=(double *)malloc((size_t)(N*N * sizeof(double)));
if (!a || !y || !col || !indx || !aa || !res) die("main: memory allocation failure");
srand48((long int) N);
for (i=0;i<N;i++) {
for (j=0;j<N;j++) { aa[i*N+j] = a[i*N+j] = drand48(); }
}
fprintf(stderr,"\nRandomly generated matrix A = \n");
matrixPrint(a,N,N);
ludcmp(a,N,indx);
for(j=0;j<N;j++) {
for(i=0;i<N;i++) { col[i]=0.0; }
col[j]=1.0;
lubksb(a,N,indx,col);
for(i=0;i<N;i++) { y[i*N+j]=col[i]; }
}
fprintf(stderr,"\nResult of LU/BackSub is inv(A) :\n");
matrixPrint(y,N,N);
for (i=0; i<N; i++) {
for (j=0;j<N;j++) {
sum = 0;
for (k=0; k<N; k++) { sum += y[i*N+k] * aa[k*N+j]; }
res[i*N+j] = sum;
}
}
fprintf(stderr,"\nResult of inv(A).A = (should be 1):\n");
matrixPrint(res,N,N);
return(0);
}

Resources