CUDA C - how to use Texture2D for double precision floating point - c

I want to use 2D texture memory for double precision. I want to read from the texture into shared memory, convert the int2 values to double, and then transfer the result back to host memory. But I am getting only the first row as desired; every other row's value is 2.00000000.
#include<stdio.h>
#include<cuda.h>
#define Xdim 8
#define Ydim 8
texture<int2,2>me_texture;
/* Reassembles a double from the int2 fetched out of the texture:
 * p.x carries the low 32 bits and p.y the high 32 bits of the double. */
static __inline__ __device__ double fetch_double(int2 p){
return __hiloint2double(p.y, p.x);
}
/* Copies an Xdim x Ydim matrix of doubles out of the 2D texture into the
 * output buffer, staging through shared memory. Each double is stored in
 * the texture as an int2 and reassembled with fetch_double().
 *
 * o     - device output buffer allocated with cudaMallocPitch
 * pitch - row pitch of o expressed in doubles (pitch_bytes / sizeof(double))
 *
 * Expects a 2D launch covering at least Xdim x Ydim threads. */
__global__ void kern(double *o, int pitch){
    __shared__ double A[Xdim][Ydim];
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
    int2 jj;
    if(i<Xdim && j<Ydim){
        jj = tex2D(me_texture, i, j);
        A[threadIdx.x][threadIdx.y] = fetch_double(jj);
    }
    /* barrier is outside the guard so every thread of the block reaches it */
    __syncthreads();
    if(i<Xdim && j<Ydim){
        /* BUG FIX: o was allocated with cudaMallocPitch, so consecutive rows
         * are 'pitch' doubles apart, not Xdim doubles apart; the original
         * o[j*Xdim + i] wrote into the padding of previous rows. */
        o[j*pitch + i] = A[threadIdx.x][threadIdx.y];
    }
}
/* Demo driver: fills an 8x8 double matrix on the host, uploads it into a
 * pitched device buffer bound to an int2 2D texture, launches kern() to
 * read it back through the texture, and prints both matrices. */
int main(int argc, char *argv[]){
    double hbuf[Xdim][Ydim];   /* host input matrix */
    double hout[Xdim][Ydim];   /* host result copied back from the device */
    double *dob;               /* device output buffer (pitched) */
    double *dbuf;              /* device input buffer (pitched), bound to the texture */
    size_t pitch_bytes;
    /* both allocations have identical dimensions, so they share one pitch */
    cudaMallocPitch((void**)&dbuf, &pitch_bytes, sizeof(double)*Xdim, Ydim);
    cudaMallocPitch((void**)&dob, &pitch_bytes, sizeof(double)*Xdim, Ydim);
    hbuf[0][0] = 1.234567891234567;
    hbuf[0][1] = 12.34567891234567;
    hbuf[0][2] = 123.4567891234567;
    hbuf[0][3] = 1234.567891234567;
    hbuf[0][4] = 12345.67891234567;
    hbuf[0][5] = 123456.7891234567;
    hbuf[0][6] = 1234567.891234567;
    hbuf[0][7] = 12345678.91234567;
    hbuf[1][0] = 123456789.1234567;
    hbuf[1][1] = 1234567891.234567;
    hbuf[1][2] = 12345678912.34567;
    hbuf[1][3] = 123456789123.4567;
    hbuf[1][4] = 1234567891234.567;
    hbuf[1][5] = 12345678912345.67;
    hbuf[1][6] = 123456789123456.7;
    hbuf[1][7] = 1234567891234567;
    hbuf[2][0] = 123456789.7654321;
    hbuf[2][1] = 1234567897.654321;
    hbuf[2][2] = 12345678976.54321;
    hbuf[2][3] = 123456789765.4321;
    hbuf[2][4] = 1234567897654.321;
    hbuf[2][5] = 12345678976543.21;
    hbuf[2][6] = 123456789765432.1;
    hbuf[2][7] = 1234567897654321;
    hbuf[3][0] = 9.876543211234567;
    hbuf[3][1] = 98.76543211234567;
    hbuf[3][2] = 987.6543211234567;
    hbuf[3][3] = 9876.543211234567;
    hbuf[3][4] = 98765.43211234567;
    hbuf[3][5] = 987654.3211234567;
    hbuf[3][6] = 9876543.211234567;
    hbuf[3][7] = 98765432.11234567;
    hbuf[4][0] = 987654321.1234567;
    hbuf[4][1] = 9876543211.234567;
    hbuf[4][2] = 98765432112.34567;
    hbuf[4][3] = 987654321123.4567;
    hbuf[4][4] = 9876543211234.567;
    hbuf[4][5] = 98765432112345.67;
    hbuf[4][6] = 987654321123456.7;
    hbuf[4][7] = 9876543211234567;
    hbuf[5][0] = 987654321.7654321;
    hbuf[5][1] = 9876543217.654321;
    hbuf[5][2] = 98765432176.54321;
    hbuf[5][3] = 987654321765.4321;
    hbuf[5][4] = 9876543217654.321;
    hbuf[5][5] = 98765432176543.21;
    hbuf[5][6] = 987654321765432.1;
    hbuf[5][7] = 9876543217654321;
    hbuf[6][0] = 1234567891234567;
    hbuf[6][1] = 123456789123456.7;
    hbuf[6][2] = 12345678912345.67;
    hbuf[6][3] = 1234567891234.567;
    hbuf[6][4] = 123456789123.4567;
    hbuf[6][5] = 12345678912.34567;
    hbuf[6][6] = 1234567891.234567;
    hbuf[6][7] = 123456789.1234567;
    hbuf[7][0] = 12345678.91234567;
    hbuf[7][1] = 1234567.891234567;
    hbuf[7][2] = 123456.7891234567;
    hbuf[7][3] = 12345.67891234567;
    hbuf[7][4] = 1234.567891234567;
    hbuf[7][5] = 123.4567891234567;
    hbuf[7][6] = 12.34567891234567;
    hbuf[7][7] = 1.234567891234567;
    /* show the input so it can be compared with the copied-back result */
    for (int i=0; i<Xdim; i++){
        for(int j=0; j<Ydim; j++){
            printf("%.16f\t", hbuf[i][j]);
        }
        printf("\n");
    }
    cudaMemcpy2D(dbuf, pitch_bytes, hbuf, Xdim*sizeof(double), Xdim*sizeof(double), Ydim, cudaMemcpyHostToDevice);
    me_texture.addressMode[0] = cudaAddressModeClamp;
    me_texture.addressMode[1] = cudaAddressModeClamp;
    /* BUG FIX: cudaFilterModeLinear is only legal for float-valued textures;
     * with an int2 channel the kernel launch fails with an invalid filter
     * mode, so point sampling must be used. */
    me_texture.filterMode = cudaFilterModePoint;
    me_texture.normalized = false;
    cudaBindTexture2D(0, me_texture, dbuf, cudaCreateChannelDesc(32,32,0,0, cudaChannelFormatKindSigned), Xdim, Ydim, pitch_bytes );
    int pitch = pitch_bytes/sizeof(double);
    /* BUG FIX: the kernel indexes rows with threadIdx.y, so it needs a 2D
     * launch; the original <<<1, 64>>> left j == 0 for every thread and
     * only the first row was ever produced. */
    dim3 blockPerGrid(1, 1);
    dim3 threadPerBlock(Xdim, Ydim);
    kern<<<blockPerGrid, threadPerBlock>>>(dob, pitch);
    /* kernel launches fail silently; surface any launch error explicitly */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    cudaMemcpy2D(hout,Xdim*sizeof(double), dob, pitch_bytes, Xdim*sizeof(double),Ydim, cudaMemcpyDeviceToHost);
    printf("\nI am Fine\n");
    for(int i = 0 ; i < Xdim ; i++){
        for(int j=0; j<Ydim; j++){
            printf("%.16f\t", hout[i][j]);
        }
        printf("\n");
    }
    cudaUnbindTexture(me_texture);
    cudaFree(dbuf);
    cudaFree(dob);
    return 0;
}

The above code works fine if you change the following things:
Replace
kern<<<1, 64>>>(..., ..)
to
dim3 blockPerGrid(1, 1)
dim3 threadPerBlock(8, 8)
kern<<<blockPerGrid, threadPerBlock>>>(....)
and in the kernel's output indexing, replace Xdim with pitch:
o[j*pitch + i] = A[threadIdx.x][threadIdx.y];
And change cudaFilterModeLinear to cudaFilterModePoint .
For compilation you need to specify the compute capability; if your compute capability is, for example, 3.0, then it would be:
nvcc -arch=sm_30 file.cu

If your code contained error checking, you would realise that your kernel launch is failing with an invalid filter mode. It isn't legal in CUDA to use a cudaFilterModeLinear with non-float types, so nothing is actually running. If you change the filter mode to cudaFilterModePoint, you might find things start working.

Related

Function e^x returns 'inf' as output when x is larger than 0.6

I tried to implement this code and it works up to a certain point (x<0.6). I am just wondering why it outputs 'inf' although the stop criterion should terminate the program when it reaches the maximum accuracy of double.
#include <stdio.h>
#include <math.h>
/* Returns n! (factorial of n) for n >= 0.
 * BUG FIX: the original accumulated in an int, which overflows at n = 13
 * and eventually collapses to 0 (at n = 34), making the caller's division
 * produce inf. Accumulating in a double keeps the value finite and
 * monotonically growing, so the series in func_e() converges. */
double fak(int n) {
    double f = 1.0;        /* double accumulator: no wraparound */
    int i = 0;
    do {
        i++;
        f *= i;
    } while (i < n);       /* do-while: fak(0) still yields 1 */
    return f;
}
/* Approximates e via the Taylor series sum_{k>=0} x^k / k! with x = 1.
 * Iteration stops as soon as adding the next term no longer changes the
 * double-precision partial sum. */
double func_e() {
    double x = 1;
    double total = 0;
    double previous;
    double k = 0;
    do {
        previous = total;
        total += pow(x, k) / fak(k);
        k++;
    } while (total != previous);   /* converged: term underflowed the sum */
    return total;
}
/* Prints the series approximation of e computed by func_e(). */
int main(void) {
//printf("power %f", pow(3,3));
printf("%f", func_e());
//printf("%f", fak(3));
printf("\n");
return 0;
}
Check the return value of your function fak. It will overflow and at a certain point return 0. The division by 0.0 results in inf.
When I modify function fak as
/* Instrumented copy of fak() that prints every computed value; the output
 * listed below shows the int accumulator overflowing from n = 13 onward
 * and collapsing to 0 at n = 34. */
double fak(int n) {
int f = 1;
int i = 0;
do {
i++;
f *= i;
} while(i<n);
printf("fak(%d) = %d\n", n, f);
return f;
}
and run it on https://onlinegdb.com/ZxaXfI5xcG, the output is
fak(0) = 1
fak(1) = 1
fak(2) = 2
fak(3) = 6
fak(4) = 24
fak(5) = 120
fak(6) = 720
fak(7) = 5040
fak(8) = 40320
fak(9) = 362880
fak(10) = 3628800
fak(11) = 39916800
fak(12) = 479001600
fak(13) = 1932053504
fak(14) = 1278945280
fak(15) = 2004310016
fak(16) = 2004189184
fak(17) = -288522240
fak(18) = -898433024
fak(19) = 109641728
fak(20) = -2102132736
fak(21) = -1195114496
fak(22) = -522715136
fak(23) = 862453760
fak(24) = -775946240
fak(25) = 2076180480
fak(26) = -1853882368
fak(27) = 1484783616
fak(28) = -1375731712
fak(29) = -1241513984
fak(30) = 1409286144
fak(31) = 738197504
fak(32) = -2147483648
fak(33) = -2147483648
fak(34) = 0
fak(35) = 0
inf
This means your loop ends when both res and res_old have the value inf.
Additional remark:
In func_e you use double k; and pass this to double fak(int n) which converts the value to int. Function fak does the calculation in int and implicitly converts the result to double in the return statement.
I suggest to avoid these conversions. (Or at least think about the possible problems.) The compiler may warn about this if you enable all warnings.

How to reference a previous iteration in a for loop?

I'm writing some code to draw two 8-pixel-long lines on an LCD end to end. I would like to do this using a for loop, however I am stuck working out how to connect the start of the second to the end of the first. The following code produces the pattern I am after, however it is very repetitive when doing many lines:
/* Draws two 8-pixel line segments end to end: the first vertical
 * (angle = PI/2), the second rotated by angle/4.5 and starting where
 * the first ends. Relies on draw_line() and FG_COLOUR from elsewhere. */
void draw_road(){
double angle = PI/2;
double length = 8;
int starting_x = 24;
int starting_y = 48;
double x1a = starting_x;
double y1a = starting_y;
double x2a = x1a + (cos(angle) * length);
double y2a = y1a - (sin(angle) * length);
draw_line(x1a, y1a, x2a, y2a, FG_COLOUR);
/* second segment begins at the endpoint of the first */
double x1b = x2a ;
double y1b = y2a;
double x2b = x1b + (cos(angle-(angle/4.5)) * length);
double y2b = y1b - (sin(angle-(angle/4.5)) * length);
draw_line(x1b, y1b, x2b, y2b, FG_COLOUR);
}
I have tried the code below, however I don't think it knows where to look for [i-1].
/* Attempted loop version of draw_road(); kept as in the question to
 * illustrate the bug the answer below addresses. */
void draw_road(){
double angle = PI/2;
double length = 8;
int starting_x = 24;
int starting_y = 48;
double x1[2];
double y1[2];
double x2[2];
double y2[2];
for (int i = 0; i < 2; i++){
/* BUG: on the first iteration i == 0, so x2[i-1] / y2[i-1] read one
 * element before the start of the arrays — undefined behavior; there is
 * no "previous" endpoint yet on iteration 0 (see answer below). */
x1[i] = starting_x + x2[i-1];
y1[i] = starting_y + y2[i-1];
x2[i] = x1[i] + (cos(angle) * length);
y2[i] = y1[i] - (sin(angle) * length);
draw_line(x1[i], y1[i], x2[i], y2[i], FG_COLOUR);
angle /= 2;
}
}
How can I correct this so the for loop knows the values of the last loop (especially if it is the very first loop)?
In the first iteration, you don't have a "previous" position; So there is no line to draw but just to declare the starting point.
An if around the call to draw and conditional operators for distinguishing between "setting a starting point" and "calculating the next point" could do the job:
for (int i = 0; i < 2; i++){
x1[i] = (i>0) ? starting_x + x2[i-1] : starting_x;
...
if (i>0) {
drawLine(...)
}
}

Erroneous array reading inside a thread

I Have a multi-threaded C program where I have 4 threads doing some arithmetic computations using some global arrays. here is example of the code.
__m256 *array_1;
__m256 *array_2;
__m256 *array_3;
#define ALIGNMENT 32
#define SIMD_STEP 8
/* Allocates and fills the three global twiddle/factor arrays.
 * Sizes are in __m256 vectors: array_1 holds 32 vectors (256 floats),
 * array_2 holds 4 (32 floats), array_3 holds 2 (16 floats).
 * NOTE(review): posix_memalign return values are not checked. */
void Init_arrays()
{
int i;
posix_memalign((void**) &array_1, ALIGNMENT, 32*sizeof(__m256));
posix_memalign((void**) &array_2, ALIGNMENT, 4 *sizeof(__m256));
posix_memalign((void**) &array_3, ALIGNMENT, 2 *sizeof(__m256));
for(i=0;i < 256; i+= SIMD_STEP)
{
// Filling array for the 1st stage
}
for(i=0;i < 64; i+= SIMD_STEP)
{
// Filling array for the 2nd stage
}
for(i=0;i < 16; i+= SIMD_STEP)
{
// Filling array for the 3rd stage
}
}
/* Worker thread body: processes the slice [start, stop) of the input in
 * strides of 8 floats. vec_a/vec_b/vec_c and T_fac1..3 are not declared
 * in this snippet — presumably globals; verify in the full source. */
void *routine(void *thread_info)
{
int n;
unsigned t_start,t_stop;
unsigned ind1, ind2, ind3;   /* NOTE(review): ind3 is never used */
float *arr_in , *arr_out;
struct thread_data *mydata;
mydata = (struct thread_data*) thread_info;
t_start = mydata->start;
t_stop = mydata->stop;
arr_in = mydata->input;
arr_out = mydata->output;
for (n = t_start; n < t_stop; n += 8)
{
ind1 = 256 + n;
ind2 = 512 + n;
vec_a = _mm256_load_ps((float *) (&arr_in[n ]) );
vec_b = _mm256_load_ps((float *) (&arr_in[ind1]) );
vec_c = _mm256_load_ps((float *) (&arr_in[ind2]) );
/* BUG: n is a FLOAT-element index (up to 255), but array_1/2/3 are
 * arrays of __m256 holding only 32, 4 and 2 vectors respectively —
 * these reads run far past the allocations, which explains the garbage
 * values observed inside the thread (see answer below). */
T_fac1 = array_1[n];
T_fac2 = array_2[n];
T_fac3 = array_3[n];
// print data 'printf()'
// further computations
_mm256_store_ps((float *) (&arr_out[n ]), (vec_a) );
_mm256_store_ps((float *) (&arr_out[ind1]), (vec_b) );
_mm256_store_ps((float *) (&arr_out[ind2]), (vec_c) );
}
pthread_exit(NULL);
}
/* Spawns one worker per 64-float quarter of the 256-float range, then
 * joins them all. threads[], thread_data_array[], QUARTER and NUM_THREADS
 * are declared elsewhere in the full source.
 * NOTE(review): pthread_create's return value is not checked. */
void foo(float* in,float* out)
{
unsigned t,i=0;
for(t=0;t<256;t+=64)
{
thread_data_array[i].start = t;
thread_data_array[i].stop = t+QUARTER;
thread_data_array[i].input = in;
thread_data_array[i].output = out;
pthread_create(&threads[i],NULL,routine,(void*)&thread_data_array[i]);
i++;
}
for(i=0; i<NUM_THREADS; i++)
{
int rc = pthread_join(threads[i], NULL);
if (rc)
{
fprintf(stderr, "failed to join thread #%u - %s\n",i, strerror(rc));
}
}
}
/* Driver: allocates two aligned 1024-float buffers, initializes the
 * factor arrays and runs the threaded computation.
 * NOTE(review): Load_inputs(reals, imags) references names not shown in
 * this snippet (presumably the data1/data2 buffers under other names) —
 * confirm against the full source. data1/data2 are never freed. */
int main()
{
float *data1;
float *data2;
posix_memalign((void**)&data1, 32, 1024 * sizeof(float));
posix_memalign((void**)&data2, 32, 1024 * sizeof(float));
Load_inputs(reals,imags);//load data into the two arrays
Init_arrays();
// print data 'printf()'
foo(data1,data2);
return EXIT_SUCCESS;
}
For some reason, reading from array_1 (for example) doesn't work as it should inside the thread, and I don't know the reason behind it. Here is a display of array_1 as it should be:
Display from the main Display from the thread
RE = 1.000000 IM = -0.000000 RE = 1.000000 IM = -0.000000
RE = 0.999981 IM = -0.006136 RE = 0.399624 IM = 0.671559
RE = 0.999925 IM = -0.012272 RE = 0.416430 IM = 0.634393
RE = 0.999831 IM = -0.018407 RE = 0.433094 IM = 0.595699
RE = 0.999699 IM = -0.024541 RE = 0.449612 IM = 0.555570
RE = 0.999529 IM = -0.030675 RE = 0.465977 IM = 0.514103
RE = 0.999322 IM = -0.036807 RE = 0.482184 IM = 0.471397
RE = 0.999078 IM = -0.042938 RE = 0.498228 IM = 0.427555
RE = 0.998795 IM = -0.049068 // the same
RE = 0.998476 IM = -0.055195 // the same
RE = 0.998118 IM = -0.061321 // the same
RE = 0.997723 IM = -0.067444 // the same
RE = 0.997290 IM = -0.073565 // the same
RE = 0.996820 IM = -0.079682 // the same
RE = 0.996313 IM = -0.085797 // the same
RE = 0.995767 IM = -0.091909 // the same
Does somebody know the reason behind these erroneous results?
Given
__m256 *array_1;
__m256 *array_2;
__m256 *array_3;
#define ALIGNMENT 32
#define SIMD_STEP 8
void Init_arrays()
{
int i;
posix_memalign((void**) &array_1, ALIGNMENT, 32*sizeof(__m256));
posix_memalign((void**) &array_2, ALIGNMENT, 4 *sizeof(__m256));
posix_memalign((void**) &array_3, ALIGNMENT, 2 *sizeof(__m256));
.
.
.
This loop references array elements that are way out of range:
for (n = t_start; n < t_stop; n += 8)
{
.
.
.
T_fac1 = array_1[n];
T_fac2 = array_2[n];
T_fac3 = array_3[n];
array_3 has all of two members: 2 *sizeof(__m256) yet the index is incremented by eight?

8x8 float32_t Matrix multiplication using ARM NEON is slower?

I'm wondering what intrinsics make the SIMD slower than normal matrix multiplication and what should I do to make the multiplication of large matrix faster using SIMD. Here we have matrixA[8][8], matrixB[8][8] and result matrixC[8][8]. Because the maximum number of elements for float32_t is 4, so I did 2 vmul and vadd, which seem to be quite not optimized. I work on ARMv7-A Cortex A8.
/* C = A * B for the global 8x8 float matrices using NEON intrinsics.
 * vld2q_f32 loads each row of B de-interleaved into even/odd element
 * vectors (val[0]/val[1]); the matching vst2q_f32 re-interleaves the
 * result row, so the split layout cancels out in the output. */
void matrix_mult_neon (void)
{
int i;
float32x4x2_t vectB1, vectB2, vectB3, vectB4, vectB5, vectB6, vectB7, vectB8;
vectB1 = vld2q_f32(matrixB[0]);
vectB2 = vld2q_f32(matrixB[1]);
vectB3 = vld2q_f32(matrixB[2]);
vectB4 = vld2q_f32(matrixB[3]);
vectB5 = vld2q_f32(matrixB[4]);
vectB6 = vld2q_f32(matrixB[5]);
vectB7 = vld2q_f32(matrixB[6]);
vectB8 = vld2q_f32(matrixB[7]);
float32x4x2_t vectT1, vectT2, vectT3, vectT4, vectT5, vectT6, vectT7, vectT8;
for (i = 0; i < 8; i++)
{
/* scale row k of B by scalar A[i][k] (vector-by-scalar multiplies) */
vectT1.val[0] = vmulq_n_f32(vectB1.val[0], matrixA[i][0]);
vectT1.val[1] = vmulq_n_f32(vectB1.val[1], matrixA[i][0]);
vectT2.val[0] = vmulq_n_f32(vectB2.val[0], matrixA[i][1]);
vectT2.val[1] = vmulq_n_f32(vectB2.val[1], matrixA[i][1]);
vectT3.val[0] = vmulq_n_f32(vectB3.val[0], matrixA[i][2]);
vectT3.val[1] = vmulq_n_f32(vectB3.val[1], matrixA[i][2]);
vectT4.val[0] = vmulq_n_f32(vectB4.val[0], matrixA[i][3]);
vectT4.val[1] = vmulq_n_f32(vectB4.val[1], matrixA[i][3]);
vectT5.val[0] = vmulq_n_f32(vectB5.val[0], matrixA[i][4]);
vectT5.val[1] = vmulq_n_f32(vectB5.val[1], matrixA[i][4]);
vectT6.val[0] = vmulq_n_f32(vectB6.val[0], matrixA[i][5]);
vectT6.val[1] = vmulq_n_f32(vectB6.val[1], matrixA[i][5]);
vectT7.val[0] = vmulq_n_f32(vectB7.val[0], matrixA[i][6]);
vectT7.val[1] = vmulq_n_f32(vectB7.val[1], matrixA[i][6]);
vectT8.val[0] = vmulq_n_f32(vectB8.val[0], matrixA[i][7]);
vectT8.val[1] = vmulq_n_f32(vectB8.val[1], matrixA[i][7]);
/* sum the eight scaled rows; vectT1 accumulates result row i */
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT2.val[0]);
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT3.val[0]);
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT4.val[0]);
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT5.val[0]);
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT6.val[0]);
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT7.val[0]);
vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT8.val[0]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT2.val[1]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT3.val[1]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT4.val[1]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT5.val[1]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT6.val[1]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT7.val[1]);
vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT8.val[1]);
vst2q_f32(matrixC_neon[i], vectT1);
}
}
My normal matrix multiplication function:
/* Reference scalar implementation: C = A * B for the fixed 8x8 float
 * matrices held in the globals matrixA, matrixB and matrixC. */
void matrix_mult (void)
{
    int row, col, k;
    for (row = 0; row < 8; row++)
    {
        for (col = 0; col < 8; col++)
        {
            /* dot product of row 'row' of A with column 'col' of B */
            float acc = 0;
            for (k = 0; k < 8; k++)
            {
                acc = acc + matrixA[row][k] * matrixB[k][col];
            }
            matrixC[row][col] = acc;
        }
    }
}
I use gettimeofday() function in the library <sys/time.h> to calculate time in nanoseconds.
The Problem:
aarch32 has a NEON register bank of the size 256bytes total
A 8x8 float matrix is already 256bytes large, and you need three of them. (768)
You have to read the matrix B "vertically", which means it's physically impossible to do it the "streaming" way for maximum data locality.
You do vector-scalar multiply which takes four times as much total than vector-vector multiplication.
You load Mat A via VFP. And VFP on the Cortex-A8 particularly is unbelievably slow, in addition to the NEON<->VFP switching overhead. Unlike auto-vectorization, intrinsics do pretty much everything the way you tell them to. And you gave the wrong instruction.
The Solution:
We transpose matrix B and do dot-product math line by line.
I hope the code below works for you, and if performance is crucial, consider writing in assembly since compilers aren't very trustworthy when it comes to NEON performance, even in intrinsics.
/* 8-element dot product of two de-interleaved float32x4x2_t vectors.
 * After the two pairwise adds, LANE 0 of the returned float32x2_t holds
 * the full sum (lane 1 still holds a partial) — callers must read only
 * lane 0, which matMulF_neon does via vst4_lane_f32(..., 0). */
static __always_inline float32x2_t dotProduct(float32x4x2_t input1, float32x4x2_t input2)
{
float32x2_t d0, d1;
float32x4_t q0;
input1.val[0] = vmulq_f32(input1.val[0], input2.val[0]);
input1.val[1] = vmulq_f32(input1.val[1], input2.val[1]);
q0 = vaddq_f32(input1.val[0], input1.val[1]);
d0 = vget_low_f32(q0);
d1 = vget_high_f32(q0);
d0 = vpadd_f32(d0, d1);
d0 = vpadd_f32(d0, d1);
return d0;
}
/* C = A * B for 8x8 float matrices: transposes B once up front, then
 * forms each output element as a dot product of a row of A with a row of
 * the transposed B (see dotProduct above).
 * NOTE(review): pMatB++ / pDst++ advance by ONE float while vld4q_f32
 * consumes 16 floats and vst4_lane_f32 writes lane data — confirm the
 * intended strides against a working build.
 * NOTE(review): in C intrinsics vuzpq_f32 RETURNS its result; the calls
 * below discard it, so the transpose lines look like no-ops — verify
 * against the generated assembly. */
void matMulF_neon(float *pDst, float *pMatA, float *pMatB)
{
float32x4x4_t line01, line23, line45, line67;
float32x4x2_t b[8], *pA, *pB, temp;
float32x2x4_t result;
uint32_t i;
// vld4 for easier transpose
line01 = vld4q_f32(pMatB++);
line23 = vld4q_f32(pMatB++);
line45 = vld4q_f32(pMatB++);
line67 = vld4q_f32(pMatB);
// transpose MatB
vuzpq_f32(line01.val[0], line45.val[0]);
vuzpq_f32(line01.val[1], line45.val[1]);
vuzpq_f32(line01.val[2], line45.val[2]);
vuzpq_f32(line01.val[3], line45.val[3]);
vuzpq_f32(line23.val[0], line67.val[0]);
vuzpq_f32(line23.val[1], line67.val[1]);
vuzpq_f32(line23.val[2], line67.val[2]);
vuzpq_f32(line23.val[3], line67.val[3]);
// store MatB to stack
b[0].val[0] = line01.val[0];
b[0].val[1] = line01.val[1];
b[1].val[0] = line01.val[2];
b[1].val[1] = line01.val[3];
b[2].val[0] = line23.val[0];
b[2].val[1] = line23.val[1];
b[3].val[0] = line23.val[2];
b[3].val[1] = line23.val[3];
b[4].val[0] = line45.val[0];
b[4].val[1] = line45.val[1];
b[5].val[0] = line45.val[2];
b[5].val[1] = line45.val[3];
b[6].val[0] = line67.val[0];
b[6].val[1] = line67.val[1];
b[7].val[0] = line67.val[2];
b[7].val[1] = line67.val[3];
pA = (float32x4x2_t *) pMatA;
i = 8;
do
{
// just the right amount of data for aarch32 NEON register bank size
pB = b;
temp = *pA++;
result.val[0] = dotProduct(*pB++, temp);
result.val[1] = dotProduct(*pB++, temp);
result.val[2] = dotProduct(*pB++, temp);
result.val[3] = dotProduct(*pB++, temp);
vst4_lane_f32(pDst++, result, 0);
result.val[0] = dotProduct(*pB++, temp);
result.val[1] = dotProduct(*pB++, temp);
result.val[2] = dotProduct(*pB++, temp);
result.val[3] = dotProduct(*pB, temp);
vst4_lane_f32(pDst++, result, 0);
} while (--i);
}
/////////////////////////// EDIT
I checked the disassembly and the generated code is FUBAR. (Linaro GCC 7.1.1)
I'd go the assembly route. Writing NEON codes in intrinsics is pure waste of time IMO.

How to cope with WebGL missing glBlendEquation(GL_MAX)

Here's my current C code that does what I'd like to do, but it relies on glBlendEquation(GL_MAX) which is unavailable in WebGL. What I want is to render a wiggly fuzzy line. I could use a Gaussian blur but it would have to have a VERY large radius (16 pixels) and I expect it would be REALLY slow.
Note: I've removed some GL state management code and a couple of other things for clarity, but the code should work as is.
Existing C code:
static const char *pnt_vtx_shader =
"#version 110\n"
"varying vec2 uv;\n"
"void main() {\n"
" uv = (gl_MultiTexCoord0.st - 1.0f);\n"
" gl_Position = gl_Vertex;\n"
"}";
static const char *pnt_shader_src =
"#version 110\n"
"varying vec2 uv;\n"
"void main() {\n"
" gl_FragColor = vec4(exp(-4.5f*0.5f*log2(dot(uv,uv)+1.0f)));\n"
"}";
GLuint shader_prog ;
int samp;
float pw, ph;
float sco_verts[128*8*4];
int sco_ind[128*3*6];
/* One-time setup: computes the half-width/half-height of the line quads
 * in NDC from the framebuffer size, compiles the point shader, and fills
 * the static texture coordinates and triangle indices for 'num_samp'
 * segments (3 quads / 6 triangles per segment). */
void init(int width, int height, int num_samp)
{
/* quad half-extent: at least 1/48 NDC, or 4 framebuffer pixels */
pw = 0.5f*fmaxf(1.0f/24, 8.0f/width), ph = 0.5f*fmaxf(1.0f/24, 8.0f/height);
samp = num_samp;
// helper function, compiles and links the shader, prints out any errors
shader_prog = compile_program(pnt_vtx_shader, pnt_shader_src);
/* per-segment texture coordinates (positions are filled each frame in
 * render_scope); the shader subtracts 1 so these span [-1,1] */
for(int i=0; i<samp; i++) {
sco_verts[(i*8+0)*4+0] = 0; sco_verts[(i*8+0)*4+1] = 2;
sco_verts[(i*8+1)*4+0] = 0; sco_verts[(i*8+1)*4+1] = 0;
sco_verts[(i*8+2)*4+0] = 1; sco_verts[(i*8+2)*4+1] = 2;
sco_verts[(i*8+3)*4+0] = 1; sco_verts[(i*8+3)*4+1] = 0;
sco_verts[(i*8+4)*4+0] = 1; sco_verts[(i*8+4)*4+1] = 2;
sco_verts[(i*8+5)*4+0] = 1; sco_verts[(i*8+5)*4+1] = 0;
sco_verts[(i*8+6)*4+0] = 2; sco_verts[(i*8+6)*4+1] = 2;
sco_verts[(i*8+7)*4+0] = 2; sco_verts[(i*8+7)*4+1] = 0;
}
/* index buffer: 6 triangles per segment over the 8 vertices above */
for(int i=0; i<samp; i++) {
sco_ind[(i*6+0)*3+0] = i*8+0; sco_ind[(i*6+0)*3+1] = i*8+1; sco_ind[(i*6+0)*3+2] = i*8+3;
sco_ind[(i*6+1)*3+0] = i*8+0; sco_ind[(i*6+1)*3+1] = i*8+3; sco_ind[(i*6+1)*3+2] = i*8+2;
sco_ind[(i*6+2)*3+0] = i*8+2; sco_ind[(i*6+2)*3+1] = i*8+4; sco_ind[(i*6+2)*3+2] = i*8+5;
sco_ind[(i*6+3)*3+0] = i*8+2; sco_ind[(i*6+3)*3+1] = i*8+5; sco_ind[(i*6+3)*3+2] = i*8+3;
sco_ind[(i*6+4)*3+0] = i*8+4; sco_ind[(i*6+4)*3+1] = i*8+6; sco_ind[(i*6+4)*3+2] = i*8+7;
sco_ind[(i*6+5)*3+0] = i*8+4; sco_ind[(i*6+5)*3+1] = i*8+7; sco_ind[(i*6+5)*3+2] = i*8+5;
}
}
/* getsamp: windowed average of data[i-w, i+w) with the window clamped to
 * [0, len). The divisor stays 2*w even when the window is clipped, so
 * samples outside the array are implicitly treated as zero — presumably
 * intentional edge fading; confirm before changing.
 * Cleanup vs. original: removed the unused local 'err' and renamed the
 * loop index so it no longer shadows the parameter i. */
static float getsamp(const float *data, int len, int i, int w) {
	float sum = 0;
	int l = IMAX(i-w, 0);
	int u = IMIN(i+w, len);
	for(int k = l; k < u; k++)
		sum += data[k];
	return sum / (2*w);
}
// R holds a rotation matrix... it's the transpose of what you would give GL though
// because of reasons :P (I wrote code that did all the stuff from this program in
// software first and the GL version shares a bunch of code with that one)
// data is audio samples, [-1, 1], the length of the array is in len
/* Builds a screen-space quad strip along the rotated/projected waveform
 * (three quads per segment: cap / body / cap, sized by pw/ph from init),
 * then draws it with MAX blending so overlapping fuzzy segments merge. */
void render_scope(float R[3][3], const float *data, int len)
{
// do the rotate/project ourselves because the GL matrix won't do the right
// thing if we just send it our verticies, we want wour tris to always be
// parrallel to the view plane, because we're actually drawing a fuzzy line
// not a 3D object
// also it makes it easier to match the software implementation
float px, py;   /* projected endpoint of the previous segment */
{
/* first sample: log-compress amplitude, rotate, perspective-project */
float s = getsamp(data, len, 0, len/96);
s=copysignf(log2f(fabsf(s)*3+1)/2, s);
float xt = -0.5f, yt = 0.2f*s, zt = 0.0f;
float x = R[0][0]*xt + R[1][0]*yt + R[2][0]*zt;
float y = R[0][1]*xt + R[1][1]*yt + R[2][1]*zt;
float z = R[0][2]*xt + R[1][2]*yt + R[2][2]*zt;
const float zvd = 1/(z+2);
px=x*zvd*4/3; py=y*zvd*4/3;
}
for(int i=0; i<samp; i++) {
float s = getsamp(data, len, (i+1)*len/(samp), len/96);
s=copysignf(log2f(fabsf(s)*3+1)/2, s);
float xt = (i+1 - (samp)/2.0f)*(1.0f/(samp)), yt = 0.2f*s, zt = 0.0f;
float x = R[0][0]*xt + R[1][0]*yt + R[2][0]*zt;
float y = R[0][1]*xt + R[1][1]*yt + R[2][1]*zt;
float z = R[0][2]*xt + R[1][2]*yt + R[2][2]*zt;
const float zvd = 1/(z+2);
x=x*zvd*4/3; y=y*zvd*4/3;
/* unit tangent (tx,ty) and normal (nx,ny) scaled to the quad size */
const float dx=x-px, dy=y-py;
const float d = 1/hypotf(dx, dy);
const float tx=dx*d*pw, ty=dy*d*pw;
const float nx=-dy*d*pw, ny=dx*d*ph;
/* 8 vertices: start cap, segment body, end cap (x/y go in slots 2,3) */
sco_verts[(i*8+0)*4+2] = px-nx-tx; sco_verts[(i*8+0)*4+3] = py-ny-ty;
sco_verts[(i*8+1)*4+2] = px+nx-tx; sco_verts[(i*8+1)*4+3] = py+ny-ty;
sco_verts[(i*8+2)*4+2] = px-nx ; sco_verts[(i*8+2)*4+3] = py-ny;
sco_verts[(i*8+3)*4+2] = px+nx ; sco_verts[(i*8+3)*4+3] = py+ny;
sco_verts[(i*8+4)*4+2] = x-nx ; sco_verts[(i*8+4)*4+3] = y-ny;
sco_verts[(i*8+5)*4+2] = x+nx ; sco_verts[(i*8+5)*4+3] = y+ny;
sco_verts[(i*8+6)*4+2] = x-nx+tx; sco_verts[(i*8+6)*4+3] = y-ny+ty;
sco_verts[(i*8+7)*4+2] = x+nx+tx; sco_verts[(i*8+7)*4+3] = y+ny+ty;
px=x,py=y;
}
glEnable(GL_BLEND);
/* GL_MAX blending is the piece WebGL lacks (see question text) */
glBlendEquation(GL_MAX);
glUseProgram(shader_prog);
glColor4f(1.0f, 1.0f, 1.0f, 1.0f);
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
glTexCoordPointer(2, GL_FLOAT, sizeof(float)*4, sco_verts);
glVertexPointer(2, GL_FLOAT, sizeof(float)*4, sco_verts + 2);
glDrawElements(GL_TRIANGLES, samp*3*6, GL_UNSIGNED_INT, sco_ind);
}
Here's a screenshot from a test app, I'm not sure the line width is right in this screen shot... but meh it gives the idea, also I'd be using way more points so the lines would be smoother.

Resources