So, I wrote a little code to display to draw circles. It kinda works, but in a really janky way. This is the output of running the program and inputting 6:
O O
O O
O
O
O
O
O
O
O
O O
O O
OOOOOOO
Not looking great.
#include <stdio.h>
#include <math.h>
int main()
{
int num, dist, x, y;
printf("Enter the Radius of the Circle: ");
scanf_s("%d", &num);
{
for (x = 1; x <= num * 2; x++)
{
for (y = 1; y <= num * 2; y++)
{
dist = sqrt((x - num) * (x - num) + (y - num) * (y - num));
{
if (dist==num)
{
printf("O");
}
else
{
printf(" ");
}
}
}
printf("\n");
}
}
return 0;
}
This is the code, searched around the web for answers, it might be the placement of the "new line" code. Tried different spots, nope.
Your two for loop should start from 0:
for (x = 0; x <= num * 2; x++)
{
for (y = 0; y <= num * 2; y++)
{
dist = sqrt((x - num) * (x - num) + (y - num) * (y - num));
{
if (dist==num)
{
printf("O");
}
else
{
printf(" ");
}
}
}
printf("\n");
}
You are probably missing the first column and the first row.
I wrote one once, but mine just uses summations and subtractions, no multiplications (except one square, at the beginning) using The Bresenham Algorithm. Here is the code. (you can get it also from github)
/* bresenham.c -- program to draw ASCII circles with Bresenham's algorithm.
* Author: Luis Colorado <luicoloradourcola#gmail.com>
* Date: Wed Jan 11 10:50:17 EET 2017
* Disclaimer: (C) 2017 LUIS COLORADO. ALL RIGHTS RESERVED.
* BSD 3-Clause License
*
* Copyright (c) 2017, Luis Colorado
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <errno.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#define FL_FILL (1 << 0)
#define FL_TRACE (1 << 1)
#define F(x) "%s:%d:%s: " x, __FILE__, __LINE__, __func__
/* draws a horizontal line at row y, between x1 and x2 */
static void hline(int x1, int x2, int y)
{
static char theLine[] =
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************"
"*****************************************";
#define THE_LINE_LENGTH (sizeof theLine - 1)
int count = (x2 - x1 + 1) << 1;
printf("\033[%d;%dH", y, x1<<1);
while (count >= THE_LINE_LENGTH) {
fputs(theLine, stdout);
count -= THE_LINE_LENGTH;
} /* while */
printf("%.*s", count, theLine);
} /* hline */
/* draws a dot at coordinates (x, y) */
static void dot(int x, int y)
{
printf("\033[%d;%dH*", y, x<<1);
} /* dot */
/* draws a circle of radius r and center at (cx, cy) */
void bh(int r, int cx, int cy, int flags)
{
int r2 = r*r + r;
int x = 0, x2 = 0, dx2 = 1;
int y = r, y2 = y*y, dy2 = 2*y - 1;
int sum = r2;
while(x <= y) {
if (flags & FL_TRACE) {
printf(F("x=%3d, x2=%5d, dx2=%3d, y=%3d, y2=%5d, dy2=%3d, sum=%5d\n"),
x, x2, dx2, y, y2, dy2, sum);
} else {
if (flags & FL_FILL) {
hline(cx - y, cx + y, cy + x);
if (x) hline(cx - y, cx + y, cy - x);
} else {
dot(cx - y, cy + x); if (y) dot(cx + y, cy + x);
if (x) { dot(cx - y, cy - x); if (y) dot(cx + y, cy - x); }
if (x != y) {
dot(cx - x, cy - y); if (x) dot(cx + x, cy - y);
if (y) { dot(cx - x, cy + y); if (x) dot(cx + x, cy + y); }
}
} /* if */
} /* if */
sum -= dx2;
if (sum <= y2) {
if (!(flags & FL_TRACE) && (flags & FL_FILL) && (x != y)) {
hline(cx - x, cx + x, cy - y);
if (y) hline(cx - x, cx + x, cy + y);
} /* if */
y--; y2 -= dy2; dy2 -= 2;
} /* if */
x++;
x2 += dx2;
dx2 += 2;
} /* while */
} /* bh */
int main(int argc, char **argv)
{
int i;
char *cols = getenv("COLUMNS");
char *lines = getenv("LINES");
int cx, cy;
int opt;
int flags = 0;
while ((opt = getopt(argc, argv, "fv")) != EOF) {
switch(opt) {
case 'f': flags |= FL_FILL; break;
case 'v': flags |= FL_TRACE; break;
} /* switch */
} /* while */
argc -= optind;
argv += optind;
if (cols && lines) {
cx = atoi(cols);
cy = atoi(lines);
} else { /* try to get from tty */
struct winsize win;
int res = ioctl(0, TIOCGWINSZ, &win);
if (res == 0) {
cx = win.ws_col;
cy = win.ws_row;
} else {
fprintf(stderr,
F("TIOCGWINSZ: %s (errno=%d)\n"),
strerror(errno), errno);
cx = 80; cy = 24;
} /* if */
} /* if */
/* center coordinates */
cx /= 4;
cy /= 2;
if (!(flags & FL_TRACE))
fputs("\033[2J", stdout);
for (i = 0; i < argc; i++) {
bh(atoi(argv[i]), cx, cy, flags);
} /* for */
if (!(flags & FL_TRACE)) {
puts("");
fflush(stdout);
} /* if */
} /* main */
it draws amazing circles like this:
$ bresenham 5 10
* * * * * * *
* * * *
* *
* *
$ _ * *
* * * * * * *
* * * *
* * * *
* * * *
* * * *
* * * *
* * * *
* * * *
* * * *
* * * *
* * * * * * *
* *
* *
* *
* * * *
* * * * * * *
Related
I'm working on calculation of Legendre Polynomial on GPU.
Briefly, Recursive Legendre Polynomial is computing the n-th order by (n-1)th and (n-2)th order. We divide the x into k (let's say k=23) parts to compute polynomial and do a summation, which would be more precise.
So my kernel goes below.
First, we create a k * width array.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float delta = 2. / width;
if ((row < d_k) && (col < width))
kXList[row * width + col] = -1.f + (col * d_k + row + 1.f) * delta / (float)d_k;
And 1st order and 2nd order, kXList_2 is the first, kXList_1 is the second.
kXList_1[row * width + col] = kXList[row * width + col];
kXList_2[row * width + col] = 1.f;
Do summation over columns and saving it into d_xLegendreP.
if (row == 0) {
float row_0 = 0.f;
float row_1 = 0.f;
for (int h = 0; h < d_k; ++h) {
row_0 += kXList_2[h * width + col];
row_1 += kXList_1[h * width + col];
}
d_xLegendreP[0 * width + col] = row_0;
d_xLegendreP[1 * width + col] = row_1;
}
recusive calculation of rest order.
float kX_2 = kXList_2[row * width + col];
float kX_1 = kXList_1[row * width + col];
float kX = kXList[row * width + col];
float row_n;
for (int n = 2; n <= order; n++) {
kXList_temp[row * width + col] = ((2.f * n - 1.f) * kX * kX_1) / (float)n - (((n - 1.f) * kX_2) / (float)n);
if ((row == 0)) {
row_n = 0.f;
for (int h = 0; h < d_k; h++) {
row_n += kXList_temp[h * width + col];
}
d_xLegendreP[n * width + col] = row_n;
}
kX_2 = kX_1;
kX_1 = kXList_temp[row * width + col];
}
As has been pointed out, CUDA makes no statements about the order of thread execution. However you have a number of points in your calculation sequence where you expect a previous line of code has been completed in its entirety, across the entire grid, in order for the next section of your code to be correct.
Generally the nature of CUDA parallel thread execution means that such dependencies lead to incorrect/broken code.
I haven't tried to fully realize your algorithm in an optimal way, but to demonstrate the proof of this, I have broken up your kernel code in such a way that such dependencies are made "correct" through the use of the kernel-call boundary, which is effectively a global sync. This is probably one way to sort out your problem, as indicated in the comments.
Here's an example. I'm not going to try to detail each change, but by breaking it up this way I believe I have satisfied the dependencies expected using your approach. I have not fully verified anything, but a quick check suggests the output seems to match your matlab output:
$ cat t1820.cu
#include <stdio.h>
#include <math.h>
#include<iostream>
#include <stdlib.h>
#define BLOCKDIM_32 32
#define k 23
#define Mmax 40
#define IMG_SIZE 1024
static const long DEVICE = 0;
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void LegendreMoment1(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
float* d_xLegendreP, int width, int d_k, int order) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float delta = 2. / width;
if ((row < d_k) && (col < width)) {
kXList[row * width + col] = -1.f + (col * d_k + row + 1.f) * delta / (float)d_k;
kXList_1[row * width + col] = kXList[row * width + col];
kXList_2[row * width + col] = 1.f;
}
}
__global__ void LegendreMoment2(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
float* d_xLegendreP, int width, int d_k, int order) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if ((row < d_k) && (col < width)) {
if (row == 0) {
float row_0 = 0.f;
float row_1 = 0.f;
for (int h = 0; h < d_k; ++h) {
row_0 += kXList_2[h * width + col];
row_1 += kXList_1[h * width + col];
}
d_xLegendreP[0 * width + col] = row_0;
d_xLegendreP[1 * width + col] = row_1;
}
}
}
__global__ void LegendreMoment3(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
float* d_xLegendreP, int width, int d_k, int order, int n, float *kXList_prev) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if ((row < d_k) && (col < width)) {
float kX_2, kX_1, kX = kXList[row * width + col];
if (n == 2){
kX_2 = kXList_2[row * width + col];
kX_1 = kXList_1[row * width + col];}
if (n == 3){
kX_2 = kXList_1[row * width + col];
kX_1 = kXList_temp[row*width+col];}
if (n > 3){
kX_2 = kXList_prev[row * width + col];
kX_1 = kXList_temp[row*width+col];}
kXList_prev[row*width+col] = kX_1;
kXList_temp[row * width + col] = ((2.f * n - 1.f) * kX * kX_1) / (float)n - (((n - 1.f) * kX_2) / (float)n);
}
}
__global__ void LegendreMoment4(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
float* d_xLegendreP, int width, int d_k, int order, int n) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float row_n;
if ((row < d_k) && (col < width)) {
if ((row == 0)) {
row_n = 0.f;
for (int h = 0; h < d_k; h++) {
row_n += kXList_temp[h * width + col];
}
d_xLegendreP[n * width + col] = row_n;
}
}
}
float matlab_result[][4] = {
{23., 23., 23., 23.},
{-22.9766, -22.9316, -22.8867, -22.8418},
{22.9297, 22.7952, 22.661, 22.527},
{-22.8596, -22.5914, -22.3245, -22.059},
{22.7663, 22.3211, 21.8799, 21.4425},
{-22.6501, -21.9856, -21.3303, -20.6839},
{22.5111, 21.5864, 20.6798, 19.7912},
{-22.3496, -21.1254, -19.9335, -18.7734},
{22.166, 20.6046, 19.0967, 17.6411},
{-21.9606, -20.0265, -18.1756, -16.4058},
{21.7339, 19.3937, 17.1772, 15.0802},
{-21.4862, -18.7091, -16.1086, -13.6777},
{21.2181, 17.9757, 14.9778, 12.2124},
{-20.9301, -17.1971, -13.7931, -10.6992},
{20.6228, 16.3766, 12.563, 9.15308},
{-20.2967, -15.5179, -11.2963, -7.5893},
{19.9525, 14.625, 10.0023, 6.02321},
{-19.5909, -13.7018, -8.69016, -4.46998},
{19.2126, 12.7524, 7.36912, 2.94447},
{-18.8183, -11.781, -6.04847, -1.46107},
{18.4087, 10.792, 4.73739, 0.0335239},
{-17.9847, -9.78953, -3.44488, 1.32519},
{17.5472, 8.77808, 2.17971, -2.60304},
{-17.0968, -7.76199, -0.950332, 3.78904},
{16.6345, 6.74559, -0.235176, -4.87336},
{-16.1611, -5.7332, 1.36917, 5.84745},
{15.6776, 4.72908, -2.44452, -6.70411},
{-15.1848, -3.73739, 3.45463, 7.43756},
{14.6836, 2.7622, -4.39351, -8.04346},
{-14.1751, -1.80747, 5.25583, 8.51902},
{13.66, 0.877003, -6.03692, -8.86292},
{-13.1395, 0.0255473, 6.73284, 9.07537},
{12.6143, -0.896704, -7.34039, -9.15805},
{-12.0855, 1.73318, 7.85712, 9.11411},
{11.554, -2.53191, -8.28135, -8.94808},
{-11.0207, 3.29003, 8.61218, 8.6658},
{10.4866, -4.00492, -8.84949, -8.27433},
{-9.95254, 4.67419, 8.99391, 7.78188},
{9.41953, -5.29574, -9.04682, -7.19767},
{-8.88843, 5.8677, 9.01035, 6.53179},
{8.36015, -6.38847, -8.88731, -5.79509}
};
#define TOL 0.0001f
int main()
{
float* kXList;
float* kXList_1;
float* kXList_2;
float* kXList_temp;
float* kXList_prev;
float* d_xLegendreP;
float* xLegendreP;
int width = IMG_SIZE;
cudaEvent_t d_total_begin, d_total_end;
xLegendreP = new float[(Mmax + 1) * width];
CUDA_CHECK_RETURN(cudaSetDevice(DEVICE));
CUDA_CHECK_RETURN(cudaEventCreate(&d_total_begin));
CUDA_CHECK_RETURN(cudaEventCreate(&d_total_end));
printf("Time kernel launch...\n");
CUDA_CHECK_RETURN(cudaEventRecord(d_total_begin, 0));
printf("Allocating space on device...\n");
CUDA_CHECK_RETURN(
cudaMalloc((void**)&kXList, width * k * sizeof(float)));
CUDA_CHECK_RETURN(
cudaMalloc((void**)&kXList_temp, width * k * sizeof(float)));
CUDA_CHECK_RETURN(
cudaMalloc((void**)&kXList_prev, width * k * sizeof(float)));
CUDA_CHECK_RETURN(
cudaMalloc((void**)&kXList_1, width * k * sizeof(float)));
CUDA_CHECK_RETURN(
cudaMalloc((void**)&kXList_2, width * k * sizeof(float)));
CUDA_CHECK_RETURN(
cudaMalloc((void**)&d_xLegendreP, width * (Mmax + 1) * sizeof(float)));
printf("Copying data from host to device...\n");
dim3 grid(ceil(Mmax / 32), ceil(width / 32), 1);
dim3 block(BLOCKDIM_32, BLOCKDIM_32, 1);
printf("Launching kernel...\n");
LegendreMoment1 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
d_xLegendreP, IMG_SIZE, k, Mmax);
LegendreMoment2 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
d_xLegendreP, IMG_SIZE, k, Mmax);
for (int n = 2; n <= Mmax; n++) {
LegendreMoment3 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
d_xLegendreP, IMG_SIZE, k, Mmax, n, kXList_prev);
LegendreMoment4 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
d_xLegendreP, IMG_SIZE, k, Mmax, n);
}
CUDA_CHECK_RETURN(
cudaMemcpy(xLegendreP, d_xLegendreP, width * (Mmax + 1) * sizeof(float), cudaMemcpyDeviceToHost));
CUDA_CHECK_RETURN(cudaEventRecord(d_total_end, 0));
printf("\n");
for (int n = 0; n <= Mmax; n++)
printf("row %2d:%8.4f %8.4f %8.4f %8.4f\n", n, xLegendreP[n * width + 0],xLegendreP[n * width + 1],xLegendreP[n * width + 2],xLegendreP[n * width + 3]);
for (int i = 0; i < Mmax; i++)
for (int j = 0; j < 4; j++)
if (fabsf(xLegendreP[i*width+j] - matlab_result[i][j]) > TOL) {printf("mismatch at %d, %d\n", i, j); return 0;}
CUDA_CHECK_RETURN(cudaEventSynchronize(d_total_end));
float gpuTime = 0.0;
CUDA_CHECK_RETURN(cudaEventElapsedTime(&gpuTime, d_total_begin, d_total_end));
printf(">>>Elapsed GPU Time is : %f ms\n", gpuTime);
printf("Freeing memory on device...\n");
CUDA_CHECK_RETURN(cudaEventDestroy(d_total_begin));
CUDA_CHECK_RETURN(cudaEventDestroy(d_total_end));
CUDA_CHECK_RETURN(cudaFree(kXList));
CUDA_CHECK_RETURN(cudaFree(kXList_temp));
CUDA_CHECK_RETURN(cudaFree(kXList_1));
CUDA_CHECK_RETURN(cudaFree(kXList_2));
CUDA_CHECK_RETURN(cudaFree(d_xLegendreP));
printf("Exiting program...\n");
return 0;
}
$ nvcc -o t1820 t1820.cu
$ ./t1820
Time kernel launch...
Allocating space on device...
Copying data from host to device...
Launching kernel...
row 0: 23.0000 23.0000 23.0000 23.0000
row 1:-22.9766 -22.9316 -22.8867 -22.8418
row 2: 22.9297 22.7952 22.6610 22.5270
row 3:-22.8596 -22.5914 -22.3245 -22.0590
row 4: 22.7663 22.3211 21.8799 21.4425
row 5:-22.6501 -21.9856 -21.3303 -20.6839
row 6: 22.5111 21.5864 20.6798 19.7912
row 7:-22.3496 -21.1254 -19.9335 -18.7734
row 8: 22.1660 20.6046 19.0967 17.6411
row 9:-21.9606 -20.0265 -18.1756 -16.4058
row 10: 21.7339 19.3937 17.1772 15.0802
row 11:-21.4862 -18.7090 -16.1086 -13.6777
row 12: 21.2181 17.9757 14.9778 12.2124
row 13:-20.9301 -17.1971 -13.7931 -10.6992
row 14: 20.6228 16.3766 12.5630 9.1531
row 15:-20.2967 -15.5179 -11.2963 -7.5893
row 16: 19.9525 14.6250 10.0023 6.0232
row 17:-19.5909 -13.7018 -8.6902 -4.4700
row 18: 19.2126 12.7524 7.3691 2.9445
row 19:-18.8183 -11.7810 -6.0485 -1.4611
row 20: 18.4087 10.7920 4.7374 0.0335
row 21:-17.9848 -9.7895 -3.4449 1.3252
row 22: 17.5472 8.7781 2.1797 -2.6030
row 23:-17.0968 -7.7620 -0.9503 3.7890
row 24: 16.6345 6.7456 -0.2352 -4.8734
row 25:-16.1611 -5.7332 1.3692 5.8475
row 26: 15.6776 4.7291 -2.4445 -6.7041
row 27:-15.1848 -3.7374 3.4546 7.4376
row 28: 14.6836 2.7622 -4.3935 -8.0435
row 29:-14.1751 -1.8075 5.2558 8.5190
row 30: 13.6600 0.8770 -6.0369 -8.8629
row 31:-13.1395 0.0255 6.7328 9.0754
row 32: 12.6143 -0.8967 -7.3404 -9.1581
row 33:-12.0855 1.7332 7.8571 9.1141
row 34: 11.5540 -2.5319 -8.2813 -8.9481
row 35:-11.0207 3.2900 8.6122 8.6658
row 36: 10.4866 -4.0049 -8.8495 -8.2743
row 37: -9.9525 4.6742 8.9939 7.7819
row 38: 9.4195 -5.2957 -9.0468 -7.1977
row 39: -8.8884 5.8677 9.0103 6.5318
row 40: 8.3601 -6.3885 -8.8873 -5.7951
>>>Elapsed GPU Time is : 1.223776 ms
Freeing memory on device...
Exiting program...
$
I'm not suggesting the above code is defect-free or suitable for any particular purpose. It is mostly your code. I've made some changes to demonstrate the need for global sync that is inherent in your approach.
I am a Python developer for the most part, but recently I have needed to solve a few problems using C and, honestly, it makes me suffer.
func.c
#define _CRT_SECURE_NO_WARNINGS
#define M_PI 3.1415926535897932384626433832795028841971693993751058209
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <math.h>
#include "Point.h"
bool point_on_line(struct Point p1, struct Point p2, struct Point point) {
double min_x = min(p1.x, p2.x);
double max_x = max(p1.x, p2.x);
double min_y = min(p1.y, p2.y);
double max_y = max(p1.y, p2.y);
if (p2.y - p1.y == 0) {
return point.y == p2.y && point.x >= min_x && point.x <= max_x && point.y >= min_y && point.y <= max_y;
}
if (p2.x - p1.x == 0) {
return point.x == p2.x && point.x <= max_x && point.x >= min_x && point.y >= min_y && point.y <= max_y;
}
double k = (p2.y - p1.y) / (p2.x - p1.x);
double b = (p2.x * p1.y - p1.x * p2.y) / (p2.x - p1.x);
return point.y == (k * point.x + b) && point.x >= min_x && point.x <= max_x && point.y >= min_y && point.y <= max_y;
}
double calculate_angle(struct Point p1, struct Point p2) {
double dot_product = p1.x * p2.x + p1.y * p2.y;
double lenp1 = sqrt(p1.x * p1.x + p1.y * p1.y);
double lenp2 = sqrt(p2.x * p2.x + p2.y * p2.y);
double orientation = p1.x * p2.y - p1.y * p2.x;
int sign;
if (orientation > 0) {
sign = 1;
}
else {
sign = -1;
}
return sign * acos(dot_product / (lenp1 * lenp2));
}
bool check_border(struct Point p, struct Point points[], int size) {
for (int i = 0; i < size - 1; i++) {
if (point_on_line(points[i], points[i + 1], p)) {
return true;
}
}
return false;
}
bool calc_angle_sum(struct Point p1, struct Point points[], int size) {
struct Point* vectors = malloc(size * sizeof(struct Point));
for (int i = 0; i < size; i++) {
struct Point temp = { points[i].x - p1.x,points[i].y - p1.y };
vectors[i] = temp;
}
double total_sum = 0;
for (int i = 0; i < size - 1; i++) {
total_sum += calculate_angle(vectors[i], vectors[i + 1]);
}
bool res = (fabs(total_sum - 2 * M_PI)) < 0.00005;
printf("TOTAL SUM %.100f\n", total_sum);
printf("DIFFERENCE SMALL %d\n", fabs(total_sum - 2 * M_PI) < 0.00005);
return fabs(total_sum - 2 * M_PI) < 0.00005;
//return res;
}
Source.c
#define _CRT_SECURE_NO_WARNINGS
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "Point.h"
int main() {
int length = 5;
struct Point p1 = { 1,2 };
struct Point p2 = { 5,0 };
struct Point p3 = { 7,4 };
struct Point p4 = { 5,6 };
struct Point p5 = { 2,5 };
struct Point p6 = { 1,2 };
struct Point points_test[6] = {p1,p2,p3,p4,p5,p6};
struct Point point_test = {7,3};
//bool result = calc_angle_sum(point, points, length + 1);
//printf("%s", result ? "true\n" : "false\n");
if (check_border(point_test, points_test, length + 1)) {
printf("BORDER");
return 0;
}
else if (calc_angle_sum(point_test, points_test, length + 1)) {
printf("INSIDE");
return 0;
}
else {
printf("OUTSIDE");
return 0;
}
}
Point.h
#pragma once
struct Point {
double x, y;
};
coordinates.txt
1 2 5 0 7 4 5 6 2 5
lap.txt (the first number is the number of rectangle vertices, the second and third - the coordinates of the point.
5 7 3
This algorithm determines whether a point is inside/outside/on an edge of a given polygon using the winding number method.
The point (7,3) (the second and third numbers inside lap.txt) lies outside the polygon so the correct answer is "OUTSIDE/FALSE". Nevertheless, the output differs depending on Debug/Release and the way I return from calc_angle_sum function.
When I return this way:
return fabs(total_sum - 2 * M_PI) < 0.00005;
I get inconsistent results depending on debug/release mode.
The following way, however, seems to work fine:
bool res = (fabs(total_sum - 2 * M_PI)) < 0.00005
return res
Below is the part inside Source.c that calls the method:
else if (calc_angle_sum(point, points, length + 1)) {
printf("INSIDE");
return 0;
}
There is something I cannot understand about how expressions are evaluated in C.
In Python, I am used to returning like return <some_expression> and expect it to get converted to True/False. In C, however, this doesn't seem to be true, or else there is some other error.
One obvious problem is that of implicit function declarations.
Neither
bool calc_angle_sum(struct Point p1, struct Point points[], int size)
and
bool check_border(struct Point p, struct Point points[], int size)
are not declared in the translation unit source.c. Therefore the compiler assumes that the functions are of type
int calc_angle_sum()
int check_border()
Were the return type of the functions int, they could be called compatibly in this manner - with these arguments. But because the actual return type of the functions is bool, the behaviour of the function calls is undefined.
C99 removed implicit function definitions. A compliant C99, C11, C17 compiler must complain about these function calls in your source.c.But the substandard MSVC compiler barely meets the long obsolete C89 specification...
Thus, try declaring
bool calc_angle_sum(struct Point p1, struct Point points[], int size);
bool check_border(struct Point p, struct Point points[], int size);
in Point.h.
My problem is that I want to pass a 3 dimensional array as output to matlab and work with it in a seperate C void function.. But when I try my Matlab Crashes. I have almost just started working with mex functions and have created others that works with no problems, but they have only been outputting arrays and single values.
Here is my Code.:
First file is my entry file called pm_motion_entry.c .:
#include <mex.h>
#include <matrix.h>
/* The gateway function */
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
/* Variable declarations here */
double sigma;
double L;
double radius;
int N;
int M;
double *outMatrixOne;
mwSignedIndex *outMatrixTwo;
mxArray *Array;
/* Dimensions for cell array */
mwSize dim[2] = {M,N};
/* checks for right type*/
if(nrhs != 5) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nrhs","Inputs: sigma, L, radius, N, NumberOfParticles");
}
if(nlhs != 2) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nrhs","Too Few output argumetns, Output is I and Particle");
}
sigma = mxGetScalar(prhs[0]);
L = mxGetScalar(prhs[1]);
radius = mxGetScalar(prhs[2]);
N = mxGetScalar(prhs[3]);
M = mxGetScalar(prhs[4]);
if( !mxIsDouble(prhs[0]) || !mxIsDouble(prhs[1]) || !mxIsDouble(prhs[2]) || !mxIsDouble(prhs[3]) || !mxIsDouble(prhs[4])) {
mexErrMsgIdAndTxt("MyToolBox:arrayProduct:notScalar","Input 1,2 and 3 should be of type double, and 4 and 5 should be of type Integer.");
}
/* create output matrix */
plhs[0] = mxCreateDoubleMatrix(N,1,mxREAL);
Array = mxCreateNumericArray(2,dim,mxDOUBLE_CLASS,mxREAL); /* Two cell arrays one for x positions and one for y positions*/
/* assign pointer to output*/
outMatrixOne = mxGetPr(plhs[0]);
outMatrixTwo = (mwSignedIndex*)mxGetData(Array);
/* run function */
pBm_motion(sigma,L,radius,N,M,outMatrixOne,outMatrixTwo);
}
And my main function for calculations called pm_motion.c .:
#define _USE_MATH_DEFINES
#include <math.h>
#define UNIFORM ((rand()+0.5)/(RAND_MAX+1.0)) /* uniform inside [0,1] */
#include "mex.h"
/* Function for the calculation of the Intensity spectra of pure Brownian motion in FSC data */
void pBm_motion(double sigma, double L, double radius, int N, int M, double *In, double ***Particle)
{
/* Inputs/Outputs: D is the diffusion coefficient, tau is the size of timesteps, L is the length of on side of the system, radius is the radius of the focal volume, N is the number of time steps and M is the number of particles */
/* Initialization of Parameters */
int n = 1;
int i;
double pos_x,pos_y;
/* The random walk and advancement in time */
/* First place all particles randomly, this is t=0 */
for(i = 0; i <= M; i++){
Particle[i][0][0] = (-1.0)*UNIFORM + 0.5;
Particle[i][0][1] = (-1.0)*UNIFORM + 0.5;
In[0] += exp(-((Particle[i][0][0]*Particle[i][0][0])+(Particle[i][0][1]*Particle[i][0][1]))/(2.0*radius*radius));
}
/* Then we let time flow in the system */
while(N >= n){
for(i = 0; i <= M; i++){
/* Using the Box-Muller Method to generate random normal distrubuted number to find the new position */
pos_x = Particle[i][n][0] + sigma*sqrt(-2.0*log(UNIFORM))*sin(2.0*M_PI*UNIFORM);
pos_y = Particle[i][n][1] + sigma*sqrt(-2.0*log(UNIFORM))*sin(2.0*M_PI*UNIFORM);
/* Double Periodic violation */
if(pos_x > (L/2.0) && pos_y > (L/2.0)){
pos_x = -(L/2.0)+(pos_x - (L/2.0));
pos_y = -(L/2.0)+(pos_y - (L/2.0));
}
if(pos_x < -(L/2.0) && pos_y > (L/2.0)){
pos_y = -(L/2.0)+(pos_y - (L/2.0));
pos_x = (L/2.0)+((L/2.0) + pos_x);
}
if(pos_x < (L/2.0) && pos_y < -(L/2.0)){
pos_x = -(L/2.0)+(pos_x - (L/2.0));
pos_y = (L/2.0)+((L/2.0) + pos_y);
}
if(pos_x < -(L/2.0) && pos_y < -(L/2.0)){
pos_y = (L/2.0)+((L/2.0) + pos_y);
pos_x = (L/2.0)+((L/2.0) + pos_x);
}
/* Periodic Boundary condition invoked, maybe.. */
if(pos_x > (L/2.0)){
pos_x = -(L/2.0)+(pos_x - (L/2.0));
}
if(pos_y > (L/2.0)){
pos_y = -(L/2.0)+(pos_y - (L/2.0));
}
if(pos_x < -(L/2.0)){
pos_x = (L/2.0)+((L/2.0) + pos_x);
}
if(pos_y < -(L/2.0)){
pos_y = (L/2.0)+((L/2.0) + pos_y);
}
/* Update position. */
Particle[i][n][0] = pos_x;
Particle[i][n][1] = pos_y;
/* Calculate Intensity */
In[n] += exp(-((Particle[i][n][0]*Particle[i][n][0])+(Particle[i][n][1]*Particle[i][n][1]))/(2.0*radius*radius));
}
n++;
}
}
I have I feeling that it is the way i access the 3D Matrix in the Entry file, But I don't know enough about mex yet to be really sure about it.
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 9 years ago.
Improve this question
anyone know how to do the QR decomposition via modified Gram-Schmidt method in C and CUDA. Some example/source/paper or other else? Thanks so much.
Edit: I can't answer to my question because someone have closed it, so i decided to update my question.
/*
* QR decomposition via modified Gram-Schmidt algorithm
*
* #Package = QR-decomposition
* #Program = QR_gpu
* #Version = 13.0928
*/
// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
// Constant
#define PROG "QR_cpu"
#define VERSION "13.1003"
#define PACKAGE "QR-Decomposition"
// Threads per block
#define THREAD_P_BLOCK 512
// Blocks per grid
#define BLOCKS_P_GRID 512
// macro
/* wrap each API call with the gpuErrchk macro, which will process the return
status of the API call it wraps: http://bit.ly/1dTD0ZE */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prototypes
__global__ void xTA (float *, float *, int, int, int, int, int);
__global__ void scale (float *, int, int, float *);
__global__ void r1_update(float *, float *, int, int, int, int);
__host__ void gpuAssert(cudaError_t, char *, int);
__host__ void print_matrix(float *, int, int, int);
__host__ void print_help (int);
__host__ int option_parser(int, char **, int *, int *);
// Host code
int main (int argc, char **argv) {
int m, n, lda, ldr, i;
float *A_d, *R_d;
//cudaEvent_t t_start, t_stop;
// Get "m" and "n" from command line
if (0 != option_parser(argc, argv, &m, &n)) {
fprintf(stderr, "Can\'t continue, exiting now!\n");
exit(EXIT_FAILURE);
}
size_t A_size = m * n * sizeof(float);
size_t R_size = n * n * sizeof(float);
lda = n; ldr = n;
// Allocate input matrices A_h and R_h in host memory
float *A_h = (float *) malloc(A_size);
float *R_h = (float *) malloc(R_size);
memset(R_h, 0, R_size);
// Initialize input matrix
for (i = 0; i < n; i++)
A_h[i*lda + i] = i + 1;
// Allocate matrices in device memory
gpuErrchk (cudaMalloc(&A_d, A_size));
gpuErrchk (cudaMalloc(&R_d, R_size));
// Copy the A matrix from host memory to device memory
gpuErrchk (cudaMemcpy(A_d, A_h, A_size, cudaMemcpyHostToDevice));
// Set R matrix to 0
gpuErrchk (cudaMemset(R_d, 0, R_size));
/**** Invoke kernel ****/
dim3 dimBlock (THREAD_P_BLOCK, 1, 1);
// dimGrid 'monodimensional' (just x value)
dim3 dimGrid_M ((m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK, 1, 1);
// dimGrid 'bidimensional' (x and y values)
dim3 dimGrid_B (BLOCKS_P_GRID, (m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK,1);
// Gram-Schmidt algorithm step by step
for (i = 0; i < n; i++) {
// Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
xTA <<< dimBlock, dimGrid_B >>> (R_d, A_d, m, n, lda, ldr, i);
// Step #3 (Is the scale of a column vector)
scale <<< dimBlock, dimGrid_M >>> (A_d + i, m, lda, R_d + i*ldr + i);
// Step #4 (Is the scale of a row)
scale <<< dimBlock, dimGrid_M >>> (R_d + ldr*i, m, 1, R_d + i*ldr + i);
// Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
r1_update <<< dimBlock, dimGrid_B >>> (A_d, R_d + i*lda, m, n, lda, i);
}
// Copy the results from device memory to host memory
gpuErrchk (cudaMemcpy(A_h, A_d, A_size, cudaMemcpyDeviceToHost));
// Free device memory
cudaFree(A_d); cudaFree(R_d);
// Free host memory
free(A_h); free(R_h);
return 0;
}
/**
* ## Kernel 1
*
* Rank 1 update of columns of A
*/
__global__ void r1_update (float *A, float *R, int m, int n, int lda, int k) {
// get x,y cordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.x;
if (x < m && y < n-k-1)
A[x*lda + y + k + 1] -= A[x*lda + k] * R[y + k + 1];
}
/**
* ## Kernel 2
*
* matrix vector product
* Performs R[i] = x'A where x' is a row of A
*
* How leading dimension is used for matrices: http://ibm.co/19PLtIX
*/
__global__ void xTA (float *R, float *A, int m, int n, int lda, int ldr, int k){
// block column * block dim + column (computed by each thread)
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
int j;
// upper triangular matrix
if (i < n - k) {
for (j = 0; j < m; j++)
R[k*ldr + k + i] += A[k*lda + j] * A[j*lda + k + i];
}
}
/**
* ## Kernel 3
*
* mult. for constant s
* d vector
* ld leading dimension (distance from elements)
*/
__global__ void scale (float *d, int m, int ld, float *R_x) {
// block colum * block dim + column (computed by each thread)
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// s = sqrt(R(i,i))
// Static initialization of shared variables is illegal in CUDA.
// The problem is that the semantics of how every thread should treat
// static initialization of shared memory is undefined in the
// programming model. Which thread should do the write? What happens if
// the value is not uniform between threads? How should the compiler
// emit code for such a case and how should the hardware run it?
__shared__ float s; s = sqrt(*R_x);
// and scale
if (i < m) d[i*ld] /= s;
}
/*
* GPU Error Handler (CUDA_SAFE_CALL deprecated from CUDA 5.0)
*/
__host__ void gpuAssert(cudaError_t code, char *file, int line) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n",
cudaGetErrorString(code), file, line);
exit(code);
}
}
/*
* Print matrix
*
* Print a matrix passed as argument
*/
__host__ void print_matrix (float * matrix, int m, int n, int ld) {
int i, j;
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++)
printf("%0.5f ", matrix[i*ld + j]);
printf("\n");
}
}
/*
* The option parser
*
* The function parses the parameters passed from the command line and run
* their own procedures.
*
* Return value:
* 0 on success
* -1 on failure
*
* Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
* for further informations. (thanks to Frodo Looijaard)
*/
__host__ int option_parser (int argc, char **argv, int * m, int * n) {
int opt;
if (argc < 2) {
fprintf(stderr, "The program needs arguments...\n\n");
print_help(1);
}
opterr = 0;
while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
switch (opt) {
case 'h':
print_help(0);
case 'r':
printf("optarg: %s\n", optarg);
if ((*m = atoi(optarg)) < 2) return -1;
break;
case 'c':
printf("optarg: %s\n", optarg);
if ((*n = atoi(optarg)) < 2 || *n > *m) return -1;
break;
case '?':
if (optopt == 'r' || optopt == 'c')
fprintf(stderr,"Option -%c requires an argument.\n",optopt);
else if (isprint (optopt))
fprintf(stderr,"Unknown option `-%c'.\n", optopt);
else
fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
return -1;
default:
fprintf(stderr, "default switch-case statement reached\n");
return -1;
}
//for (ii = optind; ii < argc; ii++)
// printf ("non-option argument %s\n", argv[ii]);
}
return 0;
}
/*
* The helper
*
* Show the info to run the program in the correct way
*/
__host__ void print_help (int exit_code) {
printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);
printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
printf(" -h print this help and exit\n");
printf(" -r provide the number of rows\n");
printf(" -c provide the number of colums\n\n");
printf(" Example: ./qr_gpu -r 800 -c 600\n\n");
exit_code == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}
When i run the program with cuda-memcheck I obtain this result:
[mcrociara#tesla project_CUDA]$ cuda-memcheck ./qr_gpu -r 4 -c 4
========= CUDA-MEMCHECK optarg: 4 optarg: 4 GPUassert: unspecified launch failure src/qr_gpu.cu 99
========= Invalid global read of size 4
========= at 0x000000c8 in xTA
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x3b5273104 is out of bounds
========= Invalid global read of size 4
========= at 0x000000c8 in xTA
========= by thread (1,0,0) in block (0,0,0)
========= Address 0x3b5273108 is out of bounds
========= Invalid global read of size 4
========= at 0x000000c8 in xTA
========= by thread (2,0,0) in block (0,0,0)
========= Address 0x3b527310c is out of bounds
========= ERROR SUMMARY: 3 errors
Someone may help to understand why? I implemented the serial version on this algorithm that seem to work properly:
/*
* QR decomposition via modified Gram-Schmidt algorithm
*/
// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
// Constant
#define PROG "QR_cpu"
#define VERSION "28.0913"
#define PACKAGE "QR-Decomposition"
// Prototypes
void r1_update(double *, double *, int);
void xTA (double *, double *, int);
void scale (double *, int, double);
void gram (double *, double *);
void print_matrix(double *);
int option_parser(int, char **);
void print_help (int);
// Number of rows
int M = -1;
// Number of columns
int N = -1;
// Leading Dimension of A and R
int LDA = -1;
int LDR = -1;
// The clock
clock_t start, stop;
/*
*
*/
int main (int argc, char **argv) {
int i;
// Get M, N from command line
if (0 != option_parser(argc, argv)) {
fprintf(stderr, "Bad option man!!!\n");
exit(EXIT_FAILURE);
}
// Check the size of M and N
if (M > 5000 || N > 5000) {
fprintf(stderr, "Too big man!!!\n");
exit(EXIT_FAILURE);
}
// Set the leading dimension of A and R
LDA = N; LDR = N;
// Allocate memory for A and R
double *A = calloc(M * N, sizeof(*A));
double *R = calloc(N * N, sizeof(*R));
// Set the diagonal of A as A(i,i) = i + 1 with i from 0 to N−1
for (i = 0; i < N; i++)
A[i*LDA + i] = i + 1;
start = clock();
gram(A, R);
stop = clock();
printf("\nTime: %0.4f\n\n",(stop - start) / (double)(CLOCKS_PER_SEC));
// print_matrix(A);
// print_matrix(R);
free(A); free(R);
return 0;
}
/**
* Rank 1 update of columns of A
*/
void r1_update (double *A, double *R, int k) {
int i, j;
for(i = 0; i < M; i++)
for(j = k + 1; j < N; j++)
A[i*LDA + j] -= A[i*LDA + k] * R[j];
}
/**
* Matrix vector product
* Performs R[i] = x'A where x' is a row of A
* A : m x k, leading dimebsion, lda
*
* How leading dimension is used for matrices: http://ibm.co/19PLtIX
*/
void xTA (double *R, double *A, int k) {
int i, j;
// upper triangular matrix
for (i = 0; i < N-k; i++)
for (j = 0; j < M; j++)
R[k*LDR + k + i] += A[k*LDA + j] * A[j*LDA + k + i];
}
/**
* Mult. for constant s
* d vector
* ld leading dimension (distance from elements)
*/
void scale (double *d, int ld, double s) {
int i;
for (i = 0; i < M; i++) d[i*ld] *= s;
}
/**
* Performs Modified Gram Schmidt
* ortogonalization of columns of A
* A m x n
* R n x n
*/
void gram (double *A, double *R) {
int i;
double s;
// Modified Gram Schmidt algorithm step by step
for (i = 0; i < N; i++) {
// Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
xTA(R, A, i);
// Step #2 (Normalizing) --> s = sqrt(R(i,i))
s = 1 / sqrt(R[i*LDR + i]);
// Step #3 (Is the scale of a column vector)
scale(A + i, LDA, s);
// Step #4 (Is the scale of a row)
scale(R + LDR*i, 1, s);
// Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
r1_update(A, R + i*LDA, i);
}
}
/*
* Print Matrix
*
* Print a matrix passed as argument
*/
void print_matrix (double * matrix) {
int i, j;
for (i = 0; i < M; i++) {
for (j = 0; j < N; j++)
printf("%0.2f ", matrix[i*LDA + j]);
printf("\n");
}
}
/*
* The option parser
*
* The function parses the parameters passed from the command line and run
* their own procedures.
*
* Return value:
* 0 on success
* -1 on failure
*
* Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
* for further informations. (thanks to Frodo Looijaard)
*/
int option_parser (int argc, char **argv) {
int opt;
if (argc < 2) {
fprintf(stderr, "This program needs arguments...\n\n");
print_help(1);
}
opterr = 0;
while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
switch (opt) {
case 'h':
print_help(0);
case 'r':
printf("optarg: %s\n", optarg);
if ((M = atoi(optarg)) < 2) return -1;
break;
case 'c':
printf("optarg: %s\n", optarg);
if ((N = atoi(optarg)) < 2 || N > M) return -1;
break;
case '?':
if (optopt == 'r' || optopt == 'c')
fprintf(stderr,"Option -%c requires an argument.\n",optopt);
else if (isprint (optopt))
fprintf(stderr,"Unknown option `-%c'.\n", optopt);
else
fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
return -1;
default:
fprintf(stderr, "default switch-case statement reached\n");
return -1;
}
//for (ii = optind; ii < argc; ii++)
// printf ("Non-option argument %s\n", argv[ii]);
}
return 0;
}
/*
* The helper
*
* Shows the info to run the program in the correct way
*/
void print_help (int exit_val) {
printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);
printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
printf(" -h print this help and exit\n");
printf(" -r provide the number of rows\n");
printf(" -c provide the number of colums\n\n");
printf(" Example: ./qr_cpu -r 800 -c 600\n\n");
exit_val == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}
Thanks in advance!
There are different ways to calculate the QR decomposition of a matrix. The main methods are:
Gram-Schmidt process;
Householder reflections;
Givens rotations;
Gram-Schmidt is a sequence of projections and vector subtractions, which may be implemented as a sequence of kernels performing reductions (for projections) and element-wise array operations (vector subtractions). You can have a look at the papers
a) QR Decomposition on GPUs
b) Parallel Implementation of Classical Gram-Schmidt Orthogonalization on CUDA Graphics Cards
c) CPU vs. GPU - Performance comparison for the Gram-Schmidt algorithm
QR decomposition via Householder reflections is dominated by matrix-vector operations and you can find some information in paper a), paper
d) Benchmarking GPUs to Tune Dense Linear Algebra
and an implementation by V. Volkov and J.W. Demmel is available at
LU, QR and Cholesky factorizations using GPU
Givens rotations do not appear to me to be very popular as a parallel approach to QR decomposition. Basically, each Givens rotation modifies two rows, so that some parallelization possible also by aid of the Sameh-Kuck pattern allowing up to n concurrent rotations. You can find some information at
Benchmarking the NVIDIA 8800GTX with the CUDA Development Platform
Actually, a clear performance comparison between the different approaches as implemented in CUDA architectures isn't available. Be aware that some of the posted material regards optimizations on "old" architectures. So, perhaps further improvements could be achieved by further optimizations for the "newer" GPU generations.
Can anyone give me a short code example in c using opengl where clicking in two different squares changes their color? I'm particularly interested in knowing how to detect that a mouse click has happened to a particular primitive.
Here's a GL selection-mode example
/* Copyright (c) Mark J. Kilgard, 1994. */
/**
* (c) Copyright 1993, Silicon Graphics, Inc.
* ALL RIGHTS RESERVED
* Permission to use, copy, modify, and distribute this software for
* any purpose and without fee is hereby granted, provided that the above
* copyright notice appear in all copies and that both the copyright notice
* and this permission notice appear in supporting documentation, and that
* the name of Silicon Graphics, Inc. not be used in advertising
* or publicity pertaining to distribution of the software without specific,
* written prior permission.
*
* THE MATERIAL EMBODIED ON THIS SOFTWARE IS PROVIDED TO YOU "AS-IS"
* AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR OTHERWISE,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTY OF MERCHANTABILITY OR
* FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL SILICON
* GRAPHICS, INC. BE LIABLE TO YOU OR ANYONE ELSE FOR ANY DIRECT,
* SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY
* KIND, OR ANY DAMAGES WHATSOEVER, INCLUDING WITHOUT LIMITATION,
* LOSS OF PROFIT, LOSS OF USE, SAVINGS OR REVENUE, OR THE CLAIMS OF
* THIRD PARTIES, WHETHER OR NOT SILICON GRAPHICS, INC. HAS BEEN
* ADVISED OF THE POSSIBILITY OF SUCH LOSS, HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE
* POSSESSION, USE OR PERFORMANCE OF THIS SOFTWARE.
*
* US Government Users Restricted Rights
* Use, duplication, or disclosure by the Government is subject to
* restrictions set forth in FAR 52.227.19(c)(2) or subparagraph
* (c)(1)(ii) of the Rights in Technical Data and Computer Software
* clause at DFARS 252.227-7013 and/or in similar or successor
* clauses in the FAR or the DOD or NASA FAR Supplement.
* Unpublished-- rights reserved under the copyright laws of the
* United States. Contractor/manufacturer is Silicon Graphics,
* Inc., 2011 N. Shoreline Blvd., Mountain View, CA 94039-7311.
*
* OpenGL(TM) is a trademark of Silicon Graphics, Inc.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <GL/glut.h>
#define MAXOBJS 10000
#define MAXSELECT 100
#define MAXFEED 300
#define SOLID 1
#define LINE 2
#define POINT 3
GLint windW = 300, windH = 300;
GLuint selectBuf[MAXSELECT];
GLfloat feedBuf[MAXFEED];
GLint vp[4];
float zRotation = 90.0;
float zoom = 1.0;
GLint objectCount;
GLint numObjects;
struct object {
float v1[2];
float v2[2];
float v3[2];
float color[3];
} objects[MAXOBJS];
GLenum linePoly = GL_FALSE;
static void InitObjects(GLint num)
{
GLint i;
float x, y;
if (num > MAXOBJS) {
num = MAXOBJS;
}
if (num < 1) {
num = 1;
}
objectCount = num;
srand((unsigned int) time(NULL));
for (i = 0; i < num; i++) {
x = (rand() % 300) - 150;
y = (rand() % 300) - 150;
objects[i].v1[0] = x + (rand() % 50) - 25;
objects[i].v2[0] = x + (rand() % 50) - 25;
objects[i].v3[0] = x + (rand() % 50) - 25;
objects[i].v1[1] = y + (rand() % 50) - 25;
objects[i].v2[1] = y + (rand() % 50) - 25;
objects[i].v3[1] = y + (rand() % 50) - 25;
objects[i].color[0] = ((rand() % 100) + 50) / 150.0;
objects[i].color[1] = ((rand() % 100) + 50) / 150.0;
objects[i].color[2] = ((rand() % 100) + 50) / 150.0;
}
}
static void Init(void)
{
numObjects = 10;
InitObjects(numObjects);
}
static void Reshape(int width, int height)
{
windW = width;
windH = height;
glViewport(0, 0, windW, windH);
glGetIntegerv(GL_VIEWPORT, vp);
}
static void Render(GLenum mode)
{
GLint i;
for (i = 0; i < objectCount; i++) {
if (mode == GL_SELECT) {
glLoadName(i);
}
glColor3fv(objects[i].color);
glBegin(GL_POLYGON);
glVertex2fv(objects[i].v1);
glVertex2fv(objects[i].v2);
glVertex2fv(objects[i].v3);
glEnd();
}
}
static GLint DoSelect(GLint x, GLint y)
{
GLint hits;
glSelectBuffer(MAXSELECT, selectBuf);
glRenderMode(GL_SELECT);
glInitNames();
glPushName(~0);
glPushMatrix();
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPickMatrix(x, windH - y, 4, 4, vp);
gluOrtho2D(-175, 175, -175, 175);
glMatrixMode(GL_MODELVIEW);
glClearColor(0.0, 0.0, 0.0, 0.0);
glClear(GL_COLOR_BUFFER_BIT);
glScalef(zoom, zoom, zoom);
glRotatef(zRotation, 0, 0, 1);
Render(GL_SELECT);
glPopMatrix();
hits = glRenderMode(GL_RENDER);
if (hits <= 0) {
return -1;
}
return selectBuf[(hits - 1) * 4 + 3];
}
static void RecolorTri(GLint h)
{
objects[h].color[0] = ((rand() % 100) + 50) / 150.0;
objects[h].color[1] = ((rand() % 100) + 50) / 150.0;
objects[h].color[2] = ((rand() % 100) + 50) / 150.0;
}
static void DeleteTri(GLint h)
{
objects[h] = objects[objectCount - 1];
objectCount--;
}
static void GrowTri(GLint h)
{
float v[2];
float *oldV;
GLint i;
v[0] = objects[h].v1[0] + objects[h].v2[0] + objects[h].v3[0];
v[1] = objects[h].v1[1] + objects[h].v2[1] + objects[h].v3[1];
v[0] /= 3;
v[1] /= 3;
for (i = 0; i < 3; i++) {
switch (i) {
case 0:
oldV = objects[h].v1;
break;
case 1:
oldV = objects[h].v2;
break;
case 2:
oldV = objects[h].v3;
break;
}
oldV[0] = 1.5 * (oldV[0] - v[0]) + v[0];
oldV[1] = 1.5 * (oldV[1] - v[1]) + v[1];
}
}
static void Mouse(int button, int state, int mouseX, int mouseY)
{
GLint hit;
if (state == GLUT_DOWN) {
hit = DoSelect((GLint) mouseX, (GLint) mouseY);
if (hit != -1) {
if (button == GLUT_LEFT_BUTTON) {
RecolorTri(hit);
} else if (button == GLUT_MIDDLE_BUTTON) {
GrowTri(hit);
} else if (button == GLUT_RIGHT_BUTTON) {
DeleteTri(hit);
}
glutPostRedisplay();
}
}
}
static void Draw(void)
{
glPushMatrix();
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(-175, 175, -175, 175);
glMatrixMode(GL_MODELVIEW);
glClearColor(0.0, 0.0, 0.0, 0.0);
glClear(GL_COLOR_BUFFER_BIT);
glScalef(zoom, zoom, zoom);
glRotatef(zRotation, 0, 0, 1);
Render(GL_RENDER);
glPopMatrix();
glutSwapBuffers();
}
static void DumpFeedbackVert(GLint * i, GLint n)
{
GLint index;
index = *i;
if (index + 7 > n) {
*i = n;
printf(" ???\n");
return;
}
printf(" (%g %g %g), color = (%4.2f %4.2f %4.2f)\n",
feedBuf[index],
feedBuf[index + 1],
feedBuf[index + 2],
feedBuf[index + 3],
feedBuf[index + 4],
feedBuf[index + 5]);
index += 7;
*i = index;
}
static void DrawFeedback(GLint n)
{
GLint i;
GLint verts;
printf("Feedback results (%d floats):\n", n);
for (i = 0; i < n; i++) {
switch ((GLint) feedBuf[i]) {
case GL_POLYGON_TOKEN:
printf("Polygon");
i++;
if (i < n) {
verts = (GLint) feedBuf[i];
i++;
printf(": %d vertices", verts);
} else {
verts = 0;
}
printf("\n");
while (verts) {
DumpFeedbackVert(&i, n);
verts--;
}
i--;
break;
case GL_LINE_TOKEN:
printf("Line:\n");
i++;
DumpFeedbackVert(&i, n);
DumpFeedbackVert(&i, n);
i--;
break;
case GL_LINE_RESET_TOKEN:
printf("Line Reset:\n");
i++;
DumpFeedbackVert(&i, n);
DumpFeedbackVert(&i, n);
i--;
break;
default:
printf("%9.2f\n", feedBuf[i]);
break;
}
}
if (i == MAXFEED) {
printf("...\n");
}
printf("\n");
}
static void DoFeedback(void)
{
GLint x;
glFeedbackBuffer(MAXFEED, GL_3D_COLOR, feedBuf);
(void) glRenderMode(GL_FEEDBACK);
glPushMatrix();
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(-175, 175, -175, 175);
glMatrixMode(GL_MODELVIEW);
glClearColor(0.0, 0.0, 0.0, 0.0);
glClear(GL_COLOR_BUFFER_BIT);
glScalef(zoom, zoom, zoom);
glRotatef(zRotation, 0, 0, 1);
Render(GL_FEEDBACK);
glPopMatrix();
x = glRenderMode(GL_RENDER);
if (x == -1) {
x = MAXFEED;
}
DrawFeedback((GLint) x);
}
static void Key(unsigned char key, int x, int y)
{
switch (key) {
case 'z':
zoom /= 0.75;
glutPostRedisplay();
break;
case 'Z':
zoom *= 0.75;
glutPostRedisplay();
break;
case 'f':
DoFeedback();
glutPostRedisplay();
break;
case 'l':
linePoly = !linePoly;
if (linePoly) {
glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
} else {
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
}
glutPostRedisplay();
break;
case 27:
exit(0);
}
}
static void SpecialKey(int key, int x, int y)
{
switch (key) {
case GLUT_KEY_LEFT:
zRotation += 0.5;
glutPostRedisplay();
break;
case GLUT_KEY_RIGHT:
zRotation -= 0.5;
glutPostRedisplay();
break;
}
}
int main(int argc, char **argv)
{
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
glutCreateWindow("Select Test");
Init();
glutReshapeFunc(Reshape);
glutKeyboardFunc(Key);
glutSpecialFunc(SpecialKey);
glutMouseFunc(Mouse);
glutDisplayFunc(Draw);
glutMainLoop();
return 0; /* ANSI C requires main to return int. */
}
That program is a part of some other GLUT examples.
The GL selection buffer is old and busted though, you're probably better off using color-readback selection or some CPU-side "ray casting" system that integrates with your geometry representation.
Read up on OpenGL's selection feature. That is the classical way of doing it, and should work well for at least small amount of object (which sounds right for your question).
Not a complete example, but there's also a section on the OpenGL FAQ on Picking and Selection which should be noted.