atomicAdd causing error Unable to Launch/Execute Kernel - c

I have the following CUDA C code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
which causes my program to crash with the error: "unable to launch/execute kernel"
Here buffer is an input array of integers to this function of size elements and histo_private is an array of integers in shared memory of histo_size elements.
I know this isn't an index out of bounds error because when I use the code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
int a = histo_private[buffer[i]];
i+=stride;
}
So I gather that there is something wrong with the atomicAdd function and/or the memory address of this 32-bit int array.
The kernel.cu file contains the following code:
// Define your kernels in this file you may use more than one kernel if you
// need to
// INSERT KERNEL(S) HERE
__global__ void histo_kernel(unsigned int* buffer, unsigned int size, int* histo, unsigned int histo_size)
{
extern __shared__ int histo_private[];
if(threadIdx.x < histo_size)
histo_private[threadIdx.x] = 0;
__syncthreads();
// compute block's histogram
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
//int a = histo_private[buffer[i]];
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
// store to global histogram
__syncthreads();
//if(threadIdx.x < histo_size)
// atomicAdd(&(histo[threadIdx.x]),histo_private[threadIdx.x]);
}
// ensures that no bins contains more than 255 elements
__global__ void enforce_saturation(int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
{
if(histo[i] > 255) // this will be necessary to prevent data loss
histo[i] = 255; // when converting from int to uint8_t
}
}
__global__ void construct_histo(uint8_t* histo_unpacked, int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
histo_unpacked[i] = histo[i];
}
// unpacks the input array into an output array with 'spaces'
__global__ void unpack(uint8_t* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[4*i] = in[i];
out[4*i+1] = 0;
out[4*i+2] = 0;
out[4*i+3] = 0;
}
}
// converts the input uint8_t array to an int array
__global__ void convert(uint8_t* in, int* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (int) in[4*i];
}
}
// converts the input int array to a uint8_t array
__global__ void convert_back(int* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (uint8_t) in[i];
}
}
void histogram(unsigned int* input, uint8_t* bins, unsigned int num_elements, unsigned int num_bins)
{
int BLOCK_SIZE = (int) num_bins;
BLOCK_SIZE = 512;
dim3 dim_grid, dim_block;
dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
dim_grid.x = 1+(num_elements-1)/BLOCK_SIZE; dim_grid.y = dim_grid.z = 1;
// create an array of uint8_t to be converted into an array of int
uint8_t* bins_unpacked;
cudaMalloc((void**)&bins_unpacked, 4 * num_bins * sizeof(uint8_t));
// unpack the input uint8_t array
unpack<<<dim_grid,dim_block>>>(bins, bins_unpacked, num_bins);
// need an int version of bins_d
int* bins_int_d;
cudaMalloc((void**)&bins_int_d, num_bins * sizeof(int));
// convert the uint8_t array to an int array
convert<<<dim_grid,dim_block>>>(bins_unpacked, bins_int_d, num_bins);
// run kernel and enforce saturation requirements
int histo_private_size = num_bins;
histo_kernel<<<dim_grid,dim_block,histo_private_size>>>(input, num_elements, bins_int_d, num_bins);
enforce_saturation<<<dim_grid,dim_block>>>(bins_int_d,num_bins);
// convert the int array back to uint8_t
convert_back<<<dim_grid,dim_block>>>(bins_int_d, bins, num_bins);
}
While the function that calls this last histogram function is in main.cu (I did NOT make this second file--it was provided to me--also, I have been testing this on consistent data by compiling via make test-mode):
#include <stdio.h>
#include <stdint.h>
#include "support.h"
#include "kernel.cu"
int main(int argc, char* argv[])
{
Timer timer;
// Initialize host variables ----------------------------------------------
#if TEST_MODE
printf("\n***Running in test mode***\n"); fflush(stdout);
#endif
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
unsigned int *in_h;
uint8_t* bins_h;
unsigned int *in_d;
uint8_t* bins_d;
unsigned int num_elements, num_bins;
cudaError_t cuda_ret;
if(argc == 1) {
num_elements = 1000000;
num_bins = 4096;
} else if(argc == 2) {
num_elements = atoi(argv[1]);
num_bins = 4096;
} else if(argc == 3) {
num_elements = atoi(argv[1]);
num_bins = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./histogram # Input: 1,000,000, Bins: 4,096"
"\n Usage: ./histogram <m> # Input: m, Bins: 4,096"
"\n Usage: ./histogram <m> <n> # Input: m, Bins: n"
"\n");
exit(0);
}
initVector(&in_h, num_elements, num_bins);
bins_h = (uint8_t*) malloc(num_bins*sizeof(uint8_t));
// TESTING
for(unsigned int i = 0; i < num_bins; ++i)
{
bins_h[i] = i;
//printf("uint8_t Element %u: is %u \n", i, bins_h[i]);
}
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Input size = %u\n Number of bins = %u\n", num_elements,
num_bins);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMalloc((void**)&in_d, num_elements * sizeof(unsigned int));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cuda_ret = cudaMalloc((void**)&bins_d, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(in_d, in_h, num_elements * sizeof(unsigned int),
cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cuda_ret = cudaMemset(bins_d, 0, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to set device memory");
// TESTING
//cuda_ret = cudaMemcpy(bins_d, bins_h, num_bins * sizeof(uint8_t),
// cudaMemcpyHostToDevice);
//if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
histogram(in_d, bins_d, num_elements, num_bins);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(bins_h, bins_d, num_bins * sizeof(uint8_t),
cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
#if TEST_MODE
printf("\nResult:\n");
for(unsigned int binIdx = 0; binIdx < num_bins; ++binIdx) {
printf("Bin %u: %u elements\n", binIdx, bins_h[binIdx]);
}
printf("\nElements Vec:\n");
for(unsigned int i = 0; i < num_elements; ++i) {
printf("Element %u: %u is \n", i, in_h[i]);
}
#endif
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(in_h, bins_h, num_elements, num_bins);
// Free memory ------------------------------------------------------------
cudaFree(in_d); cudaFree(bins_d);
free(in_h); free(bins_h);
return 0;
}

Turns out that this was just an index out of bounds error. The element buffer[i] was greater than the length of histo_private. As another poster mentioned, this was not obvious due to the following artifact of the c compiler:
The compiler is permitted to assume every access is within bounds. That line of my test code did nothing if the access is within bounds and therefore the compiler is permitted to assume that line of code does nothing. Thus it didn't require an access so the successful run of the test code was misleading. Once that line was changed to where the variable hist_private was modified at buffer[i], runtime errors came about.

Related

Assigning pointers of different type to existing memory

I tried to write a type agnostic mem_take.
It would take a peace of preallocated memory and assign its chunks to several pointers. Those pointers could point to different types: floats, doubles, etc.
unsigned int mem_take( void** mem_input, void** mem_pos, const int size_bytes, const int alignment )
{
// get address
unsigned int addr = (unsigned int)*mem_pos;
// align
unsigned int adjustment_bytes = 0;
unsigned int misalignment = addr % alignment;
if(misalignment != 0)
{
adjustment_bytes = alignment - misalignment;
addr += adjustment_bytes;
}
// take aligned address
*mem_input = (void*)addr;
// move current position to next free location
addr += size_bytes;
*mem_pos = (void*)addr;
// return bytes taken
return (size_bytes + adjustment_bytes);
}
Example:
main()
{
char mem[SOME_SIZE];
void* mem_pos = mem;
float* f;
double* d;
int bytes_taken_f = mem_take((void**)&f , &mem_pos, 2 * sizeof(float) , 8); // 2 floats
int bytes_taken_d = mem_take((void**)&d , &mem_pos, 3 * sizeof(double), 8); // 3 doubles
// etc.
// now free to use the memory via arrays
f[0] = 1.0f;
f[1] = 2.0f;
d[0] = 1.0f;
d[1] = 2.0f;
d[2] = 3.0f;
}
The reason why this is done is the platform - a DSP processor, where memory is very limited. (Long story.)
Is this solution valid? Is mem_take written properly?
p.s.
Note, real use case is a fixed point DSP processor, so the above example is "simplified" in a way that types will not be float, double, but ones specific to the processor.
Doing:
void f(void **f);
float a;
f((void**)&a);
is not a good idea, cause void** is not portable. You need to take the address of void* variable:
void f(void **f);
float a;
void *tmp = &a;
f(&tmp);
, but you can return void* just like malloc!
It's like writing simple malloc function which allocated memory on custom stack.
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
void *mem_take(void **mem, size_t *memsize, size_t nmemb, size_t size, size_t alignment, size_t *bytes_to_take)
{
assert(mem != NULL);
assert(nmemb != 0); // we can't "free"
assert(size != 0);
assert(alignment != 0);
// get array size
const size_t s = nmemb * size;
if (nmemb != 0 && s / nmemb != size) {
// overflow!
return NULL;
}
// align
const size_t rest = (size_t)((uintptr_t)*mem % alignment);
const size_t alignmentadd = rest == 0 ? 0 : alignment - rest;
// fprintf(stderr, "alignment %d %d %d %d\n", alignmentadd, ((uintptr_t)*mem % alignment), *memsize, s);
// inform of bytes we need to take
if (bytes_to_take != NULL) {
*bytes_to_take = alignmentadd + s;
}
// check free memory
if (*memsize < alignmentadd + s) {
//fprintf(stderr, "ENOMEM %d %d %d \n", *memsize, alignmentadd, s);
// ENOMEM!
return NULL;
}
// update state, get s + alignmentadd bytes, return ret
char *memc = *mem;
memc += alignmentadd;
void * const ret = memc;
memc += s;
*mem = memc;
*memsize -= alignmentadd + s;
return ret;
}
int main()
{
#define SOME_SIZE (sizeof(float) * 2 + 3 * sizeof(double) + 8)
char mem[SOME_SIZE];
void *mem_pos = mem;
size_t memsize = sizeof(mem);
float *f = mem_take(&mem_pos, &memsize, 2, sizeof(float), 8, NULL); // 2 floats
assert(f != NULL);
double *d = mem_take(&mem_pos, &memsize, 3, sizeof(double), 8, NULL); // 3 doubles
assert(d != NULL);
// etc.
// now free to use the memory via arrays
f[0] = 1.0f;
f[1] = 2.0f;
d[0] = 1.0f;
d[1] = 2.0f;
d[2] = 3.0f;
printf("%f %f %lf %lf %lf\n", f[0], f[1], d[0], d[1], d[2]);
}

AVX2 1GB long array

I have a 1gb long array with floats in a .bin file. After i read it how can i sum the elements with avx2 instrucion, and print the result?
I edited my code with Jake 'Alquimista' LEE's answer.
The problem is the result much smaller than it will be. And other question, how can i add a constant to each number that i readed from .bin file?
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
while (len--)
len >>= 3;
sum = _mm256_set1_ps(0.0f);
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
int main(void)
{
FILE *file;
float *buffer2;
uint32_t fileLen;
if((file = fopen("example.bin","rb"))==NULL)
{
printf("Error! opening file");
exit(1);
}
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
buffer2=(float *)malloc(fileLen+1);
if (!buffer2)
{
fprintf(stderr, "Memory error!");
fclose(file);
return 0;
}
fread(buffer2, fileLen, 1, file);
fclose(file);
printf( "File size : %lu Bits \n", fileLen );
for(int i = 0; i<10; i++)
printf("%f \n", buffer2[i]);
float sum =sumf(buffer2,fileLen);
printf("%f\n",s);
free(buffer2);
return 0;
}
Reading 1GB file into memory is big memory and I/O overhead. Although I'm not very familiar with AVX2, i read articles from Internet & i could come up with the following solution which is actually tested and proved to be working.
My solution consists of reading the file as chuncks of 512 Bytes (Blocks of 128 floats) then summing up the pairs of vectors (16 Total vectors per block) so that at the end we get a final __m256 vector, by casting it to a float* we could sum up its individual components to get the final result.
A case where the file is not 128-floats aligned is handled in the last for loop by summing up individual floats.
The code is commented but in case you have any suggestions to add more explanation to the answer then feel free to do so.
#include <immintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
int make_floatf(char *, int);
float avx_sfadd(char*);
char error_buf[1024];
#define PERROR() \
do { \
strerror_r(errno, error_buf, 1024); \
printf("Error: %s\n", error_buf); \
fclose(fp); \
return -1; \
} while(0)
/* This function generates a .bin file containing blocks
* of 128 floating point numbers
*/
int make_floatf(char *filename, int nblocks)
{
FILE *fp = NULL;
if(!(fp = fopen(filename, "wb+")))
PERROR();
float *block_ptr = malloc(sizeof(float) * 128); /* 512 Bytes block of 128 floats */
if(!block_ptr)
PERROR();
int j, i;
for(j = 0; j < nblocks; j++)
{
for(i = 0; i < 128; i++)
block_ptr[i] = 1.0;
int ret = fwrite(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
{
free(block_ptr);
PERROR();
}
}
free(block_ptr);
fclose(fp);
return 0;
}
/* This function reads the .bin file as chuncks of 512B
* blocks (128 floating point numbers) and calculates thier sum.
* The final sum in a form of vector is looped through and its
* components are summed up to get the final result.
*/
float avx_sfadd(char *filename)
{
FILE *fp = NULL;
__m256 v1;
__m256 v2;
__m256 sum = _mm256_setzero_ps();
if(!(fp = fopen(filename, "rb")))
PERROR();
struct stat stat_buf;
stat(filename, &stat_buf);
size_t fsize = stat_buf.st_size;
size_t nblocks = fsize / (sizeof(float) * 128);
size_t rem_size = fsize - nblocks * sizeof(float) * 128;
size_t rem_floats = rem_size / (sizeof(float));
printf("File size: %ld\nnblocks:%ld\nnremfloats: %ld\n",\
fsize, nblocks, rem_floats);
/* This memory area will hold the 128 floating point numbers per block */
float *block_ptr = malloc(sizeof(float) * 128);
if(!block_ptr)
PERROR();
int i;
for(i = 0; i < nblocks; i++)
{
int ret = fread(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
PERROR();
/* Summing up vectors in a block of 16 vectors (128 floats) */
int j;
for(j = 0; j < 16; j += 2)
{
v1 = _mm256_loadu_ps(block_ptr + j*8);
v2 = _mm256_loadu_ps(block_ptr + (j+1)*8);
sum += _mm256_add_ps(v1, v2);
}
}
/* Handling the case if the last chunck of the file doesn't make
* a complete block.
*/
float rem_sum = 0;
if(rem_size > 0)
{
int ret = fread(block_ptr, 1, rem_size, fp);
if(ret < rem_floats)
PERROR();
int j;
for(j = 0; j < rem_floats; j++)
rem_sum += block_ptr[j];
}
float final_sum = rem_sum;
float *sum_ptr = (float*)∑ /* The final vector hold the sum of all vectors */
/* Summing up the values of the last vector to get the final result */
int k;
for(k = 0; k < 8; k++)
final_sum += sum_ptr[k];
free(block_ptr);
fclose(fp);
return final_sum;
}
int main(int argc, char **argv)
{
if(argc < 2){
puts("./main filename [nblocks]");
return 0;
}
/* ./main filename number_of_block_to_create (eg. ./main floats.bin 1024 )*/
else if(argc == 3){
if(!make_floatf(argv[1], atoi(argv[2])))
puts("File has been created sucessfully\n");
}
/* ./main filename (eg. ./main floats.bin) to calculate sum*/
else
printf("avx_sum = %f\n", avx_sfadd(argv[1])) :
return 0;
}
Here's (most likely) your bug:
while (len--)
len >>= 3;
That's a while loop. As long as len != 0, you replace len with (len - 1) >> 3. And then you change it to -1. No loop to be seen.
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
len >>= 3;
sum = _mm256_set1_ps(0.0f);
while (len--)
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
in = *(__m256 *)&_mm256_permute4x64_pd(*(__m256d *)&sum, 0b01001110);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
The function above will do. However, I don't think that it will bring much of a performance gain, if any, since it's a very trivial one, and the compiler will do auto-vectorize it anyway.
Please note that you have to typecast the pointer to float *, and divide filelen by sizeof(float) when you pass them as arguments.

CUDA reduction to find the maximum of an array

I am doing the Udacity course on parallel programming (homework 3) and can not figure out why I can't get the maximum in the array using parallel reduction (Udacity forums yet to provide solution). I am pretty certain that I have set up the arrays properly and that the algorithm is correct. I suspect that I have a problem with memory management (accessing out of bounds, incorrect array sizes, copying to and from). Please help! I am running this in the Udacity environment, not locally. Below is the code that I am currently using. For some reason when I change the fmaxf's to fminf's it does find the minimum.
#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
extern __shared__ float temp[];
if (myId < size) {
temp[tid] = d_logLum[myId];
}
else {
temp[tid] = d_logLum[tid];
}
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
if (myId < size) {
temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
} else {
temp[tid] = d_logLum[tid];
}
}
__syncthreads();
}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
__global__ void reduce_max_kernel2(float *d_out, float *d_in) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if (tid < s) {
d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
}
__syncthreads();
}
if (tid == 0) {
d_out[0] = d_in[0];
}
}
void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
//int size = 1 << 18;
int points = numRows * numCols;
int logPoints = ceil(log(points)/log(2));
int sizePow = logPoints;
int size = pow(2, sizePow);
int numThreads = 1024;
int numBlocks = size / numThreads;
float *d_out;
float *d_max_out;
checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));
cudaDeviceSynchronize();
reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);
cudaDeviceSynchronize();
reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);
float h_out_max;
checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));
printf("%f\n", h_out_max);
checkCudaErrors(cudaFree(d_max_out));
checkCudaErrors(cudaFree(d_out));
}
You are trying to reproduce the reduce2 reduction kernel of the CUDA SDK reduction sample. Robert Crovella has already spot two mistakes that you have made in your code. Besides them, I think you are also mistakenly initializing the shared memory.
Below, please find a complete working example constructed around your attempt. I have left the wrong instructions of your approach.
#include <thrust\device_vector.h>
#define BLOCKSIZE 256
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
int tid = threadIdx.x; // Local thread index
int myId = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
extern __shared__ float temp[];
// --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;
// --- Your solution
// if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Your solution
//for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
// if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
// __syncthreads();
//}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10;
thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;
int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
int NumBlocks = (N + NumThreads - 1) / NumThreads;
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);
// --- reduce2
thrust::device_vector<float> d_vec_block(NumBlocks);
reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
thrust::host_vector<float> h_vec_block(d_vec_block);
float result_reduce0 = -FLT_MAX;
for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
printf("Result = %f\n",result_reduce0);
}

Void pointer casted to some other pointer type T does not behave like T

I'm trying to make a generic circular buffer using a void pointer buffer and I get some issues I don't understand.
If I use a double pointer for my buffer, my circular buffer behaves as expected (see test1 below), but if I use a void pointer, I get a wrong behaviour (see test2), although all the void pointers are casted to double pointers. What's wrong with my casting?
One more question: in circular_buffer_write_chunk I'm using a double pointer to pass the data. I need to pass it as with a void pointer in order to make it generic. I would then have to cast it dynamically in the function to the type of the buffer (which could be double or int) in order to get the pointer arithmetic working. How can I do that? How can I get the type of the pointer to the buffer and then cast my data pointer to that type?
Any comment or suggestion welcome.
test1 Running circular_buffer_test() with buffer as double pointer
*** circular_buffer test***
capacity: 12
Write 5 values
cb[0]=0.000000
cb[1]=1.000000
cb[2]=2.000000
cb[3]=3.000000
cb[4]=4.000000
cb[5]=0.000000
cb[6]=0.000000
cb[7]=0.000000
cb[8]=0.000000
cb[9]=0.000000
cb[10]=0.000000
cb[11]=0.000000
Write 10 values
cb[0]=12.000000
cb[1]=13.000000
cb[2]=14.000000
cb[3]=3.000000
cb[4]=4.000000
cb[5]=5.000000
cb[6]=6.000000
cb[7]=7.000000
cb[8]=8.000000
cb[9]=9.000000
cb[10]=10.000000
cb[11]=11.000000
Test done
test2 Running circular_buffer_test() with buffer as void pointer. Why is that different?
*** circular_buffer test***
capacity: 12
Write 5 values
cb[0]=0.000000
cb[1]=1.000000
cb[2]=2.000000
cb[3]=3.000000
cb[4]=4.000000
cb[5]=0.000000
cb[6]=0.000000
cb[7]=0.000000
cb[8]=0.000000
cb[9]=0.000000
cb[10]=0.000000
cb[11]=0.000000
Write 10 values
cb[0]=12.000000
cb[1]=13.000000
cb[2]=14.000000
cb[3]=0.000000 // ?
cb[4]=0.000000 // ?
cb[5]=0.000000 // ?
cb[6]=0.000000 // ?
cb[7]=0.000000 // ?
cb[8]=0.000000 // ?
cb[9]=0.000000 // ?
cb[10]=0.000000 // ?
cb[11]=0.000000 // ?
Test done
circular_buffer.c
#include "circular_buffer.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
typedef struct
{
double *buffer;
double *buffer_end;
unsigned int capacity;
double *read_ptr;
double *write_ptr;
unsigned int count;
} circular_buffer;
/*
// Why doesn't this behave like with double pointer, since it's casted in circular_buffer_init_double?
typedef struct
{
void *buffer;
void *buffer_end;
unsigned int capacity;
void *read_ptr;
void *write_ptr;
unsigned int count;
} circular_buffer;
*/
void circular_buffer_init_double(circular_buffer *cb, unsigned int capacity)
{
cb->buffer = (double*) malloc(sizeof(double) * capacity);
cb->buffer_end = (double*) (cb->buffer + capacity);
cb->capacity = capacity;
cb->read_ptr = (double*) (cb->buffer);
cb->write_ptr = (double*) (cb->buffer);
cb->count = 0;
}
void circular_buffer_destroy(circular_buffer *cb){
free(cb->buffer);
cb->buffer = NULL;
}
int circular_buffer_is_full(circular_buffer *cb) {
return cb->count == cb->capacity;
}
int circular_buffer_is_empty(circular_buffer *cb) {
return cb->count == 0;
}
unsigned int circular_buffer_write_chunk(circular_buffer* cb, double *data, unsigned int type_size,
unsigned int num_elements){
if( num_elements > cb->capacity ) return 0;
if( cb->write_ptr + num_elements > cb->buffer_end ){
unsigned int write_ptr_to_buffer_end = cb->buffer_end - cb->write_ptr;
unsigned int write_ptr_position = cb->write_ptr - cb->buffer;
memcpy( cb->write_ptr, data, write_ptr_to_buffer_end * type_size );
memcpy( cb->buffer, data + write_ptr_to_buffer_end,
( num_elements - write_ptr_to_buffer_end) * type_size );
cb->write_ptr = cb->buffer + num_elements - write_ptr_to_buffer_end;
}
else{
memcpy( cb->write_ptr, data, type_size * num_elements );
cb->write_ptr += num_elements;
if( cb->write_ptr == cb->buffer_end ){
cb->write_ptr = cb->buffer;
}
}
if( circular_buffer_is_full(cb) ) cb->read_ptr = cb->write_ptr;
cb->count = cb->count + num_elements > cb->capacity ? cb->capacity : cb->count + num_elements;
return num_elements;
}
void circular_buffer_test(){
fprintf(stdout, "*** circular_buffer test***\n");
circular_buffer *cb = malloc(sizeof(circular_buffer));
circular_buffer_init_double(cb, 12);
fprintf(stdout, "capacity: %d\n", cb->capacity);
int i;
double *w1 = malloc(sizeof(double) * 5);
for(i=0; i<5; i++) w1[i] = (double) i;
fprintf(stdout, "\nWrite 5 values\n");
circular_buffer_write_chunk(cb, w1, sizeof(double), 5);
for(i=0; i<12; i++) fprintf(stdout, "cb[%d]=%f\n", i, ((double *)cb->buffer)[i]);
fprintf(stdout, "\nWrite 10 values\n");
double *w2 = malloc(sizeof(double) * 10);
for(i=5; i<15; i++) w2[i-5] = (double) i;
circular_buffer_write_chunk(cb, w2, sizeof(double), 10);
for(i=0; i<12; i++) fprintf(stdout, "cb[%d]=%f\n", i, ((double *)cb->buffer)[i]);
free(w1);
free(w2);
circular_buffer_destroy(cb);
fprintf(stdout, "Test done\n");
}
Pointer arithmetic with void * is not allowed in C:
(double*) (cb->buffer + capacity);
gcc adds pointer arithmetic for void * as an extension by considering the size of void to be 1.
Whether buffer is void * with gcc or double * the result of cb->buffer + capacity will be different. Use:
((double*) cb->buffer + capacity);
if buffer is void *.

Dereferencing pointer in CUDA C

I am a novice C programmer and was a bit confused about this segmentation fault. I have worked with pointers before and this doesn't make sense. This code is being done on an NVIDIA GPU but I am not using any of the CUDA API functions yet (commented them out to isolate the error).
I get the error when de-referencing the pointer *mu on the GPU (see code below) in the function calibrate. That is, the error is a segmentation fault.
My host code is:
/******************************************************************************
*cr
*cr
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "kernel.cu"
#include "support.h"
int main (int argc, char *argv[])
{
Timer timer;
cudaError_t cuda_ret;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem...\n"); fflush(stdout);
startTime(&timer);
double* A_h, *T_h, *Delta_h, *E_h, *p_h, *p2_h, *D_h, *Times_h, *ones_h;
double* A_d, *T_d, *Delta_d, *E_d, *p_d, *p2_d, *D_d, *Times_d, *ones_d, *temp_1, *temp_2;
double* mu_h, *alpha_h, *omega_h;
double* mu_d, *alpha_d, *omega_d;
int N;
unsigned int mat_size, vec_size;
// Import data
FILE *fp;
char str[60];
unsigned int count=0;
double d;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
++count;
// Stick with a limited subset of the data for now
N = 2000;
fclose(fp);
printf("Count is %u \n",count);
mat_size = N*N;
vec_size = N;
dim3 dim_grid, dim_block;
// Fill matrices with 0's
A_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { A_h[i] = 0; }
T_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { T_h[i] = 0; }
Delta_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { Delta_h[i] = 0; }
E_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { E_h[i] = 0; }
p_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { p_h[i] = 0; }
// Fill vectors with 0's, except the 1's vector
p2_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { p2_h[i] = 0; }
Times_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { Times_h[i] = 0; }
D_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { D_h[i] = 0; }
ones_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { ones_h[i] = 0; }
// Start constants as zero
mu_h = (double*) malloc( sizeof(double));
alpha_h = (double*) malloc( sizeof(double));
omega_h = (double*) malloc( sizeof(double));
*mu_h = 0;
*alpha_h = 0;
*omega_h = 0;
// Import data
count=0;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
{
sscanf(str, "%lf", &d);
if(count < vec_size)
Times_h[count] = d;
++count;
}
fclose(fp);
/*printf("TIMES VECTOR: \n");
for (unsigned int i=0; i < vec_size; ++i)
{
printf("TIMES_H[ %u ] is ",i);
printf("%f \n", Times_h[i]);
}*/
printf("Count is %u \n",count);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cudaMalloc((void**) &A_d, mat_size*sizeof(double)); // Create device variable for matrix A
cudaMalloc((void**) &T_d, mat_size*sizeof(double)); // Create device variable for matrix T
cudaMalloc((void**) &Delta_d, mat_size*sizeof(double)); // Create device variable for matrix Delta
cudaMalloc((void**) &E_d, mat_size*sizeof(double)); // Create device variable for matrix E
cudaMalloc((void**) &p_d, mat_size*sizeof(double)); // Create device variable for matrix p
cudaMalloc((void**) &p2_d, vec_size*sizeof(double)); // Create device variable for vector p2
cudaMalloc((void**) &D_d, vec_size*sizeof(double)); // Create device variable for vector D
cudaMalloc((void**) &Times_d, vec_size*sizeof(double)); // Create device variable for vector Times
cudaMalloc((void**) &ones_d, vec_size*sizeof(double)); // Create device variable for vector ones
cudaMalloc((void**) &mu_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_d, sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_1, vec_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_2, mat_size*sizeof(double)); // Create device variable for constant omega
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cudaMemcpy(A_d,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(T_d,T_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Delta_d,Delta_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(E_d,E_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p_d,p_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p2_d,p2_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(D_d,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ones_d,ones_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(mu_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_1,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_2,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel using standard sgemm interface ---------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
int MAX_ITER = 100;
double TOL = .001;
calibrate(vec_size,mu_d, alpha_d, omega_d, A_d, T_d, Delta_d, E_d, p_d, p2_d, D_d, ones_d, Times_d,
MAX_ITER, TOL, temp_1, temp_2);
//tiledSgemm('N', 'N', matArow, matBcol, matBrow, 1.0f, \
// A_d, matArow, B_d, matBrow, 0.0f, C_d, matBrow); // A1_d, B1_d);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host...\n"); fflush(stdout);
startTime(&timer);
cudaMemcpy(mu_h,mu_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(alpha_h,alpha_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(omega_h,omega_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
printf("mu is %f: \n",mu_h);
printf("alpha is %f: \n",alpha_h);
printf("omega is %f: \n",omega_h);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Free memory ------------------------------------------------------------
free(A_h);
free(T_h);
free(Delta_h);
free(E_h);
free(p_h);
free(p2_h);
free(D_h);
free(ones_h);
free(Times_h);
free(mu_h);
free(alpha_h);
free(omega_h);
cudaFree(A_d);
cudaFree(T_d);
cudaFree(Delta_d);
cudaFree(E_d);
cudaFree(p_d);
cudaFree(p2_d);
cudaFree(D_d);
cudaFree(ones_d);
cudaFree(Times_d);
cudaFree(mu_d);
cudaFree(alpha_d);
cudaFree(omega_d);
return 0;
}
The Kernel code on the GPU is:
/*****************************************************************************************/
#include <stdio.h>
#define TILE_SIZE 16
#define BLOCK_SIZE 512
__global__ void mysgemm(int m, int n, int k, const double *A, const double *B, double* C) {
__shared__ float ds_A[TILE_SIZE][TILE_SIZE];
__shared__ float ds_B[TILE_SIZE][TILE_SIZE];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = (by*TILE_SIZE+ty);//%m;
int col = (bx*TILE_SIZE+tx);//%n;
float pvalue = 0;
for(int i=0;i<(k-1)/TILE_SIZE+1;++i)
{
if((i*TILE_SIZE +tx < k) && (row < m))
ds_A[ty][tx] = A[row*k+i*TILE_SIZE+tx];
else ds_A[ty][tx] = 0;
if((i*TILE_SIZE+ty < k) && (col < n))
ds_B[ty][tx] = B[(i*TILE_SIZE+ty)*n+col]; // Load data into shared memory
else ds_B[ty][tx] = 0;
__syncthreads();
if(row < m && col < n)
{
for(int j=0;j<TILE_SIZE;++j)
{
//if(j < k)
pvalue += ds_A[ty][j]*ds_B[j][tx];
}
}
__syncthreads();
}
if(row < m && col < n)
C[row*n+col] = pvalue;
}
// Kernel to multiply each element in A by the corresponding element in B and store
// the result to the corresponding element in C. All vectors should be of length m
__global__ void elem_mul(int m, const double *A, const double *B, double* C)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
C[i] = A[i]*B[i];
}
// Kernel for parallel sum
__global__ void reduction(double *out, double *in, unsigned size)
{
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int t = threadIdx.x;
unsigned int start = 2*blockIdx.x*blockDim.x;
if(start + t >= size)
partialSum[t] = 0;
else partialSum[t] = in[start+t];
if(start + blockDim.x+t>= size)
partialSum[blockDim.x+t] = 0;
else partialSum[blockDim.x+t] = in[start + blockDim.x+t];
for(unsigned int stride = 1; stride <=blockDim.x; stride*=2)
{
__syncthreads();
if(t % stride ==0)
partialSum[2*t]+=partialSum[2*t+stride];
}
__syncthreads();
out[blockIdx.x] = partialSum[0];
}
// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
dim3 dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
reduction<<<dimGrid,dimBlock>>>(out,temp,m);
}
// Kernel to multiply each element in the matrix out in the following manner:
// out(i,j) = in(i) - in(j)
__global__ void fill(int m, const double *in, double *out)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = tx+bx*blockDim.x;
int j = ty+by*blockDim.y;
if((i < m) && (j < m))
out[i*m+j] = in[i]-in[j];
}
// Kernel to fill the matrix out with the formula out(i,j) = exp(-omega*T(i.j))
__global__ void fill_E(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = exp(-coeff * in[i]);
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_mul(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff * in[i];
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_add(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff + in[i];
}
// Kernel to update vector p2
__global__ void update_p2(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = coeff/in[i];
}
// Kernel to update matrix p
__global__ void update_p(int m, double* p2, double *denom, double *num, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
// loop through columns j
for(int j=0; j<m; ++j)
{
if(i == j)
out[i*m + j] = p2[i];
else if(i < m)
out[i*m + j] = num[i*m+j]/denom[i];
}
}
/*****************************************************************************************/
// int size: length of the Time-series vectors. Also the number of rows and columns in input matrices
// double mu: One of three parameters calibrated
// double alpha: One of three parameters calibrated
// double omega: One of three parameters calibrated
// double* A: A matrix filled out and used to calibrate
// double* T: A distance matrix T(i,j) = Times[i]-Times[j]
// double* Delta: A dissimilarity matrix Delta(i,j) = 1 if i > j, 0 otherwise
// double* E: A matrix filled out and used to calibrate--E(i,j) = exp(-omega*T(i,j))
// double* p: A probability matrix of cross excitations
// double* p2: A vector of self-excitation probabilities
// double* ones: A (size x 1) vector of 1's used in inner products and identity transformations
// double* Times: A (size x 1) vector of time series data to be calibrated
// int MAX_ITER: The maximum number of iterations allowed in the calibration
// double* TOL: The error tolerance or accuracy allowed in the calibration
// double* temp_1: A (size x 1) temporary vector used in intermediate calculations
// double* temp_2: A temporary matrix used in intermediate calculations
/*****************************************************************************************/
void calibrate(int size, double *mu, double *alpha, double *omega, double *A, double *T, double *Delta, double *E, double *p, double *p2, double *D, double* ones, double *Times, int MAX_ITER, double TOL, double* temp_1, double* temp_2)
{
//1) (a) Perform inner product to start initial values of mu, alpha, and omega
*mu = .11; // ERROR IS HERE!!
/*
inner_product(mu, size, Times, ones, temp_1);
double a = *(mu);
a = a/size;
*mu = .11;
/*
/size;
*alpha = *mu;
*omega = *mu;
double mu_t = 0;
double alpha_t = 0;
double omega_t = 0;
double err = 0;
int ctr = 0;
//1) (b) Fill out matrix T of time differences
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill<<<dimGrid,dimBlock>>>(size, Times, T);
while(ctr < MAX_ITER && err < TOL)
{
// 2) Fill out matrix E
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill_E<<<dimGrid,dimBlock>>>(size, omega, T, E);
// 3) Update matrix A
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
scal_mult<<<dimGrid,dimBlock>>>(size,size, alpha, delta, A);
scal_mult<<<dimGrid,dimBlock>>>(size,size, omega, A, A);
dim3 dimGrid((n-1)/TILE_SIZE+1,(m-1)/TILE_SIZE+1,1);
dim3 dimBlock(TILE_SIZE,TILE_SIZE,1);
mysgemm<<<dimGrid,dimBlock>>>(size,size,size,A,E,A)
// 4) Update matrix D
mysgemm<<<dimGrid,dimBlock>>>(size,size,1,A,ones,D);
scal_add<<<dimGrid,dimBlock>>>(size,size, mu, D, D);
// 5) Update matrix p and vector p2
update_p2<<<dimGrid,dimBlock>>>(size,mu, D, p2);
update_p<<<dimGrid,dimBlock>>>(size,p2, D, A, p);
// 6) Update parameters mu, alpha, omega
inner_product(mu_t, size, p2, ones, temp_1);
mu_t /=Times[size-1];
reduction<<<dimGrid,dimBlock>>>(alpha_t,p,size*size);
alpha_t/= size;
// Treat T and p as very long vectors and calculate the inner product
inner_product(omega_t, size*size, T, p, temp_2);
omega_t = alpha_t/omega_t;
// 7) Update error
ctr++;
err = (mu - mu_t)*(mu - mu_t) + (alpha-alpha_t)*(alpha-alpha_t) + (omega-omega_t)*(omega-omega_t);
mu = mu_t;
alpha = alpha_t;
omega = omega_t;
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("CUDA error: %s\n",cudaGetErrorString(error));
exit(-1);
}
}
*/
}
However, I think 99% of this code isn't relevant to the issue (I use nothing from "support.h" at the moment. Basically, I get an error de-referencing the pointer on the GPU, even though it is presumably not null. Thanks!
If you do proper cuda error checking you'll discover another problem with your code, this line:
cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);
should be something like this:
cudaMemcpy(Times_d,Times_h,vec_size*sizeof(double), cudaMemcpyHostToDevice);
However that's not the crux of the issue. It took me a while to figure out that you are not making any kernel calls. If you call a kernel, all the parameters you pass to that kernel must be accessible by the device. So if you pass a pointer, the pointer must point to device memory. You are doing this with mu_d which is a device pointer:
calibrate(vec_size,mu_d,...
But your calibrate is not a kernel!!
It's an ordinary host function running on the host (CPU). So when you try and dereference the device pointer mu_d in host code:
*mu = .11; // ERROR IS HERE!!
You get a seg fault. I'm not sure why you're trying to debug this way, but simply converting kernel calls to host routines, while leaving all the parameters the same, is not a valid way to debug.
Fundamental CUDA rules (ignoring cuda 6 Unified Memory):
you cannot dereference a host pointer in device code
you cannot dereference a device pointer in host code
Your code is a violation of the 2nd rule above.

Resources