CUDA constant memory use [duplicate] - c

I have the following code to copy from a host variable to a __constant__ variable in CUDA
/*
 * Colour-quantisation driver.
 *
 * Usage: <program> <input> <output> <nColors>
 *
 * Reads an RGB image in text form, iteratively groups its pixels into
 * <nColors> sets (steps 1-3), builds the reduced-colour image on the device
 * (step 4) and writes that image to <output>.
 *
 * Returns 1 on a usage error, FAILURE when reading/writing fails or
 * <nColors> is not positive, and SUCCESS otherwise.
 *
 * NOTE(review): the cudaMemcpyToSymbol calls below only reach the kernels if
 * c_rows/c_cols/c_numColors/d_change are the same symbols the kernels were
 * compiled against (same translation unit, or relocatable device code) --
 * confirm the build setup.
 */
int main(int argc, char **argv){
    if (argc < 4) {
        std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
        return 1;
    }

    // Both timers have to exist before the first Start() call below.
    CpuTimer timer1;
    GpuTimer timer2;

    Color *h_input = NULL;
    int h_rows = 0, h_cols = 0;
    timer1.Start();
    int exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
    timer1.Stop();
    std::cout << "Reading: " << timer1.Elapsed() << std::endl;
    if (exit_code != SUCCESS){
        std::cout << "Error trying to read file." << std::endl;
        return FAILURE;
    }

    // The kernels divide by the colour count, so it must be strictly positive.
    int h_numColors = atoi(argv[3]);
    if (h_numColors <= 0) {
        std::cout << "<nColors> must be a positive integer." << std::endl;
        delete[] h_input;
        return FAILURE;
    }

    float timeStep2 = 0, timeStep3 = 0;
    int h_change = 0;
    int *h_pixelGroup = new int[h_rows*h_cols];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_output = new Color[h_rows*h_cols];
    Color *d_input;
    int *d_pixelGroup;
    Color *d_groupRep;
    Color *d_output;
    dim3 block(B_WIDTH, B_HEIGHT);
    dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);
    checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
    checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*h_rows*h_cols));

    // STEP 1
    //Evenly distribute all pixels of the image onto the color set
    timer2.Start();
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
    checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*h_rows*h_cols, cudaMemcpyHostToDevice));
    clut_distributePixels<<<grid, block>>>(d_pixelGroup);
    checkCudaError(cudaGetLastError()); // launch-configuration errors are only visible here
    checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;
    // Debug sample of the initial assignment; skipped when the image has fewer than 9 pixels.
    if (h_rows*h_cols > 8) {
        std::cout << h_pixelGroup[0] << ","
                  << h_pixelGroup[3] << ","
                  << h_pixelGroup[4] << ","
                  << h_pixelGroup[7] << ","
                  << h_pixelGroup[8] << std::endl;
    }

    //Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
    do {
        // STEP 2
        //Set the representative value to the average colour of all pixels in the same set
        timer1.Start();
        for (int ng = 0; ng < h_numColors; ng++) {
            int r = 0, g = 0, b = 0;
            int elem = 0;
            for (int i = 0; i < h_rows; i++) {
                for (int j = 0; j < h_cols; j++) {
                    if (h_pixelGroup[i*h_cols+j] == ng) {
                        r += h_input[i*h_cols+j].r;
                        g += h_input[i*h_cols+j].g;
                        b += h_input[i*h_cols+j].b;
                        elem++;
                    }
                }
            }
            if (elem == 0) {
                // Empty set: fall back to white as its representative.
                h_groupRep[ng].r = 255;
                h_groupRep[ng].g = 255;
                h_groupRep[ng].b = 255;
            }else{
                h_groupRep[ng].r = r/elem;
                h_groupRep[ng].g = g/elem;
                h_groupRep[ng].b = b/elem;
            }
        }
        timer1.Stop();
        timeStep2 += timer1.Elapsed();

        // STEP 3
        //For each pixel in the image, compute Euclidean's distance to each representative
        //and assign it to the set which is closest
        h_change = 0;
        timer2.Start();
        checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
        checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));
        clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
        checkCudaError(cudaGetLastError());
        checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
        checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
        timer2.Stop();
        timeStep3 += timer2.Elapsed();
        std::cout << "Chunche" << std::endl;
    } while (h_change == 1);
    std::cout << "Phase 2: " << timeStep2 << std::endl;
    std::cout << "Phase 3: " << timeStep3 << std::endl;

    // STEP 4
    //Create the new image with the resulting color lookup table
    timer2.Start();
    clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;
    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_pixelGroup));
    checkCudaError(cudaFree(d_groupRep));
    checkCudaError(cudaFree(d_output));

    // Write the quantised image (h_output), not the untouched input.
    timer1.Start();
    exit_code = writeRGB2Text(argv[2], h_output, h_rows, h_cols);
    timer1.Stop();
    std::cout << "Writing: " << timer1.Elapsed() << std::endl;

    delete[] h_input; // allocated with new[] by readText2RGB
    delete[] h_pixelGroup;
    delete[] h_groupRep;
    delete[] h_output;
    if (exit_code != SUCCESS){
        std::cout << "Error trying to write file." << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}
when I print from within the kernel I get zeros for the three values
// Step 1 kernel: writes an initial group index into pixelGroup for every pixel.
// Launched on a 2D grid; each thread starts at the (row, column) given by its
// block/thread indices and then advances by the full extent of the grid in
// each dimension until it leaves the c_rows x c_cols image.
// The thread at (0, 0) also prints the three constants for debugging.
__global__
void clut_distributePixels(int *pixelGroup){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    if (firstRow == 0 && firstCol == 0) {
        printf("a: %d\n", c_rows);
        printf("b: %d\n", c_cols);
        printf("c: %d\n", c_numColors);
    }
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int flat = row * c_cols + col;
            pixelGroup[flat] = flat / c_numColors;
        }
    }
}
Either I am not copying correctly to constant memory or ... I don't know what could be wrong. Any advice?
I posted the entire host code, since something else is probably interfering with the constant copies.
UPDATE
Main.cu
#include "Imageproc.cuh"
// Minimal reproduction: copies 512 into c_rows, lets the chunche kernel store
// c_rows + 1 in d_change, and prints the value read back from the device.
// Every runtime call and the launch itself are checked so a failure is
// reported instead of silently printing a stale value.
int main(){
    int h_change = 0;
    int h_rows = 512;
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    chunche<<<1,1>>>();
    checkCudaError(cudaGetLastError()); // a kernel launch does not return an error itself
    checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
    std::cout << "H = " << h_change << std::endl;
    return 0;
}
Imageproc.cuh
// Imageproc.cuh: shared declarations for the colour-lookup-table kernels.
#ifndef _IMAGEPROC_CUH_
#define _IMAGEPROC_CUH_
#include "Utilities.cuh"
// Thread-block width and height used by the host code for its 2D launches.
#define B_WIDTH 16
#define B_HEIGHT 16
// Image dimensions and palette size read by the kernels.
// NOTE(review): these lines are definitions, not declarations, so every .cu
// file that includes this header gets its own copy unless the project is
// built with separate compilation (relocatable device code) -- confirm the
// build flags. A cudaMemcpyToSymbol issued from one .cu file would then not
// be seen by kernels compiled in another.
__constant__ int c_rows;
__constant__ int c_cols;
__constant__ int c_numColors;
// Device-side flag/value written by the kernels and read back by the host.
__device__ int d_change;
#ifdef __cplusplus
extern "C"
{
#endif
// Debug kernel: stores c_rows + 1 in d_change.
__global__
void chunche();
// Step 1: writes an initial group index for every pixel.
__global__
void clut_distributePixels(int *pixelGroup);
// Step 3: reassigns every pixel to the representative with the smallest squared distance.
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep);
// Step 4: builds the output image from each pixel's group representative.
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep);
#ifdef __cplusplus
}
#endif
#endif
Imageproc.cu
#include "Imageproc.cuh"
// Debug kernel: publishes c_rows + 1 through d_change so the host can check
// which value of c_rows the device actually sees.
__global__
void chunche(){
    const int rowsSeenByDevice = c_rows;
    d_change = rowsSeenByDevice + 1;
}
// Step 1 kernel: writes an initial group index into pixelGroup for every pixel.
// Launched on a 2D grid; each thread starts at the (row, column) given by its
// block/thread indices and then advances by the full extent of the grid in
// each dimension until it leaves the c_rows x c_cols image.
__global__
void clut_distributePixels(int *pixelGroup){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int flat = row * c_cols + col;
            pixelGroup[flat] = flat / c_numColors;
        }
    }
}
// Step 3 kernel: for every pixel, finds the representative colour with the
// smallest squared RGB distance (ties keep the lower group index) and stores
// that group in pixelGroup. d_change is set to 1 whenever a pixel moves to a
// different group.
// Launched on a 2D grid; each thread walks the image with a stride equal to
// the grid extent in each dimension.
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int flat = row * c_cols + col;
            const Color px = input[flat];

            // Start with group 0 as the current best candidate.
            int closest = 0;
            int dr = groupRep[0].r - px.r;
            int dg = groupRep[0].g - px.g;
            int db = groupRep[0].b - px.b;
            int closestDist = dr*dr + dg*dg + db*db;

            for (int candidate = 1; candidate < c_numColors; candidate++) {
                dr = groupRep[candidate].r - px.r;
                dg = groupRep[candidate].g - px.g;
                db = groupRep[candidate].b - px.b;
                const int dist = dr*dr + dg*dg + db*db;
                // Strictly closer candidates replace the current best.
                if (dist < closestDist) {
                    closest = candidate;
                    closestDist = dist;
                }
            }

            if (pixelGroup[flat] != closest) {
                pixelGroup[flat] = closest;
                d_change = 1;
            }
        }
    }
}
// Step 4 kernel: writes, for every pixel, the r/g/b values of the
// representative colour of the group that pixel belongs to.
// Launched on a 2D grid; each thread walks the image with a stride equal to
// the grid extent in each dimension.
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int flat = row * c_cols + col;
            const Color &rep = groupRep[pixelGroup[flat]];
            Color &out = clutImage[flat];
            out.r = rep.r;
            out.g = rep.g;
            out.b = rep.b;
        }
    }
}
Utilities.cuh
#ifndef _UTILITIES_CUH_
#define _UTILITIES_CUH_
#include <iostream>
#include <fstream>
#include <string>
#define SUCCESS 1
#define FAILURE 0
#define checkCudaError(val) check( (val), #val, __FILE__, __LINE__)
// RGB triple; each channel is stored as an int.
typedef struct {
int r;
int g;
int b;
} vec3u;
// Name used by the image routines for one RGB pixel.
typedef vec3u Color;
typedef unsigned char uchar;
// A grayscale pixel is a single unsigned byte.
typedef uchar Grayscale;
// Timer built on a pair of CUDA events recorded on stream 0.
// Usage: Start(), run the GPU work, Stop(), then Elapsed() for milliseconds.
// The events are created in the constructor and destroyed in the destructor;
// copying a GpuTimer would make both copies destroy the same events, so keep
// a single instance.
struct GpuTimer{
    cudaEvent_t start;
    cudaEvent_t stop;
    GpuTimer(){
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GpuTimer(){
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    // Records the start event on stream 0.
    void Start(){
        cudaEventRecord(start, 0);
    }
    // Records the stop event on stream 0.
    void Stop(){
        cudaEventRecord(stop, 0);
    }
    // Waits for the stop event and returns the time between the two events
    // in milliseconds. Returns 0 if the elapsed time cannot be queried
    // (previously an uninitialized value was returned in that case).
    float Elapsed(){
        float elapsed = 0.0f;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};
// Backend of the checkCudaError macro: does nothing when err equals
// cudaSuccess; otherwise reports the file, line, error string and the text of
// the failing expression on stderr and terminates the process with status 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;
    }
    std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols);
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols);
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols);
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols);
// Host-side timer based on clock().
// Usage: Start(), run the work, Stop(), then Elapsed() for milliseconds.
struct CpuTimer{
    clock_t start;
    clock_t stop;
    // Stores the current clock() value as the start of the interval.
    void Start(){
        start = clock();
    }
    // Stores the current clock() value as the end of the interval.
    void Stop(){
        stop = clock();
    }
    // Returns the interval between Start() and Stop() in milliseconds.
    // The tick difference is taken in clock_t before converting to float:
    // converting `stop` alone first loses the low-order ticks once the
    // counter no longer fits in a float's mantissa.
    float Elapsed(){
        return (float)(stop - start) / CLOCKS_PER_SEC * 1000.0f;
    }
};
#endif
Utilities.cu
#include "Utilities.cuh"
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){
std::ofstream fileWriter(filename.c_str());
if (!fileWriter.is_open()) {
std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileWriter << rows << "\n";
fileWriter << cols << "\n";
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
fileWriter << (int)image[i*cols+j] << "\n";
}
}
fileWriter.close();
return SUCCESS;
}
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
std::ifstream fileReader(filename.c_str());
if (!fileReader.is_open()) {
std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileReader >> *rows;
fileReader >> *cols;
*image = new Grayscale[(*rows)*(*cols)];
int value;
for (int i = 0; i < *rows; i++) {
for (int j = 0; j < *cols; j++) {
fileReader >> value;
(*image)[i*(*cols)+j] = (Grayscale)value;
}
}
fileReader.close();
return SUCCESS;
}
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
std::ofstream fileWriter(filename.c_str());
if (!fileWriter.is_open()) {
std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileWriter << rows << "\n";
fileWriter << cols << "\n";
for (int k = 0; k < 3; k++) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
switch (k) {
case 0:
fileWriter << image[i*cols+j].r << "\n";
break;
case 1:
fileWriter << image[i*cols+j].g << "\n";
break;
case 2:
fileWriter << image[i*cols+j].b << "\n";
break;
}
}
}
}
fileWriter.close();
return SUCCESS;
}
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
std::ifstream fileReader(filename.c_str());
if (!fileReader.is_open()) {
std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileReader >> *rows;
fileReader >> *cols;
*image = new Color[(*rows)*(*cols)];
for (int k = 0; k < 3; k++) {
for (int i = 0; i < *rows; i++) {
for (int j = 0; j < *cols; j++) {
switch (k) {
case 0:
fileReader >> (*image)[i*(*cols)+j].r;
break;
case 1:
fileReader >> (*image)[i*(*cols)+j].g;
break;
case 2:
fileReader >> (*image)[i*(*cols)+j].b;
break;
}
}
}
}
fileReader.close();
return SUCCESS;
}

Constant memory has implicit local scope linkage - see the answer to this on Stack Overflow.
This means that the cudaMemcpyToSymbol call has to be in the same generated .obj file as the kernel where you want to use it.
You do your memcpy in Main.cu, but the kernel where you use your constant memory is in Imageproc.cu. So the constant values are unknown to the kernel chunche.
One option to solve your problem is to implement a wrapper. Just add a function in Imageproc.cu where you do the cudaMemcpyToSymbol, call the wrapper from Main.cu, and pass in the values you want in constant memory.

Related

Why changing this array's value, in a function using pointers, is wrong?

I was solving a revert array homework and, at some point, the code compiled, suddenly, after an (apparently unrelated) alteration, the code couldn't be compiled anymore.
This one compiles
#include <iostream>
#define MAX 100
#define fori(x, y) for(int i = x; i<y; i++)
using namespace std;
int vetor[MAX];
// Intended to swap the two ints that a and b point to.
// NOTE(review): `inverter` is declared as a pointer but never given an address,
// so writing and reading through it below is undefined behavior. This is the
// defect the surrounding discussion is about.
void inverterElemento(int* a, int* b){
int* inverter; // uninitialized: points nowhere in particular
*inverter = *a; // writes through the uninitialized pointer
*a = *b;
*b = *inverter;
}
// Fills vetor with multiples of 7, reverses it in place by swapping mirrored
// elements, and prints the result on one line.
int main(){
    for (int i = 0; i < MAX; i++){
        vetor[i] = 7*i;
    }
    // Stop before the midpoint: with `k <= MAX/2` the two middle elements
    // were swapped twice and therefore ended up in their original order.
    for (int k = 0; k < MAX/2; k++){
        // These two locals are unused; they are kept because the question
        // is about why their presence changes the program's behaviour.
        int a = vetor[k];
        int b = vetor[MAX-k-1];
        inverterElemento(&vetor[k], &vetor[MAX - k - 1]);
    }
    fori(0, MAX) cout << vetor[i] << " ";
    cout << endl;
    return 0;
}
This one gives Bus error:10 issue:
#include <iostream>
#define MAX 100
#define fori(x, y) for(int i = x; i<y; i++)
using namespace std;
int vetor[MAX];
// Intended to swap the two ints that a and b point to.
// NOTE(review): `inverter` is declared as a pointer but never given an address,
// so writing and reading through it below is undefined behavior. This is the
// defect the surrounding discussion is about.
void inverterElemento(int* a, int* b){
int* inverter; // uninitialized: points nowhere in particular
*inverter = *a; // writes through the uninitialized pointer
*a = *b;
*b = *inverter;
}
// Fills vetor with multiples of 7, reverses it in place by swapping mirrored
// elements, and prints the result on one line.
int main(){
    for (int i = 0; i < MAX; i++){
        vetor[i] = 7*i;
    }
    // Stop before the midpoint: with `k <= MAX/2` the two middle elements
    // were swapped twice and therefore ended up in their original order.
    for (int k = 0; k < MAX/2; k++){
        inverterElemento(&vetor[k], &vetor[MAX - k - 1]);
    }
    fori(0, MAX) cout << vetor[i] << " ";
    cout << endl;
    return 0;
}
Why do these two lines change the outcome?
int a = vetor[k];
int b = vetor[MAX-k-1];
In the following lines:
int* inverter;
*inverter = *a;
inverter is declared as a pointer to int but there is no initial address to which it points (the content, i.e address, is undefined).
Should be:
int inverter;
inverter = *a;
*a = *b;
*b = inverter;

Cuda program for matrix batch multiplication

I am a novice in the field of CUDA program and I am trying to repeat the function of cublasSgemmBatched, which means that I want to perform the matrix-matrix multiplication of a batch of matrices. I try to implement my idea as the following code.
#include <stdio.h>
// One thread per output element: thread tx computes the dot product of the
// dim-long row tx of array1 with the dim-long vector number tx / dim of
// array2 and stores it in result[tx]. Threads with tx >= narray1 * dim do
// nothing, so the launch must provide at least narray1 * dim threads in a
// 1D grid for every output to be written.
__global__ void BatchMulCUDA(float* array1, float* array2, int narray1, int dim, float* result)
{
    const int tx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tx >= narray1 * dim)
    {
        return;
    }
    const int rowStart = tx * dim;
    const int vecStart = (tx / dim) * dim;
    float acc = 0;
    for (int k = 0; k < dim; k++)
    {
        acc += array1[rowStart + k] * array2[vecStart + k];
    }
    result[tx] = acc;
}
// Host-side launcher for BatchMulCUDA; all pointers are forwarded to the
// kernel unchanged. Uses 1024-thread blocks and prints the launch shape.
// NOTE(review): the grid is sized from narray1 alone, while the kernel guards
// on tx < narray1 * dim, so only narray1 rounded up to a multiple of 1024
// threads are launched for narray1 * dim outputs. This is the defect the
// surrounding discussion is about.
void BatchMulGPU(float* array1, float* array2, int narray1, int dim, float* result)
{
dim3 threads(1024, 1);
dim3 grid(narray1 / 1024 + 1, 1); // sized for narray1 threads, not narray1 * dim
int threadsPerBlock = threads.x * threads.y;
int blocksPerGrid = grid.x * grid.y;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
BatchMulCUDA<<<grid, threads>>>(array1, array2, narray1, dim, result);
}
However, strangely, I found that I can get the right output before the index 19730. After the element of 19730, the output of GPU is always 0. I do not know what the problem is. The CPU version of my code and test function are as the following. Is there any hardware limitation that I do not realize?
#include "kernel.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
#include <math.h>
// Returns the current wall-clock time from gettimeofday() in seconds,
// including the microsecond part as a fraction.
double cpuSecond()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    const double seconds = (double) now.tv_sec;
    const double fraction = (double) now.tv_usec * 1e-6;
    return seconds + fraction;
}
// CPU reference: for each of the narray1 * dim outputs, result[i] is the dot
// product of the dim-long row i of array1 with the dim-long vector number
// i / dim of array2.
void BatchMulCPU(float* array1, float* array2, int narray1, int dim, float* result)
{
    const int outputs = narray1 * dim;
    for (int row = 0; row < outputs; ++row)
    {
        const float* lhs = array1 + row * dim;
        const float* rhs = array2 + (row / dim) * dim;
        float acc = 0;
        for (int k = 0; k < dim; ++k)
        {
            acc += lhs[k] * rhs[k];
        }
        result[row] = acc;
    }
}
// Aborts with a message when a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Test driver: fills the inputs with random values in [0, 10], runs the GPU
// and CPU versions, times both, and prints the summed absolute difference of
// their results.
int main(int argc, char** argv)
{
    int narray1 = 6980;
    int dim = 4;
    float* array1 = new float[narray1 * dim * dim];
    float* array2 = new float[narray1 * dim];
    float* resultGPU = new float[narray1 * dim];
    float* resultCPU = new float[narray1 * dim];
    float* d_array1;
    float* d_array2;
    float* d_result;
    for (int i = 0; i < narray1 * dim * dim; i++)
    {
        array1[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
    }
    for (int i = 0; i < narray1 * dim; i++)
    {
        array2[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
    }

    // Every runtime call is checked; previously the status was stored in a
    // variable that was never inspected.
    double iStart = cpuSecond();
    checkCuda(cudaMalloc((void**)&d_array1, narray1 * dim * dim * sizeof(float)), "cudaMalloc d_array1");
    checkCuda(cudaMalloc((void**)&d_array2, narray1 * dim * sizeof(float)), "cudaMalloc d_array2");
    checkCuda(cudaMalloc((void**)&d_result, narray1 * dim * sizeof(float)), "cudaMalloc d_result");
    checkCuda(cudaMemcpy(d_array1, array1, narray1 * dim * dim * sizeof(float), cudaMemcpyHostToDevice), "copy array1");
    checkCuda(cudaMemcpy(d_array2, array2, narray1 * dim * sizeof(float), cudaMemcpyHostToDevice), "copy array2");
    BatchMulGPU(d_array1, d_array2, narray1, dim, d_result);
    checkCuda(cudaGetLastError(), "BatchMulCUDA launch");
    // The blocking copy also surfaces errors raised while the kernel ran.
    checkCuda(cudaMemcpy(resultGPU, d_result, narray1 * dim * sizeof(float), cudaMemcpyDeviceToHost), "copy result");
    double iElaps = cpuSecond() - iStart;
    printf("Total GPU computation time is %lf \n" , iElaps);

    iStart = cpuSecond();
    BatchMulCPU(array1, array2, narray1, dim, resultCPU);
    iElaps = cpuSecond() - iStart;
    printf("Total CPU computation time is %lf \n" , iElaps);

    float error = 0;
    for (int i = 0; i < narray1 * dim; i++)
    {
        // float temp = fabsf(resultCPU[i] - resultGPU[i]);
        // if (temp > 0.5)
        // {
        // std::cout << i << std::endl;
        // }
        // fabsf keeps the difference in float; plain abs may pick the int overload.
        error += fabsf(resultCPU[i] - resultGPU[i]);
    }
    printf("Error is %f \n", error);
    // for (int i = 19730; i < 19750; i++)
    // {
    // std::cout << "GPU " << resultGPU[i] << std::endl;
    // std::cout << "CPU " << resultCPU[i] << std::endl;
    // }

    checkCuda(cudaFree(d_array1), "cudaFree d_array1");
    checkCuda(cudaFree(d_array2), "cudaFree d_array2");
    checkCuda(cudaFree(d_result), "cudaFree d_result");
    delete[] array1;
    delete[] array2;
    delete[] resultGPU;
    delete[] resultCPU;
    return 0;
}
Apart from the possibility of a WDDM TDR timeout as discussed in the comments, the code has an error.
It's evident that the kernel design expects that a total grid size (total number of threads) will be launched that is equal to or greater than the number of arrays times the side dimension:
int tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx < narray1 * dim)
i.e. narray1*dim are the needed number of threads
However the number being launched is only narray1:
dim3 threads(1024, 1);
dim3 grid(narray1 / 1024 + 1, 1);
If we change the last line above to:
dim3 grid((narray1*dim) / 1024 + 1, 1);
this code design error will be addressed.
The reason the code works correctly for small number of matrices (anything up to 256) is because of the rounding-up effect in the grid sizing to a minimum of 1024 threads, which is 256*4 (narray1 * dim).
As an aside, this code is not functionally similar to cublasSgemmBatched from what I can see. I don't recognize this code as being any matrix multiplication (matrix dot product) that I am familiar with.

Dynamic array class won't print

#include <iostream>
#include <cmath>
#include <iomanip>
#include <cstring>
#include <cstddef>
// Growable character buffer that strings are appended to.
class ArrayList
{
public:
// Creates a 2-character buffer holding a single space and a terminator.
ArrayList();
// Grows the buffer (defined out of line).
void expand();
// Appends the characters of x to the buffer (defined out of line).
void store(std::string x);
// Prints the buffer contents and the remaining free space.
void display_size();
// Streams the buffer as a C string.
friend std::ostream& operator<<(std::ostream& os, const ArrayList &arr);
// Counts the spaces in the buffer and stores that count in size.
void clean();
ArrayList(const ArrayList &arr);
// Releases the buffer.
~ArrayList();
void operator=(const ArrayList& arr);
private:
int size; // current size of the dynamic array
int max; // capacity bookkeeping; expand() sets it to twice size
int free_space; // number of unused slots in the dynamic array
char *array; // the dynamically allocated character buffer
};
// Default state: a zero-initialized 2-character buffer that is then filled
// with one space and a terminating NUL; size and max are 2, one slot is free.
ArrayList::ArrayList()
    : size(2), max(2), free_space(1), array(new char[2]())
{
    array[0] = ' ';
    array[1] = '\0';
}
// Intended to double the capacity of the buffer while keeping its contents.
// NOTE(review): the copy loop below runs in the wrong direction -- it
// overwrites the old buffer with the uninitialized contents of the new one --
// and it iterates up to the new capacity `max`. This is the defect the
// surrounding discussion is about.
void ArrayList::expand()
{
max = size + size;
char *temp = new char[max]; // contents are not initialized
for( int i = 0; i < max; i++ )
{
array[i] = temp[i]; // copies new -> old instead of old -> new
}
free_space = free_space + size;
delete [] array;
array = temp;
size = max;
}
// Intended to append the characters of x after the ones already stored,
// growing the buffer when it is too small.
// NOTE(review): free_space is reduced before any capacity check, the check
// compares the string length y (not the remaining space) against size, and an
// iteration that takes the expanding branch does not store x[i]. This
// function is part of what the surrounding discussion is about.
void ArrayList::store(std::string x)
{
int taken = max - free_space; // offset of the first unused slot
int y = x.size();
free_space = free_space - y;
for(int i = 0; i < y; i++)
{
if(y >= size)
{
while(y >= size)
{
expand();
}
}
else
{
array[i + taken] = x[i];
}
}
}
// Streams the list's buffer as a C string and returns the stream for chaining.
std::ostream& operator<<(std::ostream& os, const ArrayList &arr)
{
    return os << arr.array;
}
// Sets size to max, then prints the buffer contents and the free-space
// counter, each on its own labelled line.
void ArrayList::display_size()
{
    size = max;
    std::cout << "Array Content: " << array << std::endl
              << "Remaining size: " << free_space << std::endl;
}
// Counts the space characters among the first `size` buffer entries and
// stores that count in size.
void ArrayList::clean()
{
    int blanks = 0;
    int idx = 0;
    while (idx < size)
    {
        blanks += (array[idx] == ' ') ? 1 : 0;
        ++idx;
    }
    size = blanks;
}
// Copy constructor: duplicates the other list's counters and its text.
// The previous version sized the buffer from this object's own `size`, which
// has no value yet inside a constructor, and left size/max/free_space unset.
// The new buffer is at least as large as the source's `max` and always large
// enough for the source text plus its terminating NUL (strcpy relies on the
// source being NUL-terminated, as before).
ArrayList::ArrayList(const ArrayList &arr)
    : size(arr.size), max(arr.max), free_space(arr.free_space), array(NULL)
{
    const int length = strlen(arr.array);
    int capacity = arr.max;
    if (capacity < length + 1)
    {
        capacity = length + 1;
    }
    array = new char[capacity]();
    strcpy(array, arr.array);
}
// Releases the character buffer owned by this list.
ArrayList::~ArrayList()
{
delete [] array;
}
// Copy assignment: replaces this list's text with a copy of arr's text.
// Changes from the previous version:
//  - the buffer is regrown whenever the text plus its terminating NUL does
//    not fit in `max` slots (the old test `new_length > max` let the NUL be
//    written one past the end when new_length == max);
//  - `max` now always equals the allocated capacity, and size/free_space are
//    refreshed so later calls do not work with stale counters;
//  - self-assignment is a no-op.
// NOTE(review): free_space is set to max - new_length, matching the state
// the default constructor establishes (capacity 2, one character, one free
// slot) -- confirm this is the intended meaning of the counter.
void ArrayList::operator=(const ArrayList& arr)
{
    if (this == &arr)
    {
        return;
    }
    const int new_length = strlen(arr.array);
    if (new_length + 1 > max)
    {
        // Allocate first so the old buffer survives if new[] throws.
        char *fresh = new char[new_length + 1];
        delete [] array;
        array = fresh;
        max = new_length + 1;
    }
    for(int i = 0; i < new_length; i++)
    {
        array[i] = arr.array[i];
    }
    array[new_length] = '\0';
    size = max;
    free_space = max - new_length;
}
int main()
{
ArrayList x;
std::string y;
char ans;
x.display_size();
std::cout << "Please enter your string: ";
std::cin >> y;
x.store(y);
x.display_size();// << std::endl;
do
{
std::cout << "Please enter your string: ";
std::cin >> y;
x.store(y);
x.display_size();
std::cout << "Do you want to enter another string? (y/n) ";
std::cin >> ans;
}while(ans != 'n');
return 0;
}
My question is regarding C++ dynamic arrays. I've created a class which creates a dynamic array.
I've posted my entire code it should be runnable.
The issue stems from the use of the store and expand functions.
store takes a string and puts each character into the array, if there isn't enough space it calls expand.
expand doubles the size of the array.
Array Content:
Remaining size: 1
Please enter your string: h
Array Content: h
Remaining size: 0
Please enter your string: ello
Array Content:
Remaining size: 2
Do you want to enter another string? (y/n) n
In theory the output above should have returned "hello" however it hasn't returned anything, despite returning the 'h' earlier. I'm completely out of ideas on how to solve this.
EDIT:
I've since changed the functions according to the advice given to me:
// Revised expand(): doubles the capacity and copies the old contents into the
// new buffer.
// NOTE(review): the copy direction is now old -> new, but the loop still runs
// up to the new capacity `max`, so it reads array[i] for indices beyond the
// `size` entries that are being doubled. This is the defect the surrounding
// discussion is about.
void ArrayList::expand()
{
max = size + size;
char *temp = new char[max];
for( int i = 0; i < max; i++ ) // bound is the new capacity, not the old size
{
temp[i] = array[i];
}
free_space = free_space + size;
delete [] array;
array = temp;
size = max;
}
// Revised store(): appends the characters of x, calling expand() while the
// free-space counter is not positive.
// NOTE(review): free_space is still reduced by the whole string length before
// the loop, and an iteration that takes the expanding branch does not store
// x[i], so that character is dropped. This function is part of what the
// surrounding discussion is about.
void ArrayList::store(std::string x)
{
int taken = max - free_space; // offset of the first unused slot
int y = x.size();
free_space = free_space - y;
for(int i = 0; i < y; i++)
{
if(free_space <= 0)
{
while(free_space <= 0)
{
expand();
}
}
else
{
array[i+taken] = x[i]; //I'm cetain this didn't do anything
}
}
}
I've solved the negative number issue detailed in the comments. The only issue now is printing the number.
This "for" loop in "expand" method:
for( int i = 0; i < max; i++ )
{
array[i] = temp[i];
}
should be replaced by:
for( int i = 0; i < size; i++ )
{
temp[i] = array[i];
}

CUDA reduction to find the maximum of an array

I am doing the Udacity course on parallel programming (homework 3) and can not figure out why I can't get the maximum in the array using parallel reduction (Udacity forums yet to provide solution). I am pretty certain that I have set up the arrays properly and that the algorithm is correct. I suspect that I have a problem with memory management (accessing out of bounds, incorrect array sizes, copying to and from). Please help! I am running this in the Udacity environment, not locally. Below is the code that I am currently using. For some reason when I change the fmaxf's to fminf's it does find the minimum.
#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>
// Question's first-stage kernel: intended to leave the maximum of each
// block's slice of d_logLum in d_out[blockIdx.x]. Expects blockDim.x floats
// of dynamic shared memory.
// NOTE(review): this version has the defects the surrounding discussion is
// about -- there is no __syncthreads() after the shared-memory load, the
// reduction loop reads d_logLum (global memory) instead of combining the
// values held in temp, and out-of-range threads load d_logLum[tid] rather
// than a neutral value.
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
extern __shared__ float temp[];
if (myId < size) {
temp[tid] = d_logLum[myId];
}
else {
temp[tid] = d_logLum[tid];
}
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
if (myId < size) {
// Reads the original input, so partial results from earlier iterations are never used.
temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
} else {
temp[tid] = d_logLum[tid];
}
}
__syncthreads();
}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
// Second-stage reduction: repeatedly folds the upper half of each block's
// slice of d_in onto the lower half with fmaxf, in place in d_in, then
// thread 0 copies d_in[0] to d_out[0].
// The halving loop starts at blockDim.x / 2, so it only visits every element
// when blockDim.x is a power of two.
__global__ void reduce_max_kernel2(float *d_out, float *d_in) {
    const int lane = threadIdx.x;
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int stride = blockDim.x >> 1;
    while (stride > 0) {
        if (lane < stride) {
            const float upper = d_in[gid + stride];
            const float lower = d_in[gid];
            d_in[gid] = fmaxf(upper, lower);
        }
        // All threads reach the barrier, whether or not they took part.
        __syncthreads();
        stride >>= 1;
    }
    if (lane == 0) {
        d_out[0] = d_in[0];
    }
}
// Work-in-progress host routine: currently only runs the two-stage max
// reduction over d_logLuminance and prints the value it produces. The
// min/max outputs, the histogram and the scan into d_cdf are not implemented
// yet, so min_logLum, max_logLum, d_cdf and numBins are untouched.
//
// Launch shape: stage 1 uses 1024-thread blocks over the element count
// rounded up to a power of two; stage 2 reduces the per-block results in a
// single block with one thread per stage-1 block.
void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  unsigned int* const d_cdf,
                                  float &min_logLum,
                                  float &max_logLum,
                                  const size_t numRows,
                                  const size_t numCols,
                                  const size_t numBins)
{
    //TODO
    /*Here are the steps you need to implement
    1) find the minimum and maximum value in the input logLuminance channel
    store in min_logLum and max_logLum
    2) subtract them to find the range
    3) generate a histogram of all the values in the logLuminance channel using
    the formula: bin = (lum[i] - lumMin) / lumRange * numBins
    4) Perform an exclusive scan (prefix sum) on the histogram to get
    the cumulative distribution of luminance values (this should go in the
    incoming d_cdf pointer which already has been allocated for you) */
    const int numThreads = 1024;
    int points = numRows * numCols;

    // Round the element count up to a power of two with integer arithmetic;
    // the log/pow formulation can be off by one through floating-point rounding.
    int size = 1;
    while (size < points && size < (1 << 30)) {
        size <<= 1;
    }

    // At least one block, so small inputs do not produce an invalid
    // zero-block launch. The result is 1 or a power of two, which the
    // halving loop in the second stage relies on.
    int numBlocks = size / numThreads;
    if (numBlocks < 1) {
        numBlocks = 1;
    }

    float *d_out;
    float *d_max_out;
    checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));

    reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);
    checkCudaErrors(cudaGetLastError());      // bad launch configuration
    checkCudaErrors(cudaDeviceSynchronize()); // faults raised while the kernel ran

    // One thread per stage-1 block; fails the check below if that exceeds
    // the device's maximum block size.
    reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);
    checkCudaErrors(cudaGetLastError());

    float h_out_max;
    checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));
    printf("%f\n", h_out_max);
    checkCudaErrors(cudaFree(d_max_out));
    checkCudaErrors(cudaFree(d_out));
}
You are trying to reproduce the reduce2 reduction kernel of the CUDA SDK reduction sample. Robert Crovella has already spotted two mistakes that you have made in your code. Besides them, I think you are also initializing the shared memory incorrectly.
Below, please find a complete working example constructed around your attempt. I have left the incorrect instructions from your approach in place as comments.
#include <thrust\device_vector.h>
#define BLOCKSIZE 256
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Backend of the gpuErrchk macro: when code is not cudaSuccess, prints the
// error string with the file and line of the failing call to stderr and, if
// abort is true, waits for a key press and exits with the error code.
// `file` is const because the macro passes __FILE__, a string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
// Returns the smallest power of two that is >= x for 32-bit unsigned x.
// x == 0 yields 0, as does any x above 2^31 (the result wraps around).
unsigned int nextPow2(unsigned int x)
{
    unsigned int v = x - 1;
    // Smear the highest set bit into every lower position.
    for (unsigned int shift = 1; shift <= 16; shift <<= 1)
    {
        v |= v >> shift;
    }
    return v + 1;
}
// Block-level max reduction. Each block loads blockDim.x values of d_logLum
// into dynamic shared memory (threads past `size` load -FLT_MAX), halves the
// active range with fmaxf until one value is left, and thread 0 writes it to
// d_out[blockIdx.x].
// Launch with at least blockDim.x * sizeof(float) bytes of dynamic shared
// memory; the halving loop only covers every element when blockDim.x is a
// power of two.
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
    const int lane = threadIdx.x;                          // index inside the block
    const int gid = blockIdx.x * blockDim.x + threadIdx.x; // index into d_logLum
    extern __shared__ float temp[];

    // --- Every thread stages one element; out-of-range threads contribute the
    //     identity of max so they cannot affect the result.
    if (gid < size) {
        temp[lane] = d_logLum[gid];
    } else {
        temp[lane] = -FLT_MAX;
    }
    // --- Your solution
    // if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }

    // --- All loads must be visible before any thread starts combining.
    __syncthreads();

    // --- Tree reduction in shared memory: the lower half absorbs the upper half.
    unsigned int stride = blockDim.x / 2;
    while (stride > 0)
    {
        if (lane < stride) {
            temp[lane] = fmaxf(temp[lane], temp[lane + stride]);
        }
        // --- Finish this round for the whole block before halving again.
        __syncthreads();
        stride >>= 1;
    }
    // --- Your solution
    //for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    // if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
    // __syncthreads();
    //}

    if (lane == 0) {
        d_out[blockIdx.x] = temp[0];
    }
}
/********/
/* MAIN */
/********/
// Demo driver: builds a 10-element device vector of 3.0 with one 4.0, runs
// the block-level max reduction on it, finishes the reduction over the
// per-block results on the host, and prints the maximum.
int main()
{
    const int N = 10;
    thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;
    int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    int NumBlocks = (N + NumThreads - 1) / NumThreads;
    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    // (sized in floats, the element type the kernel actually stores)
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(float) : NumThreads * sizeof(float);
    // --- reduce2
    thrust::device_vector<float> d_vec_block(NumBlocks);
    reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);
    // A launch reports nothing by itself: check the configuration error first,
    // then synchronize to surface faults raised while the kernel ran.
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    thrust::host_vector<float> h_vec_block(d_vec_block);
    float result_reduce0 = -FLT_MAX;
    for (int i=0; i<NumBlocks; i++) result_reduce0 = fmaxf(h_vec_block[i], result_reduce0);
    printf("Result = %f\n",result_reduce0);
    return 0;
}

Decompression stops inbetween and output file filled with zeros(BLACK PIXELS)?

I am trying to apply DCT (discrete cosine transform) compression to a BMP (bitmap) file. I have a C file which I am running in Turbo C++. This does not actually compress anything yet; I was just trying to implement the DCT and IDCT. The code is as follows:
/*
the image to be compressed is a bmp with 24 bpp and
with name "college4.bmp" of dimensions 200*160 ie 25*20- 8*8 blocks
o/p is college2.dat
format: 8 bit signed integers starting rowwise from 0,0 to 8,8
the coefficients order is blue,green,red
for the block no 1 then 2 and soon
*/
#include<stdlib.h>
#include<stdio.h>
#include<math.h>
#define WIDTH 25
#define HEIGHT 20
/* BMP file header, read straight from the file with fread.
   NOTE(review): the field widths only line up with the on-disk header if
   unsigned int is 16 bits, unsigned long is 32 bits and the struct has no
   padding -- presumably true for the Turbo C++ target mentioned in the
   text; confirm before building with another compiler. */
typedef struct {
unsigned int type;
unsigned long int filesize;
unsigned int reserved1,reserved2;
unsigned long int offset; /* used as the seek position of the pixel data */
} BMPHEAD;
/* BMP info header, read immediately after BMPHEAD (same layout caveat). */
typedef struct {
unsigned long int infosize;
unsigned long int width,height;
unsigned int planes,bitsperpixel;
unsigned long int compression;
unsigned long int sizeimage;
long int xpelspermeter,ypelspermeter;
unsigned long int colorused,colorimportant;
} INFOHEAD;
/* One 4-byte colour-table entry. */
typedef struct {
char rgbquad[4];
} colortable;
BMPHEAD bmphead;
INFOHEAD infohead;
/* Current input and output files shared by the routines below. */
FILE *bmp_fp1,*bmp_fp2;
/* buf: one row of WIDTH 8x8 blocks with 3 values per cell (written in
   blue, green, red order); buf1: scratch buffer of the same shape used
   while decoding. */
int buf[WIDTH][8][8][3],buf1[WIDTH][8][8][3];
/* DCTcoeff[i][j][x][y] holds the cosine products filled in by generatedctcoeff(). */
float pi=3.14159265,DCTcoeff[8][8][8][8];
/* Fills the DCTcoeff table:
   DCTcoeff[i][j][x][y] = cos((2y+1)*pi*j/16) * cos((2x+1)*i*pi/16)
   for every i, j, x, y in 0..7. */
void generatedctcoeff() {
    int n, i, j, x, y;
    /* One pass over all 8*8*8*8 entries; the four indices are the base-8
       digits of n, most significant first. */
    for (n = 0; n < 8 * 8 * 8 * 8; n++) {
        i = n >> 9;
        j = (n >> 6) & 7;
        x = (n >> 3) & 7;
        y = n & 7;
        DCTcoeff[i][j][x][y] = cos(((2 * y + 1) * pi * j) / 16)
                * cos(((2 * x + 1) * i * pi) / 16);
    }
}
/* Forward transform of the row of blocks currently held in buf.
   For every block and every (i, j) it accumulates DCTcoeff[i][j][x][y] *
   buf[block][x][y][channel] over the 8x8 cell grid, scales the sum by
   ci*cj/4 (ci or cj is 1/sqrt(2) when its index is 0), truncates it to int
   and appends the three results -- blue, green, red -- to bmp_fp2 as text. */
void outputtofile1() {
    int blockno, i, j, x, y, ch;
    float acc[3]; /* running sums for blue, green, red */
    float c, ci, cj;
    c = 1 / (sqrt(2));
    for (blockno = 0; blockno < WIDTH; blockno++) {
        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++) {
                for (ch = 0; ch < 3; ch++)
                    acc[ch] = 0;
                for (x = 0; x < 8; x++) {
                    for (y = 0; y < 8; y++) {
                        for (ch = 0; ch < 3; ch++)
                            acc[ch] = acc[ch] + DCTcoeff[i][j][x][y]
                                    * buf[blockno][x][y][ch];
                    }
                }
                ci = (i == 0) ? c : 1.0;
                cj = (j == 0) ? c : 1.0;
                for (ch = 0; ch < 3; ch++)
                    acc[ch] = ci * cj * acc[ch] / 4;
                fprintf(bmp_fp2, "%d %d %d ", (int) acc[0], (int) acc[1],
                        (int) acc[2]);
            }
        }
    } /* end of one row of blocks */
}
void compressimage() {
int rowcount,x,y;
bmp_fp1=fopen("college4.bmp","r");
bmp_fp2=fopen("college2.dat","w");
printf("generating coefficients...\n");
generatedctcoeff();
if(bmp_fp1==NULL) {
printf("can't open");
return;
}
printf("compressing....\n");
fread(&bmphead,1,sizeof(bmphead),bmp_fp1);
fread(&infohead,1,sizeof(infohead),bmp_fp1);
fseek(bmp_fp1,bmphead.offset,SEEK_SET);
for(rowcount=0;rowcount<HEIGHT;rowcount++) {
for(y=0;y<8;y++) {
for(x=0;x<infohead.width;x++) {
buf[x/8][x%8][y][0]=(int)fgetc(bmp_fp1);
buf[x/8][x%8][y][1]=(int)fgetc(bmp_fp1);
buf[x/8][x%8][y][2]=(int)fgetc(bmp_fp1);
}
}
outputtofile1(); //output contents of buf after dct to file
}
fclose(bmp_fp1);
fclose(bmp_fp2);
}
/* Convert a reconstructed sample to an int in 0..255, truncating the fraction.
   Without the clamp, overshoot such as -1.3 or 256.2 wraps around when the
   value is narrowed to a single byte and shows up as speckles. */
static int clamp_to_byte(float sample) {
    if (sample <= 0.0f)
        return 0;
    if (sample >= 255.0f)
        return 255;
    return (int) sample;
}

/*
 * Inverse-transform the row of blocks whose coefficients are in buf and
 * append the resulting pixel bytes to bmp_fp2.
 *
 * buf1 is used as a temporary buffer for the decoded pixel values of the
 * whole row of blocks; it is then written out one pixel row at a time, blocks
 * left to right, three bytes per pixel in blue, green, red order.
 */
void outputtofile2() {
    int i, j, x, y, blockno, ch;
    float acc[3], weight, c, ci, cj;

    c = 1 / (sqrt(2));
    for (blockno = 0; blockno < WIDTH; blockno++) {
        for (x = 0; x < 8; x++) {
            for (y = 0; y < 8; y++) {
                acc[0] = acc[1] = acc[2] = 0;
                for (j = 0; j < 8; j++) {
                    cj = (j == 0) ? c : 1.0f;
                    for (i = 0; i < 8; i++) {
                        ci = (i == 0) ? c : 1.0f;
                        /* The weight is the same for all three channels. */
                        weight = ci * cj * DCTcoeff[i][j][y][x];
                        for (ch = 0; ch < 3; ch++)
                            acc[ch] = acc[ch] + weight * buf[blockno][i][j][ch];
                    }
                }
                for (ch = 0; ch < 3; ch++)
                    buf1[blockno][y][x][ch] = clamp_to_byte(acc[ch] / 4);
            }
        }
    }

    for (y = 0; y < 8; y++) {
        for (blockno = 0; blockno < WIDTH; blockno++) {
            for (x = 0; x < 8; x++) {
                for (ch = 0; ch < 3; ch++)
                    fputc(buf1[blockno][x][y][ch], bmp_fp2);
            }
        }
    }
}
/* Store a 16-bit value as two little-endian bytes. */
static void bmp_put_le16(unsigned char *p, unsigned long v) {
    p[0] = (unsigned char) (v & 0xff);
    p[1] = (unsigned char) ((v >> 8) & 0xff);
}

/* Store a 32-bit value as four little-endian bytes. */
static void bmp_put_le32(unsigned char *p, unsigned long v) {
    p[0] = (unsigned char) (v & 0xff);
    p[1] = (unsigned char) ((v >> 8) & 0xff);
    p[2] = (unsigned char) ((v >> 16) & 0xff);
    p[3] = (unsigned char) ((v >> 24) & 0xff);
}

/*
 * Read the coefficients in college2.dat and write the decoded image to
 * college3.bmp as a 24-bit BMP of (WIDTH * 8) x (HEIGHT * 8) pixels.
 *
 * Coefficients are read one row of blocks at a time into buf, and
 * outputtofile2() inverse-transforms and writes each row.
 *
 * Both files are closed before returning, on success and on every error path.
 */
void uncompressimage() {
    unsigned char hdr[54];  /* 14-byte file header followed by 40-byte info header */
    const unsigned long pixelbytes = (unsigned long) (WIDTH * 8) * (HEIGHT * 8) * 3;
    int blue, green, red, rowcount, colcount, i, j;
    int truncated = 0;

    bmp_fp1 = fopen("college2.dat", "r");
    /* The BMP is binary data: text mode may rewrite 0x0A bytes on output. */
    bmp_fp2 = fopen("college3.bmp", "wb");
    printf("generating coefficients...\n");
    generatedctcoeff();
    if (bmp_fp1 == NULL || bmp_fp2 == NULL) {
        printf("open failed");
        goto done;
    }
    printf("uncompressing....\n");

    bmphead.type = 0x4d42;
    /* Header plus pixel data: 54 + 200 * 160 * 3 = 96054 for the default size. */
    bmphead.filesize = (unsigned long) sizeof hdr + pixelbytes;
    bmphead.reserved1 = 0;
    bmphead.reserved2 = 0;
    bmphead.offset = sizeof hdr;
    infohead.infosize = 40;
    infohead.width = WIDTH * 8;
    infohead.height = HEIGHT * 8;
    infohead.planes = 1;
    infohead.bitsperpixel = 24;
    infohead.compression = 0;
    infohead.sizeimage = 0;
    infohead.xpelspermeter = 3780;
    infohead.ypelspermeter = 3780;
    infohead.colorused = 0;
    infohead.colorimportant = 0;

    /* Serialise field by field so the bytes written do not depend on how the
       compiler sizes and pads BMPHEAD / INFOHEAD. */
    bmp_put_le16(hdr + 0, bmphead.type);
    bmp_put_le32(hdr + 2, bmphead.filesize);
    bmp_put_le16(hdr + 6, bmphead.reserved1);
    bmp_put_le16(hdr + 8, bmphead.reserved2);
    bmp_put_le32(hdr + 10, bmphead.offset);
    bmp_put_le32(hdr + 14, infohead.infosize);
    bmp_put_le32(hdr + 18, infohead.width);
    bmp_put_le32(hdr + 22, infohead.height);
    bmp_put_le16(hdr + 26, infohead.planes);
    bmp_put_le16(hdr + 28, infohead.bitsperpixel);
    bmp_put_le32(hdr + 30, infohead.compression);
    bmp_put_le32(hdr + 34, infohead.sizeimage);
    bmp_put_le32(hdr + 38, (unsigned long) infohead.xpelspermeter);
    bmp_put_le32(hdr + 42, (unsigned long) infohead.ypelspermeter);
    bmp_put_le32(hdr + 46, infohead.colorused);
    bmp_put_le32(hdr + 50, infohead.colorimportant);
    if (fwrite(hdr, 1, sizeof hdr, bmp_fp2) != sizeof hdr) {
        printf("can't write bmp header\n");
        goto done;
    }

    for (rowcount = 0; rowcount < HEIGHT; rowcount++) {
        for (colcount = 0; colcount < WIDTH; colcount++) {
            for (i = 0; i < 8; i++) {
                for (j = 0; j < 8; j++) {
                    if (fscanf(bmp_fp1, "%d %d %d", &blue, &green, &red) != 3) {
                        /* Never use values fscanf did not fill in. */
                        blue = green = red = 0;
                        truncated = 1;
                    }
                    buf[colcount][i][j][0] = blue;
                    buf[colcount][i][j][1] = green;
                    buf[colcount][i][j][2] = red;
                }
            }
        }
        outputtofile2();
    }
    if (truncated)
        printf("warning: coefficient data ended early; missing values treated as 0\n");

done:
    if (bmp_fp1 != NULL)
        fclose(bmp_fp1);
    if (bmp_fp2 != NULL)
        fclose(bmp_fp2);
}
/* Run the compressor and then the decompressor, announcing each stage on stdout. */
int main() {
    fputs("opening files...\n", stdout);
    compressimage();

    fputs("opening files...again\n", stdout);
    uncompressimage();

    fputs("successful decompression\nenter any key\n", stdout);
    return 0;
}
Here is the image I am using as input
(I'm sorry, the site converted the BMP into a PNG. You may convert it back to BMP to use it.)
Here is the image that is generated:
The file college3.bmp that gets created is 200x160 and 93.8 kB in size. The coefficients are decoded correctly for the first quarter of the image, but the rest of the file is filled with black pixels. I have posted a screenshot of the output because the upload rejected the file as not a valid BMP. I have been stuck on this problem since February 2004. If anyone can tell me where the bug is, I would be very thankful. I have analysed the output file and found an EOF right at the place where the pixels start to turn black. I read some other questions on the topic and found that the conversion factors ci, cj have been used improperly. While coding I also got confused with the indices x, y, i and j. So I hope I will solve this problem in a few days.
Apparently, the problem in the above code is in how you open your files.
This is what should be in your code (note the explicitly specified open modes, binary and text):
/* Fragment: only the two fopen calls change; the rest of the body is elided. */
void compressimage() {
...
/* The BMP input is opened in binary mode. */
bmp_fp1=fopen("college4.bmp","rb");
/* The coefficient file is text. NOTE(review): the "t" mode suffix looks like a
   compiler extension rather than standard C - confirm on the target compiler. */
bmp_fp2=fopen("college2.dat","wt");
...
}
/* Fragment: only the two fopen calls change; the rest of the body is elided. */
void uncompressimage() {
...
/* The coefficient file is text. NOTE(review): the "t" mode suffix looks like a
   compiler extension rather than standard C - confirm on the target compiler. */
bmp_fp1=fopen("college2.dat","rt");
/* The BMP output is opened in binary mode. */
bmp_fp2=fopen("college3.bmp","wb");
...
}
With that and slightly altered structure definitions:
/* Pack the structures on 1-byte boundaries so no padding is inserted between
   fields; the previous packing setting is restored by the pop below.
   NOTE(review): the field widths assume a 16-bit `short` and a 32-bit `long`;
   on platforms where `long` is 64 bits the layout differs - confirm for the
   target compiler. */
#pragma pack(push,1)
/* BMP file header. */
typedef struct {
unsigned short int type;
unsigned long int filesize;
unsigned short int reserved1,reserved2;
unsigned long int offset;
} BMPHEAD;
/* BMP info header. */
typedef struct {
unsigned long int infosize;
unsigned long int width,height;
unsigned short int planes,bitsperpixel;
unsigned long int compression;
unsigned long int sizeimage;
long int xpelspermeter,ypelspermeter;
unsigned long int colorused,colorimportant;
} INFOHEAD;
/* One four-byte colour-table entry. */
typedef struct {
char rgbquad[4];
} colortable;
#pragma pack(pop)
I'm able to compile your program successfully using 3 different compilers (Turbo C++, Open Watcom, gcc) and get the desired output picture.

Resources