I'm working on an edge detection function. Looking back at my code, I think I have the concept and logic down, but the results aren't coming out the way they should.
// Running totals for the three colour channels, one int per channel.
typedef struct {
    int Red, Green, Blue;
} GTOTALS;
// Detect edges with the Sobel operator.
//
// For every pixel, the 3x3 neighbourhood taken from an untouched copy of the
// image is weighted with the Gx and Gy kernels, separately for each colour
// channel. The channel is then set to round(sqrt(Gx^2 + Gy^2)), limited to 255
// by Cap(). Neighbours outside the image are skipped, so they contribute 0.
//
// height, width: image dimensions in pixels.
// image:         pixel grid; overwritten in place with the filtered result.
void edges(int height, int width, RGBTRIPLE image[height][width])
{
    // Sobel kernels, indexed [row offset + 1][column offset + 1].
    static const int Gx[3][3] = {
        {-1, 0, 1},
        {-2, 0, 2},
        {-1, 0, 1}
    };
    static const int Gy[3][3] = {
        {-1, -2, -1},
        {0, 0, 0},
        {1, 2, 1}
    };

    // Copy the image so results written to `image` never feed back into the
    // neighbourhood of pixels processed later.
    RGBTRIPLE Copy[height][width];
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            Copy[i][j] = image[i][j];
        }
    }

    // Loop through each pixel
    for (int Rows = 0; Rows < height; Rows++)
    {
        for (int Cols = 0; Cols < width; Cols++)
        {
            // The accumulators must start at zero for every pixel: an
            // uninitialised local holds an indeterminate value, so "+=" would
            // otherwise add the weighted sum to garbage.
            GTOTALS X = {0, 0, 0};
            GTOTALS Y = {0, 0, 0};

            // Loop through surrounding pixels (R, C index the kernels)
            for (int R = 0; R < 3; R++)
            {
                int S_Rows = Rows + R - 1;
                if (S_Rows < 0 || S_Rows >= height)
                {
                    continue;
                }
                for (int C = 0; C < 3; C++)
                {
                    int S_Cols = Cols + C - 1;
                    if (S_Cols < 0 || S_Cols >= width)
                    {
                        continue;
                    }
                    RGBTRIPLE Pixel = Copy[S_Rows][S_Cols];

                    // RGB Gx total values
                    X.Red += Pixel.rgbtRed * Gx[R][C];
                    X.Green += Pixel.rgbtGreen * Gx[R][C];
                    X.Blue += Pixel.rgbtBlue * Gx[R][C];

                    // RGB Gy total values
                    Y.Red += Pixel.rgbtRed * Gy[R][C];
                    Y.Green += Pixel.rgbtGreen * Gy[R][C];
                    Y.Blue += Pixel.rgbtBlue * Gy[R][C];
                }
            }

            // Value = Square Root(Gx^2 + Gy^2)
            int CRed = round(sqrt((double) X.Red * X.Red + (double) Y.Red * Y.Red));
            int CGreen = round(sqrt((double) X.Green * X.Green + (double) Y.Green * Y.Green));
            int CBlue = round(sqrt((double) X.Blue * X.Blue + (double) Y.Blue * Y.Blue));

            // MAX 255
            Cap(&CRed);
            Cap(&CGreen);
            Cap(&CBlue);

            // Update target pixel
            image[Rows][Cols].rgbtRed = CRed;
            image[Rows][Cols].rgbtGreen = CGreen;
            image[Rows][Cols].rgbtBlue = CBlue;
        }
    }
    return;
}
// Limit *Value to at most 255; values that are already 255 or less are kept.
void Cap(int *Value)
{
    const int Limit = 255;
    *Value = (*Value > Limit) ? Limit : *Value;
}
When I run the program, most of the RGB values turn out to be 255. I've played around with different data types and with where the variables are declared, but that doesn't seem to help. I've also tried miniature versions of the code, and each piece seems to work as intended, so I'm not sure why the combined code doesn't give the correct results.
Here is a description of the Sobel filter:
// from Source to Destination
//
// Marks the boundaries found in S in the array D using a Sobel filter.
// D is first filled with iColorOfBasin1. Every interior pixel (the one-pixel
// border is skipped) is then set to 255 when both Sobel gradients are zero
// (background) and to 0 otherwise (boundary). Always returns 0.
// Give_i(iX, iY) is defined elsewhere; it is used here to turn the 2D
// coordinates into the index of the 1D arrays.
int ComputeBoundaries(unsigned char S[], unsigned char D[])
{
  unsigned int iX,iY; /* indices of 2D virtual array (image) = integer coordinate */
  unsigned int i; /* index of 1D array */
  /* sobel filter: gradients are kept in int so the signed sums are not truncated to 8 bits */
  int Gh, Gv;
  // boundaries are in D array ( global var )
  // clear D array
  memset(D, iColorOfBasin1, iSize*sizeof(*D)); // for heap-allocated arrays, where N is the number of elements = FillArrayWithColor(D , iColorOfBasin1);
  // printf(" find boundaries in S array using Sobel filter\n");
  #pragma omp parallel for schedule(dynamic) private(i,iY,iX,Gv,Gh) shared(iyMax,ixMax)
  for(iY=1;iY<iyMax-1;++iY){
    for(iX=1;iX<ixMax-1;++iX){
      /* vertical gradient: row below minus row above, weights 1 2 1 */
      Gv = S[Give_i(iX-1,iY+1)] + 2*S[Give_i(iX,iY+1)] + S[Give_i(iX+1,iY+1)]
         - S[Give_i(iX-1,iY-1)] - 2*S[Give_i(iX,iY-1)] - S[Give_i(iX+1,iY-1)];
      /* horizontal gradient: right column minus left column, weights 1 2 1 */
      Gh = S[Give_i(iX+1,iY+1)] + 2*S[Give_i(iX+1,iY)] + S[Give_i(iX+1,iY-1)]
         - S[Give_i(iX-1,iY+1)] - 2*S[Give_i(iX-1,iY)] - S[Give_i(iX-1,iY-1)];
      i = Give_i(iX,iY); /* compute index of 1D array from indices of 2D array */
      /* sqrt(Gh*Gh + Gv*Gv) is zero exactly when both gradients are zero */
      if (Gh==0 && Gv==0) {D[i]=255;} /* background */
      else {D[i]=0;} /* boundary */
    }
  }
  return 0;
}
// copy from Source to Destination
//
// For every interior pixel (the one-pixel border is skipped) whose value in S
// is 0, sets the same position in D to 0; all other entries of D are left
// unchanged. Always returns 0.
int CopyBoundaries(unsigned char S[], unsigned char D[])
{
  unsigned int iX,iY; /* indices of 2D virtual array (image) = integer coordinate */
  unsigned int i; /* index of 1D array */
  //printf("copy boundaries from S array to D array \n");
  for(iY=1;iY<iyMax-1;++iY)
  {
    for(iX=1;iX<ixMax-1;++iX)
    {
      i = Give_i(iX,iY);
      if (S[i]==0) D[i]=0;
    }
  }
  return 0;
}
Here is the image and a full program
result:
I'm working on calculation of Legendre Polynomial on GPU.
Briefly, the recursive Legendre polynomial computes the n-th order from the (n-1)-th and (n-2)-th orders: $P_n(x) = \frac{(2n-1)\,x\,P_{n-1}(x) - (n-1)\,P_{n-2}(x)}{n}$. We divide each x interval into k parts (let's say k=23), evaluate the polynomial on every part and sum the results, which is more precise.
So my kernel goes below.
First, we create a k * width array.
// Global 2D coordinates of this thread: col from the x dimension, row from the y dimension.
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
float delta = 2. / width;
// Only threads inside the d_k x width array store a sample position.
if ((col < width) && (row < d_k))
{
    const int idx = row * width + col;
    kXList[idx] = -1.f + (col * d_k + row + 1.f) * delta / (float)d_k;
}
Next come the first two orders of the recurrence: kXList_2 holds the first (the constant 1) and kXList_1 holds the second (the sample value itself).
// Seed the recurrence: kXList_1 takes the sample value, kXList_2 the constant 1.
// Guarded with the same bounds check as the kXList store so that threads
// outside the d_k x width array do not write past the end of the buffers.
if ((row < d_k) && (col < width)) {
    kXList_1[row * width + col] = kXList[row * width + col];
    kXList_2[row * width + col] = 1.f;
}
Do summation over columns and saving it into d_xLegendreP.
// The row-0 thread of each column sums the first two orders over all d_k rows
// and stores the totals in rows 0 and 1 of d_xLegendreP.
// The col < width guard keeps threads beyond the last column from reading and
// writing out of bounds.
// NOTE(review): the summed entries are written by threads of other rows; nothing
// visible here guarantees those writes have completed - confirm the ordering.
if ((row == 0) && (col < width)) {
    float row_0 = 0.f;
    float row_1 = 0.f;
    for (int h = 0; h < d_k; ++h) {
        row_0 += kXList_2[h * width + col];
        row_1 += kXList_1[h * width + col];
    }
    d_xLegendreP[0 * width + col] = row_0;
    d_xLegendreP[1 * width + col] = row_1;
}
Recursive calculation of the remaining orders.
// Per-thread state of the three-term recurrence for this (row, col) entry:
// kX_2 and kX_1 are the two previous orders, kX is the sample value.
float kX_2 = kXList_2[row * width + col];
float kX_1 = kXList_1[row * width + col];
float kX = kXList[row * width + col];
float row_n;
for (int n = 2; n <= order; n++) {
// Order n from orders n-1 and n-2, stored in kXList_temp.
kXList_temp[row * width + col] = ((2.f * n - 1.f) * kX * kX_1) / (float)n - (((n - 1.f) * kX_2) / (float)n);
if ((row == 0)) {
// The row-0 thread sums the order-n values of all d_k rows of its column.
// NOTE(review): those entries are written by other threads in this same
// iteration and no synchronization is visible here - confirm the ordering.
row_n = 0.f;
for (int h = 0; h < d_k; h++) {
row_n += kXList_temp[h * width + col];
}
d_xLegendreP[n * width + col] = row_n;
}
// Shift the window: the value just stored becomes order n-1 of the next iteration.
kX_2 = kX_1;
kX_1 = kXList_temp[row * width + col];
}
As has been pointed out, CUDA makes no statements about the order of thread execution. However you have a number of points in your calculation sequence where you expect a previous line of code has been completed in its entirety, across the entire grid, in order for the next section of your code to be correct.
Generally the nature of CUDA parallel thread execution means that such dependencies lead to incorrect/broken code.
I haven't tried to fully realize your algorithm in an optimal way, but to demonstrate the proof of this, I have broken up your kernel code in such a way that such dependencies are made "correct" through the use of the kernel-call boundary, which is effectively a global sync. This is probably one way to sort out your problem, as indicated in the comments.
Here's an example. I'm not going to try to detail each change, but by breaking it up this way I believe I have satisfied the dependencies expected using your approach. I have not fully verified anything, but a quick check suggests the output seems to match your matlab output:
$ cat t1820.cu
#include <stdio.h>
#include <math.h>
#include<iostream>
#include <stdlib.h>
#define BLOCKDIM_32 32
#define k 23
#define Mmax 40
#define IMG_SIZE 1024
static const long DEVICE = 0;
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr (error string, file and line)
// and, unless abort is false, terminates the process with the error code.
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Stage 1: each in-range thread stores its sample position in kXList, copies
// it to kXList_1 and writes the constant 1 to kXList_2.
// kXList_temp, d_xLegendreP and order are not used by this stage.
__global__ void LegendreMoment1(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const float delta = 2. / width;

    // Threads outside the d_k x width array have nothing to do.
    if ((row >= d_k) || (col >= width))
        return;

    const int idx = row * width + col;
    kXList[idx] = -1.f + (col * d_k + row + 1.f) * delta / (float)d_k;
    kXList_1[idx] = kXList[idx];
    kXList_2[idx] = 1.f;
}
// Stage 2: the row-0 thread of every column sums kXList_2 and kXList_1 over
// all d_k rows and stores the totals in rows 0 and 1 of d_xLegendreP.
// kXList, kXList_temp and order are not used by this stage.
__global__ void LegendreMoment2(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (!((row < d_k) && (col < width)))
        return;
    if (row != 0)
        return;

    float sum_order0 = 0.f;
    float sum_order1 = 0.f;
    for (int h = 0; h < d_k; ++h) {
        const int idx = h * width + col;
        sum_order0 += kXList_2[idx];
        sum_order1 += kXList_1[idx];
    }
    d_xLegendreP[0 * width + col] = sum_order0;
    d_xLegendreP[1 * width + col] = sum_order1;
}
// Stage 3: one recurrence step for order n, per (row, col) entry.
// The two previous orders are picked by n:
//   n == 2: kXList_2 and kXList_1,
//   n == 3: kXList_1 and kXList_temp,
//   n  > 3: kXList_prev and kXList_temp.
// The value used as order n-1 is saved in kXList_prev before kXList_temp is
// overwritten with order n. For n < 2 neither branch assigns the previous orders.
// d_xLegendreP and order are not used by this stage.
__global__ void LegendreMoment3(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order, int n, float *kXList_prev) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if ((row >= d_k) || (col >= width))
        return;

    const int idx = row * width + col;
    const float kX = kXList[idx];
    float kX_2, kX_1;
    if (n == 2) {
        kX_2 = kXList_2[idx];
        kX_1 = kXList_1[idx];
    } else if (n == 3) {
        kX_2 = kXList_1[idx];
        kX_1 = kXList_temp[idx];
    } else if (n > 3) {
        kX_2 = kXList_prev[idx];
        kX_1 = kXList_temp[idx];
    }
    kXList_prev[idx] = kX_1;
    kXList_temp[idx] = ((2.f * n - 1.f) * kX * kX_1) / (float)n - (((n - 1.f) * kX_2) / (float)n);
}
// Stage 4: the row-0 thread of every column sums kXList_temp over all d_k rows
// and stores the total in row n of d_xLegendreP.
// kXList, kXList_1, kXList_2 and order are not used by this stage.
__global__ void LegendreMoment4(float* kXList, float* kXList_1, float* kXList_2, float* kXList_temp,
    float* d_xLegendreP, int width, int d_k, int order, int n) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (!((row < d_k) && (col < width)))
        return;
    if (row != 0)
        return;

    float column_sum = 0.f;
    for (int h = 0; h < d_k; h++) {
        column_sum += kXList_temp[h * width + col];
    }
    d_xLegendreP[n * width + col] = column_sum;
}
// Reference values that main() compares the computed result against:
// one row per order, holding the first four columns of that order.
// Per the accompanying text these come from the asker's MATLAB output.
float matlab_result[][4] = {
{23., 23., 23., 23.},
{-22.9766, -22.9316, -22.8867, -22.8418},
{22.9297, 22.7952, 22.661, 22.527},
{-22.8596, -22.5914, -22.3245, -22.059},
{22.7663, 22.3211, 21.8799, 21.4425},
{-22.6501, -21.9856, -21.3303, -20.6839},
{22.5111, 21.5864, 20.6798, 19.7912},
{-22.3496, -21.1254, -19.9335, -18.7734},
{22.166, 20.6046, 19.0967, 17.6411},
{-21.9606, -20.0265, -18.1756, -16.4058},
{21.7339, 19.3937, 17.1772, 15.0802},
{-21.4862, -18.7091, -16.1086, -13.6777},
{21.2181, 17.9757, 14.9778, 12.2124},
{-20.9301, -17.1971, -13.7931, -10.6992},
{20.6228, 16.3766, 12.563, 9.15308},
{-20.2967, -15.5179, -11.2963, -7.5893},
{19.9525, 14.625, 10.0023, 6.02321},
{-19.5909, -13.7018, -8.69016, -4.46998},
{19.2126, 12.7524, 7.36912, 2.94447},
{-18.8183, -11.781, -6.04847, -1.46107},
{18.4087, 10.792, 4.73739, 0.0335239},
{-17.9847, -9.78953, -3.44488, 1.32519},
{17.5472, 8.77808, 2.17971, -2.60304},
{-17.0968, -7.76199, -0.950332, 3.78904},
{16.6345, 6.74559, -0.235176, -4.87336},
{-16.1611, -5.7332, 1.36917, 5.84745},
{15.6776, 4.72908, -2.44452, -6.70411},
{-15.1848, -3.73739, 3.45463, 7.43756},
{14.6836, 2.7622, -4.39351, -8.04346},
{-14.1751, -1.80747, 5.25583, 8.51902},
{13.66, 0.877003, -6.03692, -8.86292},
{-13.1395, 0.0255473, 6.73284, 9.07537},
{12.6143, -0.896704, -7.34039, -9.15805},
{-12.0855, 1.73318, 7.85712, 9.11411},
{11.554, -2.53191, -8.28135, -8.94808},
{-11.0207, 3.29003, 8.61218, 8.6658},
{10.4866, -4.00492, -8.84949, -8.27433},
{-9.95254, 4.67419, 8.99391, 7.78188},
{9.41953, -5.29574, -9.04682, -7.19767},
{-8.88843, 5.8677, 9.01035, 6.53179},
{8.36015, -6.38847, -8.88731, -5.79509}
};
#define TOL 0.0001f
// Host driver: allocates the device buffers, runs the four kernel stages
// (stages 3 and 4 once per order n = 2..Mmax, each launch acting as a global
// synchronization point), copies the result back, prints the first four
// columns of every order, compares them with matlab_result and reports the
// elapsed GPU time. Returns 0, also when a mismatch is found.
int main()
{
    float* kXList;
    float* kXList_1;
    float* kXList_2;
    float* kXList_temp;
    float* kXList_prev;
    float* d_xLegendreP;
    float* xLegendreP;
    int width = IMG_SIZE;
    cudaEvent_t d_total_begin, d_total_end;
    xLegendreP = new float[(Mmax + 1) * width];
    CUDA_CHECK_RETURN(cudaSetDevice(DEVICE));
    CUDA_CHECK_RETURN(cudaEventCreate(&d_total_begin));
    CUDA_CHECK_RETURN(cudaEventCreate(&d_total_end));
    printf("Time kernel launch...\n");
    CUDA_CHECK_RETURN(cudaEventRecord(d_total_begin, 0));
    printf("Allocating space on device...\n");
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_temp, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_prev, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_1, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&kXList_2, width * k * sizeof(float)));
    CUDA_CHECK_RETURN(
        cudaMalloc((void**)&d_xLegendreP, width * (Mmax + 1) * sizeof(float)));
    printf("Copying data from host to device...\n");
    // The kernels take the column from the x dimension and the row from the
    // y dimension, so x must cover `width` columns and y the k rows. The block
    // counts are rounded up with integer arithmetic (ceil() on an integer
    // quotient would round nothing).
    dim3 block(BLOCKDIM_32, BLOCKDIM_32, 1);
    dim3 grid((width + BLOCKDIM_32 - 1) / BLOCKDIM_32, (k + BLOCKDIM_32 - 1) / BLOCKDIM_32, 1);
    printf("Launching kernel...\n");
    LegendreMoment1 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
        d_xLegendreP, IMG_SIZE, k, Mmax);
    LegendreMoment2 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
        d_xLegendreP, IMG_SIZE, k, Mmax);
    for (int n = 2; n <= Mmax; n++) {
        LegendreMoment3 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
            d_xLegendreP, IMG_SIZE, k, Mmax, n, kXList_prev);
        LegendreMoment4 << <grid, block >> > (kXList, kXList_1, kXList_2, kXList_temp,
            d_xLegendreP, IMG_SIZE, k, Mmax, n);
    }
    // Kernel launches do not return a status; pick up any launch failure here.
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(
        cudaMemcpy(xLegendreP, d_xLegendreP, width * (Mmax + 1) * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK_RETURN(cudaEventRecord(d_total_end, 0));
    printf("\n");
    for (int n = 0; n <= Mmax; n++)
        printf("row %2d:%8.4f %8.4f %8.4f %8.4f\n", n, xLegendreP[n * width + 0],xLegendreP[n * width + 1],xLegendreP[n * width + 2],xLegendreP[n * width + 3]);
    // NOTE(review): rows 0..Mmax-1 are compared; the last printed row (Mmax) is not.
    for (int i = 0; i < Mmax; i++)
        for (int j = 0; j < 4; j++)
            if (fabsf(xLegendreP[i*width+j] - matlab_result[i][j]) > TOL) {printf("mismatch at %d, %d\n", i, j); return 0;}
    CUDA_CHECK_RETURN(cudaEventSynchronize(d_total_end));
    float gpuTime = 0.0;
    CUDA_CHECK_RETURN(cudaEventElapsedTime(&gpuTime, d_total_begin, d_total_end));
    printf(">>>Elapsed GPU Time is : %f ms\n", gpuTime);
    printf("Freeing memory on device...\n");
    CUDA_CHECK_RETURN(cudaEventDestroy(d_total_begin));
    CUDA_CHECK_RETURN(cudaEventDestroy(d_total_end));
    CUDA_CHECK_RETURN(cudaFree(kXList));
    CUDA_CHECK_RETURN(cudaFree(kXList_temp));
    CUDA_CHECK_RETURN(cudaFree(kXList_prev));
    CUDA_CHECK_RETURN(cudaFree(kXList_1));
    CUDA_CHECK_RETURN(cudaFree(kXList_2));
    CUDA_CHECK_RETURN(cudaFree(d_xLegendreP));
    delete[] xLegendreP;
    printf("Exiting program...\n");
    return 0;
}
$ nvcc -o t1820 t1820.cu
$ ./t1820
Time kernel launch...
Allocating space on device...
Copying data from host to device...
Launching kernel...
row 0: 23.0000 23.0000 23.0000 23.0000
row 1:-22.9766 -22.9316 -22.8867 -22.8418
row 2: 22.9297 22.7952 22.6610 22.5270
row 3:-22.8596 -22.5914 -22.3245 -22.0590
row 4: 22.7663 22.3211 21.8799 21.4425
row 5:-22.6501 -21.9856 -21.3303 -20.6839
row 6: 22.5111 21.5864 20.6798 19.7912
row 7:-22.3496 -21.1254 -19.9335 -18.7734
row 8: 22.1660 20.6046 19.0967 17.6411
row 9:-21.9606 -20.0265 -18.1756 -16.4058
row 10: 21.7339 19.3937 17.1772 15.0802
row 11:-21.4862 -18.7090 -16.1086 -13.6777
row 12: 21.2181 17.9757 14.9778 12.2124
row 13:-20.9301 -17.1971 -13.7931 -10.6992
row 14: 20.6228 16.3766 12.5630 9.1531
row 15:-20.2967 -15.5179 -11.2963 -7.5893
row 16: 19.9525 14.6250 10.0023 6.0232
row 17:-19.5909 -13.7018 -8.6902 -4.4700
row 18: 19.2126 12.7524 7.3691 2.9445
row 19:-18.8183 -11.7810 -6.0485 -1.4611
row 20: 18.4087 10.7920 4.7374 0.0335
row 21:-17.9848 -9.7895 -3.4449 1.3252
row 22: 17.5472 8.7781 2.1797 -2.6030
row 23:-17.0968 -7.7620 -0.9503 3.7890
row 24: 16.6345 6.7456 -0.2352 -4.8734
row 25:-16.1611 -5.7332 1.3692 5.8475
row 26: 15.6776 4.7291 -2.4445 -6.7041
row 27:-15.1848 -3.7374 3.4546 7.4376
row 28: 14.6836 2.7622 -4.3935 -8.0435
row 29:-14.1751 -1.8075 5.2558 8.5190
row 30: 13.6600 0.8770 -6.0369 -8.8629
row 31:-13.1395 0.0255 6.7328 9.0754
row 32: 12.6143 -0.8967 -7.3404 -9.1581
row 33:-12.0855 1.7332 7.8571 9.1141
row 34: 11.5540 -2.5319 -8.2813 -8.9481
row 35:-11.0207 3.2900 8.6122 8.6658
row 36: 10.4866 -4.0049 -8.8495 -8.2743
row 37: -9.9525 4.6742 8.9939 7.7819
row 38: 9.4195 -5.2957 -9.0468 -7.1977
row 39: -8.8884 5.8677 9.0103 6.5318
row 40: 8.3601 -6.3885 -8.8873 -5.7951
>>>Elapsed GPU Time is : 1.223776 ms
Freeing memory on device...
Exiting program...
$
I'm not suggesting the above code is defect-free or suitable for any particular purpose. It is mostly your code. I've made some changes to demonstrate the need for global sync that is inherent in your approach.
I have a few values which are offsets to a multidimensional array , and look like this :
// Constant factor applied to every coordinate in the offsets below.
static const int TILE_SIZE = 32;
// NOTE(review): every coordinate is scaled by the same factor, so different
// coordinates can produce the same offset (e.g. x=1,y=0 and x=0,y=1 both give 32).
int Offset2D = (y * TILE_SIZE) + (x * TILE_SIZE);
int Offset3D = (y * TILE_SIZE) + (x * TILE_SIZE) + (z * TILE_SIZE);
Now what i would like to do is to convert an offset to x,y,z pair , like so :
// Splits a linear offset into (x, y, z) for a cube whose three dimensions all
// have length `size`, laid out with x varying fastest:
//     offset = x + size * (y + size * z)
// offset: non-negative linear index.
// size:   length of every dimension; must be greater than 0.
// x, y, z: receive the recovered coordinates.
void ConvertBack(int offset,int size,int& x,int& y,int& z)
{
    const int rows = offset / size;   // number of complete x-runs before this offset
    x = offset % size;
    y = rows % size;
    z = rows / size;                  // same as offset / (size * size), without forming size * size
}
or
//Get back offsets from any dimension ?
void ConvertBackComplex(unsigned int offset,int size,int* vector,int len)
{
for (int i = 0;i < len;i++)
{
vector[i] = offset ?... ?
}
}
...So far all of my attempts have failed....So i would really welcome any help!...
First of all, I think your indexing system is a bit off. The way you have things arranged, different values of x, y, and z can give the same offset. So, assuming that TILE_SIZE is how many cells of the array store the data for a given point:
// One flat array with TILE_SIZE cells per (x, y, z) point; x is the
// slowest-varying coordinate and z the fastest.
myArray = new arr[xSize*ySize*zSize*TILE_SIZE];
int offset2D = (x*ySize*zSize + y*zSize)*TILE_SIZE;
int offset3D = (x*ySize*zSize + y*zSize + z)*TILE_SIZE;
To get x,y,z back from the offset one simply does the following:
temp = offset/TILE_SIZE;
x = temp/(ySize*zSize);
y = (temp%(ySize*zSize))/zSize;
z = (temp%(ySize*zSize))%zSize;
For multiple dimensions:
// Offset -> coordinates for numDims dimensions (vector[0] is the slowest-varying).
temp = offset/TILE_SIZE;
// Stride of dimension 0: product of the sizes of all later dimensions.
sizeProduct = 1;
for(int k=1; k<numDims; ++k)
{
    sizeProduct*=sizes[k];
}
for(int i=0; i<numDims; ++i)
{
    vector[i]=temp/sizeProduct;
    temp = temp % sizeProduct;
    // Step down to the stride of the next dimension.
    if((i+1)<numDims)
    {
        sizeProduct/=sizes[i+1];
    }
}
To calculate array sizes in multiple dimensions:
int arraySize = TILE_SIZE;
for(int i=0; i<numDims; ++i)
{
arraySize*=sizes[i];
}
To calculate array indices in multiple dimensions (assuming vector is your array of coordinates):
// Coordinates -> offset for numDims dimensions (vector[0] is the slowest-varying).
int index =0;
// Stride of dimension 0: product of the sizes of all later dimensions.
sizeProduct = 1;
for(int k=1; k<numDims; ++k)
{
    sizeProduct*=sizes[k];
}
for(int i=0; i<numDims; ++i)
{
    index+=sizeProduct*vector[i];
    // Step down to the stride of the next dimension.
    if((i+1)<numDims)
    {
        sizeProduct/=sizes[i+1];
    }
}
// Each point occupies TILE_SIZE cells.
index*=TILE_SIZE;
Assuming that all dimensions are TILE_SIZE long, your offset calculations are wrong. Let's say I have an array a which simulated 3d array with all dimensions TILE_SIZE long:
// Flat storage with TILE_SIZE^3 elements, used to simulate a 3D array.
int a[TILE_SIZE * TILE_SIZE * TILE_SIZE];
Then point p with coordinates (x, y, z) would have an offset like this:
// Linear offset of (x, y, z): x varies fastest, z slowest.
int p_offset = x + TILE_SIZE * (y + TILE_SIZE * z);
Reverse calculation is then:
// Undo the layout: z from whole planes, y from whole rows left in the plane, x from the rest.
const int p_plane = TILE_SIZE * TILE_SIZE;
int p_z = p_offset / p_plane;
int p_y = (p_offset % p_plane) / TILE_SIZE;
int p_x = p_offset % TILE_SIZE;
You can choose different order of dimensions (x, y, z) but you have to be consistent.
Assuming the dimensions go from X to Y to Z (as in X represents the lowest dimension):
You can't use a single function to calculate both the 2D and 3D offsets back into coordinates.
For 2D:
// Splits a linear offset into (x, y) for rows of length x_len, with x varying
// fastest: y is the quotient and x the remainder of offset divided by x_len.
void ConvertBack2D(int offset, int x_len, int &x, int &y)
{
    const int row = offset / x_len;
    const int column = offset - row * x_len;   // remainder of the division above
    y = row;
    x = column;
}
For 3D:
// Splits a linear offset into (x, y, z) for an x_len * y_len * ... array laid
// out with x varying fastest and z slowest:
//     offset = x + x_len * (y + y_len * z)
// offset:       non-negative linear index.
// x_len, y_len: lengths of the x and y dimensions; both must be greater than 0.
// x, y, z:      receive the recovered coordinates.
void ConvertBack3D(int offset, int x_len, int y_len, int &x, int &y, int &z)
{
    const int plane = x_len * y_len;        // elements per z-slice
    const int in_plane = offset % plane;    // position inside that slice
    z = offset / plane;
    y = in_plane / x_len;                   // rows have x_len elements, so divide by x_len
    x = in_plane % x_len;
}