How to optimize chgemm (int = char*char) matrix multiplication using avx2 intrinsics? - c

It seems there is few discussion about chgemm(int = char*char) matrix multiplication. Let's assume M%8=0, N%8=0, K%8=0, B is transposed. I recall the CPU which supports AVX2 only has 16 ymm registers. So I tried to implement blocked matrix of 2x8 to maximize using the registers. However, I couldn't find any better solution (e.g, modify algorithm to move load of pb to outer loop). Another issue I am worried about is the latency of sum reduction (permute, sli, add).
I also tried 4x8 and 8x8, it seems 8x8 severely degrades performance.
Could anyone help me to further optimize this code? Thanks!
void _chgemm_mm_u_c_N_T_2x8(
size_t M, size_t N, size_t K, float scaleAB,
unsigned char *A, size_t lda, signed char *B, size_t ldb,
float scaleT, int *C, size_t ldc)
{
int h = M;
int w = N;
int d = K;
int i, j, k;
__m256i tmp_short = _mm256_set1_epi16(1);
for (i = 0; i < h; i += 2) {
__m256i pc0, pc1, pc2, pc3;
for (j = 0; j < w; j += 8 ) {
unsigned char *pa0 = A + i * lda;
unsigned char *pa1 = pa0 + 1*lda;
signed char *pb0 = (signed char*)B + j*ldb;
signed char *pb1 = pb0 + 1*ldb;
signed char *pb2 = pb0 + 2*ldb;
signed char *pb3 = pb0 + 3*ldb;
signed char *pb4 = pb0 + 4*ldb;
signed char *pb5 = pb0 + 5*ldb;
signed char *pb6 = pb0 + 6*ldb;
signed char *pb7 = pb0 + 7*ldb;
int *pc = (int*)C + i * ldc + j;
__m256i ma0, ma1; //ma2, ma3, ma4, ma5, ma6, ma7;
__m256i mb0, mb1, mb2, mb3, mb4, mb5, mb6, mb7;
__m256i mc0, mc1; //mc2, mc3, mc4, mc5, mc6, mc7;
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
__m256i sum4 = _mm256_setzero_si256();
__m256i sum5 = _mm256_setzero_si256();
__m256i sum6 = _mm256_setzero_si256();
__m256i sum7 = _mm256_setzero_si256();
__m256i sum8 = _mm256_setzero_si256();
__m256i sum9 = _mm256_setzero_si256();
__m256i sum10 = _mm256_setzero_si256();
__m256i sum11 = _mm256_setzero_si256();
__m256i sum12 = _mm256_setzero_si256();
__m256i sum13 = _mm256_setzero_si256();
__m256i sum14 = _mm256_setzero_si256();
__m256i sum15 = _mm256_setzero_si256();
for (k = 0; k < d; k += 32) {
//__m128i low0, low1, low2, low3;
//__m128i hi0, hi1, hi2, hi3;
ma0 = _mm256_loadu_si256((__m256i*)pa0);
ma1 = _mm256_loadu_si256((__m256i*)pa1);
mb0 = _mm256_loadu_si256((__m256i*)pb0);
mb1 = _mm256_loadu_si256((__m256i*)pb1);
mb2 = _mm256_loadu_si256((__m256i*)pb2);
mb3 = _mm256_loadu_si256((__m256i*)pb3);
mb4 = _mm256_loadu_si256((__m256i*)pb4);
mb5 = _mm256_loadu_si256((__m256i*)pb5);
mb6 = _mm256_loadu_si256((__m256i*)pb6);
mb7 = _mm256_loadu_si256((__m256i*)pb7);
_mm_prefetch((char *)pa0 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pa1 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb0 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb1 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb2 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb3 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb4 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb5 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb6 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb7 + 32, _MM_HINT_T0);
mc0 = _mm256_maddubs_epi16(ma0, mb0);
sum0 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum0);
mc0 = _mm256_maddubs_epi16(ma0, mb1);
sum1 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum1);
mc0 = _mm256_maddubs_epi16(ma0, mb2);
sum2 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum2);
mc0 = _mm256_maddubs_epi16(ma0, mb3);
sum3 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum3);
mc0 = _mm256_maddubs_epi16(ma0, mb4);
sum4 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum4);
mc0 = _mm256_maddubs_epi16(ma0, mb5);
sum5 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum5);
mc0 = _mm256_maddubs_epi16(ma0, mb6);
sum6 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum6);
mc0 = _mm256_maddubs_epi16(ma0, mb7);
sum7 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum7);
//
mc0 = _mm256_maddubs_epi16(ma1, mb0);
sum8 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum8);
mc0 = _mm256_maddubs_epi16(ma1, mb1);
sum9 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum9);
mc0 = _mm256_maddubs_epi16(ma1, mb2);
sum10 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum10);
mc0 = _mm256_maddubs_epi16(ma1, mb3);
sum11 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum11);
mc0 = _mm256_maddubs_epi16(ma1, mb4);
sum12 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum12);
mc0 = _mm256_maddubs_epi16(ma1, mb5);
sum13 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum13);
mc0 = _mm256_maddubs_epi16(ma1, mb6);
sum14 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum14);
mc0 = _mm256_maddubs_epi16(ma1, mb7);
sum15 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum15);
//
pa0+=32; pa1+=32; //pa2+=32; pa3+=32;
pb0+=32; pb1+=32; pb2+=32; pb3+=32;
pb4+=32; pb5+=32; pb6+=32; pb7+=32;
}
sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81));
sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8));
sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4));
sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81));
sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8));
sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4));
sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, sum2, 0x81));
sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 8));
sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 4));
sum3 = _mm256_add_epi32(sum3, _mm256_permute2x128_si256(sum3, sum3, 0x81));
sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 8));
sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 4));
sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, sum4, 0x81));
sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 8));
sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 4));
sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, sum5, 0x81));
sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 8));
sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 4));
sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, sum6, 0x81));
sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 8));
sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 4));
sum7 = _mm256_add_epi32(sum7, _mm256_permute2x128_si256(sum7, sum7, 0x81));
sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 8));
sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 4));
sum8 = _mm256_add_epi32(sum8, _mm256_permute2x128_si256(sum8, sum8, 0x81));
sum8 = _mm256_add_epi32(sum8, _mm256_srli_si256(sum8, 8));
sum8 = _mm256_add_epi32(sum8, _mm256_srli_si256(sum8, 4));
sum9 = _mm256_add_epi32(sum9, _mm256_permute2x128_si256(sum9, sum9, 0x81));
sum9 = _mm256_add_epi32(sum9, _mm256_srli_si256(sum9, 8));
sum9 = _mm256_add_epi32(sum9, _mm256_srli_si256(sum9, 4));
sum10 = _mm256_add_epi32(sum10, _mm256_permute2x128_si256(sum10, sum10, 0x81));
sum10 = _mm256_add_epi32(sum10, _mm256_srli_si256(sum10, 8));
sum10 = _mm256_add_epi32(sum10, _mm256_srli_si256(sum10, 4));
sum11 = _mm256_add_epi32(sum11, _mm256_permute2x128_si256(sum11, sum11, 0x81));
sum11 = _mm256_add_epi32(sum11, _mm256_srli_si256(sum11, 8));
sum11 = _mm256_add_epi32(sum11, _mm256_srli_si256(sum11, 4));
sum12 = _mm256_add_epi32(sum12, _mm256_permute2x128_si256(sum12, sum12, 0x81));
sum12 = _mm256_add_epi32(sum12, _mm256_srli_si256(sum12, 8));
sum12 = _mm256_add_epi32(sum12, _mm256_srli_si256(sum12, 4));
sum13 = _mm256_add_epi32(sum13, _mm256_permute2x128_si256(sum13, sum13, 0x81));
sum13 = _mm256_add_epi32(sum13, _mm256_srli_si256(sum13, 8));
sum13 = _mm256_add_epi32(sum13, _mm256_srli_si256(sum13, 4));
sum14 = _mm256_add_epi32(sum14, _mm256_permute2x128_si256(sum14, sum14, 0x81));
sum14 = _mm256_add_epi32(sum14, _mm256_srli_si256(sum14, 8));
sum14 = _mm256_add_epi32(sum14, _mm256_srli_si256(sum14, 4));
sum15 = _mm256_add_epi32(sum15, _mm256_permute2x128_si256(sum15, sum15, 0x81));
sum15 = _mm256_add_epi32(sum15, _mm256_srli_si256(sum15, 8));
sum15 = _mm256_add_epi32(sum15, _mm256_srli_si256(sum15, 4));
pc[0] = _mm256_extract_epi32(sum0, 0);
pc[1] = _mm256_extract_epi32(sum1, 0);
pc[2] = _mm256_extract_epi32(sum2, 0);
pc[3] = _mm256_extract_epi32(sum3, 0);
pc[4] = _mm256_extract_epi32(sum4, 0);
pc[5] = _mm256_extract_epi32(sum5, 0);
pc[6] = _mm256_extract_epi32(sum6, 0);
pc[7] = _mm256_extract_epi32(sum7, 0);
pc[ldc+0] = _mm256_extract_epi32(sum8, 0);
pc[ldc+1] = _mm256_extract_epi32(sum9, 0);
pc[ldc+2] = _mm256_extract_epi32(sum10, 0);
pc[ldc+3] = _mm256_extract_epi32(sum11, 0);
pc[ldc+4] = _mm256_extract_epi32(sum12, 0);
pc[ldc+5] = _mm256_extract_epi32(sum13, 0);
pc[ldc+6] = _mm256_extract_epi32(sum14, 0);
pc[ldc+7] = _mm256_extract_epi32(sum15, 0);
}
}
}

Related

So, this function was supposed to use edge detection and apply sobel filter. But well It doesn't

I'm trying to figure out what I did wrong. My intent is in title :)
This is the code I tried, and it clearly isn't working. It just print's 1 solid colour, also edges are not included yet. What part is wrong I can't figure it out what part of this code is wrong. It's just a function and it's in C.
void edges(int height, int width, RGBTRIPLE image[height][width])
{
RGBTRIPLE edit[height][width];
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
edit[i][j] = image[i][j];
}
}
for (int q = 1; q < height - 1; q++)
{
for (int w = 1; w < width - 1; w++)
{
int redEdit[9];
int greenEdit[9];
int blueEdit[9];
for (int top = 0; top < 3; top++)
{
redEdit[top] += edit[q - 1][w - 1 + top].rgbtRed;
greenEdit[top] += edit[q - 1][w - 1 + top].rgbtGreen;
blueEdit[top] += edit[q - 1][w - 1 + top].rgbtBlue;
}
int midctr = 0;
for (int mid = 3; mid < 6; mid++)
{
redEdit[mid] += edit[q][w - 1 + midctr].rgbtRed;
greenEdit[mid] += edit[q][w - 1 + midctr].rgbtGreen;
blueEdit[mid] += edit[q][w - 1 + midctr].rgbtBlue;
midctr ++;
}
int topctr = 0;
for (int top = 6; top < 9; top++)
{
redEdit[top] += edit[q + 1][w - 1 + topctr].rgbtRed;
greenEdit[top] += edit[q + 1][w - 1 + topctr].rgbtGreen;
blueEdit[top] += edit[q + 1][w - 1 + topctr].rgbtBlue;
topctr ++;
}
int matrixgx[] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
int matrixgy[] = {- 1, -2, -1, 0, 0, 0, 1, 2, 1};
int redEditgx[9];
int greenEditgx[9];
int blueEditgx[9];
int redEditgy[9];
int greenEditgy[9];
int blueEditgy[9];
for (int mtrx = 0; mtrx < 9; mtrx++)
{
redEditgx[mtrx] = redEdit[mtrx] * matrixgx[mtrx];
greenEditgx[mtrx] = greenEdit[mtrx] * matrixgx[mtrx];
blueEditgx[mtrx] = blueEdit[mtrx] * matrixgx[mtrx];
redEditgy[mtrx] = redEdit[mtrx] * matrixgy[mtrx];
greenEditgy[mtrx] = greenEdit[mtrx] * matrixgy[mtrx];
blueEditgy[mtrx] = blueEdit[mtrx] * matrixgy[mtrx];
}
// now sum up the changes of gx and gt
int redSumgx = 0;
int greenSumgx = 0;
int blueSumgx = 0;
int redSumgy = 0;
int greenSumgy = 0;
int blueSumgy = 0;
for (int sum = 0; sum < 9; sum++)
{
redSumgx += redEditgx[sum];
greenSumgx += greenEditgx[sum];
blueSumgx += blueEditgx[sum];
redSumgy += redEditgy[sum];
greenSumgy += greenEditgy[sum];
blueSumgy += blueEditgy[sum];
}
int finalRed = round(sqrt(pow(redSumgx, 2) + pow(redSumgy, 2)));
int finalGreen = round(sqrt(pow(greenSumgx, 2) + pow(greenSumgy, 2)));
int finalBlue = round(sqrt(pow(blueSumgx, 2) + pow(blueSumgy, 2)));
if (finalRed > 255)
{
finalRed = 255;
}
if (finalGreen > 255)
{
finalGreen = 255;
}
if (finalBlue > 255)
{
finalBlue = 255;
}
image[q][w].rgbtRed = finalRed;
image[q][w].rgbtGreen = finalGreen;
image[q][w].rgbtBlue = finalBlue;
}
}
return;
}
I know this code is cancer please don't judge
These arrays
int redEdit[9];
int greenEdit[9];
int blueEdit[9];
have not been initialised and hold arbitrary values. So when you execute statements such as
redEdit[top] += edit[q - 1][w - 1 + top].rgbtRed;
the sum is meaningless. You need
int redEdit[9] = { 0 };
int greenEdit[9] = { 0 };
int blueEdit[9] = { 0 };
Only static variables are implicitly initialised to 0. Local (auto) variables are not.

CS50x - Filter More

I doing filter of images bmp in c. The pset required the Sobel Operator. I don't know where I being mistake.
Help me please.
I'm basically making a copy of my image (because the original will be changed.)
Then I take the 3x3 values to put in the formula
So we add and multiply
Finally I take the result and put it in the formula: square root (Gx ^ 2 + Gy ^ 2)
If it exceeds 255 it must be 255, because RGB goes up to 255 which is white
And if there is a broken number, round to the nearest
// Detect edges
void edges(int height, int width, RGBTRIPLE image[height][width])
{
// Variáveis
RGBTRIPLE temp[height][width];
int GR[3][3];
int GG[3][3];
int GB[3][3];
int Gx[3][3] = {{-1, 0, 1}, {-2, 0, 2}, {-1, 0, 1}};
int Gy[3][3] = {{-1, -2, -1}, {0, 0, 0}, {1, 2, 1}};
float resultR, resultG, resultB;
// Cópia temporária do original
for (int tempi = 0; tempi < height; tempi++)
{
for (int tempj = 0; tempj < width; tempj++)
{
temp[tempi][tempj].rgbtRed = image[tempi][tempj].rgbtRed;
temp[tempi][tempj].rgbtGreen = image[tempi][tempj].rgbtGreen;
temp[tempi][tempj].rgbtBlue = image[tempi][tempj].rgbtBlue;
}
}
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
int countx = 0;
// Pegar 3X3
for (int x = i - 1; x < i + 2; x++)
{
int county = 0;
for (int y = j - 1; y < j + 2; y++)
{
if ((x < 0 || y < 0) || (x >= height || y >= width))
{
GR[countx][county] = 0;
GG[countx][county] = 0;
GB[countx][county] = 0;
}
else
{
GR[countx][county] = temp[x][y].rgbtRed;
GG[countx][county] = temp[x][y].rgbtGreen;
GB[countx][county] = temp[x][y].rgbtBlue;
}
county++;
}
countx++;
}
float sumxR = 0, sumyR = 0, sumxG = 0, sumyG = 0, sumxB = 0, sumyB = 0;
for (int ix = 0; ix <= 2; ix++)
{
for (int iy = 0; iy <= 2; iy++)
{
sumxR = sumxR + (GR[ix][iy] * Gx[ix][iy]);
sumxG = sumxG + (GG[ix][iy] * Gx[ix][iy]);
sumxB = sumxB + (GB[ix][iy] * Gx[ix][iy]);
sumyR = sumyR + (GR[ix][iy] * Gy[ix][iy]);
sumyG = sumyG + (GG[ix][iy] * Gy[ix][iy]);
sumyB = sumyB + (GB[ix][iy] * Gy[ix][iy]);
}
}
resultR = sqrt(sumxR * sumxR) + sqrt(sumyR * sumyR);
resultG = sqrt(sumxG * sumxG) + sqrt(sumyG * sumyG);
resultB = sqrt(sumxB * sumxB) + sqrt(sumyB * sumyB);
if (resultR > 255)
{
resultR = 255;
}
if (resultG > 255)
{
resultG = 255;
}
if (resultB > 255)
{
resultB = 255;
}
image[i][j].rgbtRed = round(resultR);
image[i][j].rgbtGreen = round(resultG);
image[i][j].rgbtBlue = round(resultB);
}
}
}
You need to changesqrt(sumxR * sumxR) + sqrt(sumyR * sumyR) to sqrt((sumxR * sumxR) + (sumyR * sumyR)) and they are not the same

cs50 pset4 - edge detection filter not working - sobel operator

The following code i have written has the purpose of detecting the edges in a pixel using the sobel operator. However, it fails all of the tests given by check50 (tool offered by cs50). The output image is also the exact same as the input.
before you continue reading, visit the pset's link
note: I'm supposed to form a 3x3 grid around the pixel I want to filter so that I can iterate over each value in the GX and GY values. I've used ints hh and ww to do this.
// Detect edges
void edges(int height, int width, RGBTRIPLE image[height][width])
{
int sqrtRedd;
int sqrtGreenn;
int sqrtBluee;
//make copy of image
RGBTRIPLE copy[height][width];
for(int h = 0; h < height; h++)
{
for(int w = 0; w < width; w++)
{
copy[h][w] = image[h][w];
}
}
//loop through pixels
for(int h = 0; h < height; h++)
{
for(int w = 0; w < width; w++)
{
int GXred = 0;
int GYred = 0;
int GXgreen = 0;
int GYgreen = 0;
int GXblue = 0;
int GYblue = 0;
for(int hh = -1; hh <= 1; hh++)
{
for(int ww = -1; ww <= 1; ww++)
{
if( h + hh >= 0 && h + hh < height && w + ww >= 0 && w + ww < width)
{
//form 3x3 grid
GXred += ww * copy[2 - hh * hh][2 - ww * ww].rgbtRed;
GYred += hh * copy[2 - hh * hh][2 - ww * ww].rgbtRed;
GXgreen += ww * copy[2 - hh * hh][2 - ww * ww].rgbtGreen;
GYgreen += hh * copy[2 - hh * hh][2 - ww * ww].rgbtGreen;
GXblue += ww * copy[2 - hh * hh][2 - ww * ww].rgbtBlue;
GXblue += hh * copy[2 - hh * hh][2 - ww * ww].rgbtBlue;
}
}
}
int red = round(sqrt(GXred * GXred + GYred * GYred));
int green = round(sqrt(GXgreen * GXgreen + GYgreen * GXgreen));
int blue = round(sqrt(GXblue * GXblue + GYblue * GYblue));
if(red > 225)
{
red = 225;
}
else if(green > 225)
{
green = 225;
}
else if(blue > 225)
{
blue = 225
}
image[h][w].rgbtRed = red;
image[h][w].rgbtGreen = green;
image[h][w].rgbtBlue = blue;
}
}
return;
}
RGBTRIPLE:
typedef struct
{
BYTE rgbtBlue;
BYTE rgbtGreen;
BYTE rgbtRed;
} __attribute__((__packed__))
RGBTRIPLE;
These are the error messages given by check50:
:( edges correctly filters middle pixel
expected "210 150 60\n", not "143 105 30\n"
:( edges correctly filters pixel on edge
expected "213 228 255\n", not "164 144 79\n"
:( edges correctly filters pixel in corner
expected "76 117 255\n", not "58 77 64\n"
:( edges correctly filters 3x3 image
expected "76 117 255\n21...", not "58 77 64\n164 ..."
:( edges correctly filters 4x4 image
expected "76 117 255\n21...", not "58 77 64\n164 ..."
As you can see, the output values are way off and not even near what they should actually be. the problem is: I don't know if these errors are caused by a) my way of trying to find the GX and GY values from the kernels or b)by the way i'm applying the sobel operator.
I've tried finding GX and GY values in other ways (didn't work) such as:
if(hh == -1)
{
GYred += copy[h - 1][w + ww].rgbtRed * -1;
GYgreen += copy[h - 1][w + ww].rgbtGreen * -1;
GYblue += copy[h - 1][w + ww].rgbtBlue * -1;
}
else if( hh == 0)
{
GYred += copy[h][w + ww].rgbtRed * 0;
GYgreen += copy[h][w + ww].rgbtGreen * 0;
GYblue += copy[h][w + ww].rgbtBlue * 0;
}
else if(hh == 1)
{
GYred += copy[h + 1][w + ww].rgbtRed * 1;
GYgreen += copy[h + 1][w + ww].rgbtGreen * 1;
GYblue += copy[h + 1][w + ww].rgbtBlue * 2;
}
else if(hh == 2)
{
GYred += copy[h + 2][w + ww].rgbtRed * 2;
GYgreen += copy[h + 2][w + ww].rgbtGreen * 2;
GYblue += copy[h + 2][w + ww].rgbtBlue * 2;
}
//start setting GX values
if(ww == -2)
{
GXred += copy[h + hh][w - 2].rgbtRed * -2;
GXgreen += copy[h + hh][w - 2].rgbtGreen * -2;
GXblue += copy[h + hh][w - 2].rgbtBlue * -2;
}
else if(ww == -1)
{
GXred += copy[h + hh][w - 1].rgbtRed * -1;
GXgreen += copy[h + hh][w - 1].rgbtGreen * -1;
GXblue += copy[h + hh][w - 1].rgbtBlue * -1;
}
else if(ww == 0)
{
GXred += copy[h + hh][w].rgbtRed * 0;
GXgreen += copy[h + hh][w].rgbtGreen * 0;
GXblue += copy[h + hh][w].rgbtBlue * 0;
}
else if(ww == 1)
{
GXred += copy[h + hh][w + 1].rgbtRed * 1;
GXgreen += copy[h + hh][w + 1].rgbtGreen * 1;
GXblue += copy[h + hh][w + 1].rgbtBlue * 1;
}
I've been stuck on this pset for almost a week now and so at this point I don't know what else to try.
You have quite some parts that do not really make sense or are missing.
You do not apply the sobel factors anywhere. Just taking the offset inside your 3x3 grid does not yield the correct values,
You only limit 1 color channel in case of overflow,
You limit to 225 instead of 255,
You mixed GXblue and GYblue, same for GXgreen and GYgreen.
I have prepared a new version that should do the trick.
Now tested and with test data from initially failed test for 4x4 image.
#include <stdio.h>
#include <math.h>
typedef unsigned char BYTE;
typedef struct
{
BYTE rgbtBlue;
BYTE rgbtGreen;
BYTE rgbtRed;
} __attribute__((__packed__))
RGBTRIPLE;
// Detect edges
void edges(int height, int width, RGBTRIPLE image[height][width])
{
int sqrtRedd;
int sqrtGreenn;
int sqrtBluee;
//make copy of image
RGBTRIPLE copy[height][width];
for(int h = 0; h < height; h++)
{
for(int w = 0; w < width; w++)
{
copy[h][w] = image[h][w];
}
}
//loop through pixels
for(int h = 0; h < height; h++)
{
for(int w = 0; w < width; w++)
{
int GXred = 0;
int GYred = 0;
int GXgreen = 0;
int GYgreen = 0;
int GXblue = 0;
int GYblue = 0;
int index = 0;
int factorsX[] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
int factorsY[] = {-1, -2, -1, 0, 0, 0, 1, 2, 1};
//form 3x3 grid
for(int hh = -1; hh <= 1; hh++)
{
for(int ww = -1; ww <= 1; ww++)
{
int x = w+ww;
int y = h+hh;
if( y >= 0 && y < height && x >= 0 && x < width)
{
GXred += factorsX[index] * copy[y][x].rgbtRed;
GYred += factorsY[index] * copy[y][x].rgbtRed;
GXgreen += factorsX[index] * copy[y][x].rgbtGreen;
GYgreen += factorsY[index] * copy[y][x].rgbtGreen;
GXblue += factorsX[index] * copy[y][x].rgbtBlue;
GYblue += factorsY[index] * copy[y][x].rgbtBlue;
}
index++;
}
}
int red = round(sqrt(GXred * GXred + GYred * GYred));
int green = round(sqrt(GXgreen * GXgreen + GYgreen * GYgreen));
int blue = round(sqrt(GXblue * GXblue + GYblue * GYblue));
if(red > 255)
{
red = 255;
}
if(green > 255)
{
green = 255;
}
if(blue > 255)
{
blue = 255;
}
image[h][w].rgbtRed = red;
image[h][w].rgbtGreen = green;
image[h][w].rgbtBlue = blue;
}
}
return;
}
int main(void)
{
RGBTRIPLE test_4x4[4][4] = {
{{0, 10, 25}, {0, 10, 30}, {40, 60, 80}, {50, 60, 80}},
{{20, 30, 90}, {30, 40, 100}, {80, 70, 90}, {80, 80, 90}},
{{20, 20, 40}, {30, 10, 30}, {50, 40, 10}, {50, 40, 100}},
{{50, 20, 40}, {50, 20, 40}, {50, 40, 80}, {50, 40, 80}},
};
edges(4, 4, test_4x4);
for(int h = 0; h < 4; h++)
{
for(int w = 0; w < 4; w++)
{
printf("%d %d %d\n", test_4x4[h][w].rgbtBlue, test_4x4[h][w].rgbtGreen, test_4x4[h][w].rgbtRed);
}
}
return 0;
}

Transpose SSE2 Vectors

I try to convolve an image for wavelet decomposition using SSE2 and C. This image has 4 channels (Lab + alpha) stored contiguously in memory : [LabA][LabA][LabA]… The alpha channel is irrelevant for what I do here.
Accessing a pixel is then straightforward by loading the content of a pointer incremented by 4 interatively:
static void eaw_decompose_sse2(float *const out,
const float *const in,
float *const detail,
const int scale,
const float sharpen,
const size_t width,
const size_t height)
{
/* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for(size_t j = 0; j < height; j++)
{
for(size_t i = 0; i < width; i++)
{
const size_t inc = (j * width + i) * 4;
float *pdetail = detail + inc;
float *pcoarse = tmp + inc;
// pixel to be convolved
const __m128 pin0 = _mm_load_ps(in + inc);
const __m128 w_0 = _mm_set1_ps(filter[2]);
// neighbours
const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));
// neighbours contribution
const __m128 w_1 = _mm_set1_ps(filter[0]) * weight_sse2(pin0, pin1, sharpen);
const __m128 w_2 = _mm_set1_ps(filter[1]) * weight_sse2(pin0, pin2, sharpen);
const __m128 w_3 = _mm_set1_ps(filter[3]) * weight_sse2(pin0, pin2, sharpen);
const __m128 w_4 = _mm_set1_ps(filter[4]) * weight_sse2(pin0, pin3, sharpen);
// Filter computation
const __m128 wgt = w_1 + w_2 + w_3 + w_4 + w_0;
const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);
// High frequency layer
_mm_stream_ps(pdetail, pin0 - sum);
// Low frequency layer
_mm_stream_ps(pcoarse, sum);
}
}
}
The function ASAN_ROW slides the pointer along the rows ensuring we stay in the bounds, if not, it takes the nearest neighbour. weight_sse2 is a gaussian weight that does complicated bit-shifts because L and a/b have different weightings.
So, instead of operating on 4 Lab SSE vectors, with the last element lost, I feel it would be faster to operate on 3 SSE vecors, each vector being a Lab channel, of which each element is a neighbouring pixel. So that would become:
static void eaw_decompose_sse2(float *const out,
const float *const in,
float *const detail,
const int scale,
const float sharpen,
const size_t width,
const size_t height)
{
/* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for(size_t j = 0; j < height; j++)
{
for(size_t i = 0; i < width; i++)
{
const size_t inc = (j * width + i) * 4;
float *pdetail = detail + inc;
float *pcoarse = tmp + inc;
// pixel to be convolved
const __m128 pin0 = _mm_load_ps(in + inc);
const __m128 w_0 = _mm_set1_ps(filter[2]);
// neighbours
const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));
// Lab extraction - pixel to be convolved
__m128 L_0 = _mm_set1_ps( pin0[0] ); // ?
__m128 a_0 = _mm_set1_ps( pin0[1] ); // ?
__m128 b_0 = _mm_set1_ps( pin0[2] ); // ?
// Lab extraction - neighbours
__m128 L_f = _mm_set_ps ({ pin1[0], pin2[0], pin3[0], pin4[0] }); // ?
__m128 a_f = _mm_set_ps ({ pin1[1], pin2[1], pin3[1], pin4[1] }); // ?
__m128 b_f = _mm_set_ps ({ pin1[2], pin2[2], pin3[2], pin4[2] }); // ?
// neighbours contribution
const __m128 filter = _mm_load_ps(filter_coeff);
const __m128 w_L = filter * weight_sse(L_0, L_f, sharpen);
const __m128 w_c = filter * weight_sse(a_0 + b_0, a_f + b_f, sharpen);
// Filter computation
const __m128 wgt = _mm_set_ps( { sum_of_elts_sse(w_L),
sum_of_elts_sse(w_c),
sum_of_elts_sse(w_c),
0.0f } );
const __m128 w1 = _mm_set_ps ({ w_L[0], w_c[0], w_c[0], 0.0f }); // ?
const __m128 w2 = _mm_set_ps ({ w_L[1], w_c[1], w_c[1], 0.0f }); // ?
const __m128 w3 = _mm_set_ps ({ w_L[2], w_c[2], w_c[2], 0.0f }); // ?
const __m128 w4 = _mm_set_ps ({ w_L[3], w_c[3], w_c[3], 0.0f }); // ?
const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);
// High frequency layer
_mm_stream_ps(pdetail, pin0 - sum);
// Low frequency layer
_mm_stream_ps(pcoarse, sum);
}
}
}
What is the most cache-efficient way to switch from the channel-based vectors (pixels vectors pin0 to pin4) to the neighbour-based vectors (L_0, L_f), and the other way around (w_L and w_c to w_1-w_4) ? Would the second version be faster ?

Create a particle circle

I'm creating a particles engine in C (with the CSFML library) where each particle has a position on {X,Y}, a life, and a movement vector in {X,Y}.
I'm actually creating X particles at the same position on my screen (ex: {100, 100}, and i wan't to make them moving to create a growing circle from the initial position.
What is the mathematical function that can help me to make this ?
#include <SFML/Graphics/Vertex.h>
#include <math.h>
#include "main.h"
int my_rand(int a, int b){
return rand()%(b-a) +a;
}
partBuffer *newPartBuffer(int size)
{
partBuffer *this;
const size_t size_m = (sizeof(partBuffer) + sizeof(sfVertex) * size * 4
+ sizeof(t_info) * size);
void *ptn = malloc(size_m);
if (ptn == NULL)
return (NULL);
memset(ptn, 0, size_m);
this = (partBuffer*)(ptn);
this->size = size;
this->vertex = (sfVertex*)(ptn + sizeof(partBuffer));
this->info = (t_info*)(this->vertex + (size * 4));
return (this);
}
void setPart(partBuffer *this, uint id, sfVector2f pos)
{
if (id >= this->size)
return ;
this->vertex[(id * 4) + 0].position = (sfVector2f){pos.x + 0, pos.y + 0};
this->vertex[(id * 4) + 1].position = (sfVector2f){pos.x + 5, pos.y + 0};
this->vertex[(id * 4) + 2].position = (sfVector2f){pos.x + 5, pos.y + 5};
this->vertex[(id * 4) + 3].position = (sfVector2f){pos.x + 0, pos.y + 5};
this->vertex[(id * 4) + 0].color = (sfColor){255, 255, 255};
this->vertex[(id * 4) + 1].color = (sfColor){255, 255, 255};
this->vertex[(id * 4) + 2].color = (sfColor){255, 255, 255};
this->vertex[(id * 4) + 3].color = (sfColor){255, 255, 255};
//this->info[id].speed = (sfVector2f){my_rand(-3, 3), my_rand(-3, 3)};
this->info[id].speed = (sfVector2f){0, 0};
this->info[id].life = 1.0;
}
static uint newPart(partBuffer *this)
{
for (uint id = this->size - 1; id != 0; id -= 1)
if (this->info[id].life <= 0)
return (id);
return ((uint)(-1));
}
void updatePartBuffer(partBuffer *this)
{
for (uint id = 0; id < this->size; id += 1) {
if (this->info[id].life > 0)
this->info[id].life -= 0.0;
if (this->info[id].life <= 0)
this->info[id].life = 0.0;
this->vertex[(id * 4) + 0].position.x += this->info[id].speed.x;
this->vertex[(id * 4) + 1].position.x += this->info[id].speed.x;
this->vertex[(id * 4) + 2].position.x += this->info[id].speed.x;
this->vertex[(id * 4) + 3].position.x += this->info[id].speed.x;
this->vertex[(id * 4) + 0].position.y += this->info[id].speed.y;
this->vertex[(id * 4) + 1].position.y += this->info[id].speed.y;
this->vertex[(id * 4) + 2].position.y += this->info[id].speed.y;
this->vertex[(id * 4) + 3].position.y += this->info[id].speed.y;
this->vertex[(id * 4) + 0].color.a = (sfUint8)(this->info[id].life * 255.);
this->vertex[(id * 4) + 1].color.a = (sfUint8)(this->info[id].life * 255.);
this->vertex[(id * 4) + 2].color.a = (sfUint8)(this->info[id].life * 255.);
this->vertex[(id * 4) + 3].color.a = (sfUint8)(this->info[id].life * 255.);
}
}
void drawPartBuffer(partBuffer *this, sfRenderWindow *window)
{
sfRenderWindow_drawPrimitives(window, this->vertex, this->size * 4,
sfQuads, NULL);
}
void set_circle(partBuffer *this, float points)
{
sfVector2f start = {0, -1};
sfVector2f adder = {1 / (points), 1 / (points)};
for (uint id = 0; id < points; id += 1) {
this->info[id].speed = (sfVector2f){start.x + adder.x * id, start.y + adder.y * id};
}
}
int game_loop(sfRenderWindow *window)
{
partBuffer *buffer = newPartBuffer(10000);
int points = 11;
sfClock *clock = sfClock_create();
sfVector2f position = {250, 250};
for (uint nb = 0; nb < points; nb += 1)
setPart(buffer, nb, position);
set_circle(buffer, points);
while (sfRenderWindow_isOpen(window)) {
if (sfTime_asMilliseconds(sfClock_getElapsedTime(clock)) >= 10) {
updatePartBuffer(buffer);
sfClock_restart(clock);
}
sfRenderWindow_clear(window, sfBlack);
drawPartBuffer(buffer, window);
sfRenderWindow_display(window);
}
}
int main(void)
{
sfRenderWindow *window = sfRenderWindow_create((sfVideoMode){500, 500, 32},
"Title", sfDefaultStyle, NULL);
sfRenderWindow_setFramerateLimit(window, 120);
game_loop(window);
return (0);
}

Resources