Transpose SSE2 Vectors - c

I am trying to convolve an image for wavelet decomposition using SSE2 and C. The image has 4 channels (Lab + alpha) stored contiguously in memory: [LabA][LabA][LabA]… The alpha channel is irrelevant for what I do here.
Accessing a pixel is then straightforward: load through a pointer incremented by 4 at each iteration:
static void eaw_decompose_sse2(float *const out,
                               const float *const in,
                               float *const detail,
                               const int scale,
                               const float sharpen,
                               const size_t width,
                               const size_t height)
{
  /* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for(size_t j = 0; j < height; j++)
  {
    for(size_t i = 0; i < width; i++)
    {
      const size_t inc = (j * width + i) * 4;
      float *pdetail = detail + inc;
      float *pcoarse = tmp + inc;

      // pixel to be convolved
      const __m128 pin0 = _mm_load_ps(in + inc);
      const __m128 w_0 = _mm_set1_ps(filter[2]);

      // neighbours
      const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
      const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
      const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j,  1, mult, max_height_i, width));
      const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j,  2, mult, max_height_i, width));

      // neighbours contribution
      const __m128 w_1 = _mm_set1_ps(filter[0]) * weight_sse2(pin0, pin1, sharpen);
      const __m128 w_2 = _mm_set1_ps(filter[1]) * weight_sse2(pin0, pin2, sharpen);
      const __m128 w_3 = _mm_set1_ps(filter[3]) * weight_sse2(pin0, pin3, sharpen);
      const __m128 w_4 = _mm_set1_ps(filter[4]) * weight_sse2(pin0, pin4, sharpen);

      // Filter computation
      const __m128 wgt = w_1 + w_2 + w_3 + w_4 + w_0;
      const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);

      // High frequency layer
      _mm_stream_ps(pdetail, pin0 - sum);

      // Low frequency layer
      _mm_stream_ps(pcoarse, sum);
    }
  }
}
The function ASAN_ROW slides the pointer along the rows while ensuring we stay within bounds; when a tap would fall outside the image, it takes the nearest neighbour instead. weight_sse2 is a Gaussian weight that does complicated bit-shifts because L and a/b are weighted differently.
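ASAN_ROW itself is not shown here; purely as a point of reference, a hypothetical clamped row-offset helper in that spirit (name, signature, and clamping are my assumptions, not the real macro) could look like:
// Hypothetical sketch only, not the actual ASAN_ROW from the code base.
// Moves `shift` rows away from (i, j), scaled by the spacing `mult`,
// and clamps to the image so out-of-bounds taps reuse the nearest valid row.
static inline size_t clamped_row_offset(const size_t i, const size_t j,
                                        const int shift, const int mult,
                                        const int max_height_i, const size_t width)
{
  int row = (int)j + shift * mult;
  if(row < 0) row = 0;
  if(row > max_height_i) row = max_height_i;
  return ((size_t)row * width + i) * 4; // 4 floats (LabA) per pixel
}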
So, instead of operating on 4-element Lab SSE vectors whose last element is wasted, I feel it would be faster to operate on 3 SSE vectors, one per Lab channel, where each element is a neighbouring pixel. That would become:
static void eaw_decompose_sse2(float *const out,
                               const float *const in,
                               float *const detail,
                               const int scale,
                               const float sharpen,
                               const size_t width,
                               const size_t height)
{
  /* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for(size_t j = 0; j < height; j++)
  {
    for(size_t i = 0; i < width; i++)
    {
      const size_t inc = (j * width + i) * 4;
      float *pdetail = detail + inc;
      float *pcoarse = tmp + inc;

      // pixel to be convolved
      const __m128 pin0 = _mm_load_ps(in + inc);
      const __m128 w_0 = _mm_set1_ps(filter[2]);

      // neighbours
      const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
      const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
      const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j,  1, mult, max_height_i, width));
      const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j,  2, mult, max_height_i, width));

      // Lab extraction - pixel to be convolved
      __m128 L_0 = _mm_set1_ps(pin0[0]); // ?
      __m128 a_0 = _mm_set1_ps(pin0[1]); // ?
      __m128 b_0 = _mm_set1_ps(pin0[2]); // ?

      // Lab extraction - neighbours
      __m128 L_f = _mm_set_ps(pin1[0], pin2[0], pin3[0], pin4[0]); // ?
      __m128 a_f = _mm_set_ps(pin1[1], pin2[1], pin3[1], pin4[1]); // ?
      __m128 b_f = _mm_set_ps(pin1[2], pin2[2], pin3[2], pin4[2]); // ?

      // neighbours contribution
      const __m128 filter = _mm_load_ps(filter_coeff);
      const __m128 w_L = filter * weight_sse(L_0, L_f, sharpen);
      const __m128 w_c = filter * weight_sse(a_0 + b_0, a_f + b_f, sharpen);

      // Filter computation
      const __m128 wgt = _mm_set_ps(sum_of_elts_sse(w_L),
                                    sum_of_elts_sse(w_c),
                                    sum_of_elts_sse(w_c),
                                    0.0f);
      const __m128 w_1 = _mm_set_ps(w_L[0], w_c[0], w_c[0], 0.0f); // ?
      const __m128 w_2 = _mm_set_ps(w_L[1], w_c[1], w_c[1], 0.0f); // ?
      const __m128 w_3 = _mm_set_ps(w_L[2], w_c[2], w_c[2], 0.0f); // ?
      const __m128 w_4 = _mm_set_ps(w_L[3], w_c[3], w_c[3], 0.0f); // ?
      const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);

      // High frequency layer
      _mm_stream_ps(pdetail, pin0 - sum);

      // Low frequency layer
      _mm_stream_ps(pcoarse, sum);
    }
  }
}
What is the most cache-efficient way to switch from the pixel-based vectors (pin0 to pin4, one pixel per vector) to the channel-based vectors (L_0, L_f, and friends), and the other way around (from w_L and w_c back to w_1 to w_4)? Would the second version be faster?
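For the transpose itself, one candidate is the _MM_TRANSPOSE4_PS macro from xmmintrin.h, which converts four pixel vectors into four channel vectors using only unpack/shuffle instructions, so no scalar _mm_set_ps gathering is needed. The sketch below reuses the register names from the question and is not benchmarked; the fourth lane of each result is the unused alpha/padding:
// Sketch only: AoS -> SoA for the four neighbours with _MM_TRANSPOSE4_PS,
// which expands to unpacklo/unpackhi/movelh/movehl shuffles.
__m128 r0 = pin1, r1 = pin2, r2 = pin3, r3 = pin4;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
const __m128 L_f = r0; // { L1, L2, L3, L4 }
const __m128 a_f = r1; // { a1, a2, a3, a4 }
const __m128 b_f = r2; // { b1, b2, b3, b4 }
                       // r3 holds the four alphas and is simply ignored

// Broadcast each channel of the centre pixel without going through memory.
const __m128 L_0 = _mm_shuffle_ps(pin0, pin0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 a_0 = _mm_shuffle_ps(pin0, pin0, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 b_0 = _mm_shuffle_ps(pin0, pin0, _MM_SHUFFLE(2, 2, 2, 2));

// Going back (w_L, w_c -> w_1..w_4): interleave once, then shuffle per tap.
// Each w_i = { w_L[i], w_c[i], w_c[i], don't care (alpha lane) }.
const __m128 lc_lo = _mm_unpacklo_ps(w_L, w_c); // { L0, c0, L1, c1 }
const __m128 lc_hi = _mm_unpackhi_ps(w_L, w_c); // { L2, c2, L3, c3 }
const __m128 w_1 = _mm_shuffle_ps(lc_lo, lc_lo, _MM_SHUFFLE(1, 1, 1, 0));
const __m128 w_2 = _mm_shuffle_ps(lc_lo, lc_lo, _MM_SHUFFLE(3, 3, 3, 2));
const __m128 w_3 = _mm_shuffle_ps(lc_hi, lc_hi, _MM_SHUFFLE(1, 1, 1, 0));
const __m128 w_4 = _mm_shuffle_ps(lc_hi, lc_hi, _MM_SHUFFLE(3, 3, 3, 2));
Whether the channel-based version ends up faster still depends on how weight_sse and sum_of_elts_sse are implemented; the transpose itself costs only a handful of shuffles per pixel.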

Related

cs50 PSET4 - filter questions and issues in my code

I have been working for some time on PSET4 of CS50, which requires you to create four functions that apply various effects to an input image: grayscale, sepia, reflect, and blur.
When I first finished the code, barely any of it worked when I checked it. I fixed much of it by essentially adding a bunch of intermediate variables, which for some reason resolved many of my problems.
Could someone explain why the original code didn't work and only produced approximations of the expected values?
Here is the original code, followed by the updated version:
void grayscale(int height, int width, RGBTRIPLE image[height][width])
{
    for (int x = 0; x < height; x++) {
        for (int y = 0; y < width; y++) {
            BYTE avg = round((image[x][y].rgbtBlue + image[x][y].rgbtRed + image[x][y].rgbtGreen) / 3);
            image[x][y].rgbtBlue = avg;
            image[x][y].rgbtRed = avg;
            image[x][y].rgbtGreen = avg;
        }
    }
    return;
}
void sepia(int height, int width, RGBTRIPLE image[height][width])
{
    for (int x = 0; x < height; x++) {
        for (int y = 0; y < width; y++) {
            BYTE originalRed = image[x][y].rgbtRed;
            BYTE originalGreen = image[x][y].rgbtGreen;
            BYTE originalBlue = image[x][y].rgbtBlue;
            image[x][y].rgbtRed = .393 * originalRed + .769 * originalGreen + .189 * originalBlue;
            image[x][y].rgbtGreen = .349 * originalRed + .686 * originalGreen + .168 * originalBlue;
            image[x][y].rgbtBlue = .272 * originalRed + .534 * originalGreen + .131 * originalBlue;
            if (image[x][y].rgbtRed > 255) {
                image[x][y].rgbtRed = 255;
            }
            if (image[x][y].rgbtGreen > 255) {
                image[x][y].rgbtGreen = 255;
            }
            if (image[x][y].rgbtBlue > 255) {
                image[x][y].rgbtBlue = 255;
            }
        }
    }
    return;
}
void grayscale(int height, int width, RGBTRIPLE image[height][width])
{
    for (int x = 0; x < height; x++) {
        for (int y = 0; y < width; y++) {
            int red = image[x][y].rgbtRed;
            int green = image[x][y].rgbtGreen;
            int blue = image[x][y].rgbtBlue;
            int avg = round(((float)red + (float)green + (float)blue) / 3);
            image[x][y].rgbtBlue = avg;
            image[x][y].rgbtRed = avg;
            image[x][y].rgbtGreen = avg;
        }
    }
    return;
}
// Convert image to sepia
void sepia(int height, int width, RGBTRIPLE image[height][width])
{
    for (int x = 0; x < height; x++) {
        for (int y = 0; y < width; y++) {
            int originalRed = image[x][y].rgbtRed;
            int originalGreen = image[x][y].rgbtGreen;
            int originalBlue = image[x][y].rgbtBlue;
            int sepiaRed = round(.393 * originalRed + .769 * originalGreen + .189 * originalBlue);
            int sepiaGreen = round(.349 * originalRed + .686 * originalGreen + .168 * originalBlue);
            int sepiaBlue = round(.272 * originalRed + .534 * originalGreen + .131 * originalBlue);
            if (sepiaRed > 255) {
                image[x][y].rgbtRed = 255;
            }
            else {
                image[x][y].rgbtRed = sepiaRed;
            }
            if (sepiaGreen > 255) {
                image[x][y].rgbtGreen = 255;
            }
            else {
                image[x][y].rgbtGreen = sepiaGreen;
            }
            if (sepiaBlue > 255) {
                image[x][y].rgbtBlue = 255;
            }
            else {
                image[x][y].rgbtBlue = sepiaBlue;
            }
        }
    }
    return;
}
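For what it's worth, my reading of the diff (not an official answer): the original grayscale divides three ints by 3 before round() ever sees a fraction, and the original sepia stores each result into a BYTE field (and truncates it) before checking it against 255, so the cap can never trigger. A minimal standalone illustration of both traps:
#include <stdio.h>

int main(void)
{
    // 1) Integer division truncates before round() is even called.
    int red = 200, green = 100, blue = 50;
    printf("%d\n", (red + green + blue) / 3); // prints 116, the .67 is already gone

    // 2) A BYTE (unsigned char) can never hold a value above 255, so storing
    //    the sepia result first and testing `> 255` afterwards tests a value
    //    that has already wrapped around: the cap never fires.
    unsigned char pixel = (unsigned char)(255 + 90); // 345 becomes 89
    printf("%d\n", pixel > 255);                     // always prints 0
    return 0;
}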

CS 50 Edges filter

Could someone tell me where in my code I made a mistake? The values are failing check50. The loops seem to be correct; I really can't tell where the problem is.
I've looked at https://medium.com/swlh/cs50-pset-4-filter-8cbf734b0dbc and the code seems pretty much the same...
Thank you so much!
https://pastebin.com/MkZPFbEK
void edges(int height, int width, RGBTRIPLE image[height][width])
{
    // for pixels at the border, treat value as B.G.R value as 0
    // compute gx and gy for each value of B, G, R
    // square root of gx and gy squared
    RGBTRIPLE tempstore[height][width];
    int Gx[3][3] = {
        {-1, 0, 1},
        {-2, 0, 2},
        {-1, 0, 1}
    };
    int Gy[3][3] = {
        {-2, -1, -1},
        {0, 0, 0},
        {2, -1, 1}
    };
    // for each row
    for (int i = 0; i < height; i = i + 1)
    {
        // for each column
        for (int j = 0; j < width; j = j + 1)
        {
            // float horvalueB=0;
            // float horvalueG=0;
            // float horvalueR=0;
            // float vertvalueB=0;
            // float vertvalueG=0;
            // float vertvalueR=0;
            float sumhorB = 0;
            float sumhorG = 0;
            float sumhorR = 0;
            float sumvertB = 0;
            float sumvertG = 0;
            float sumvertR = 0;
            // check if height of neighbor cell
            for (int k = -1; k <= 1; k = k + 1)
            {
                for (int l = -1; l <= 1; l = l + 1)
                {
                    int htcheck = i + k;
                    int wdcheck = j + l;
                    // check if height of neighbor cell is within bounds
                    if (((htcheck) >= 0) && ((htcheck) < height))
                    {
                        // check if width of neighbor cell is within bounds
                        if (((wdcheck) >= 0) && ((wdcheck) < width))
                        {
                            sumhorB += image[htcheck][wdcheck].rgbtBlue * Gx[k+1][l+1];
                            sumhorG += image[htcheck][wdcheck].rgbtGreen * Gx[k+1][l+1];
                            sumhorR += image[htcheck][wdcheck].rgbtRed * Gx[k+1][l+1];
                            sumvertB += image[htcheck][wdcheck].rgbtBlue * Gy[k+1][l+1];
                            sumvertG += image[htcheck][wdcheck].rgbtGreen * Gy[k+1][l+1];
                            sumvertR += image[htcheck][wdcheck].rgbtRed * Gy[k+1][l+1];
                            // sumhorB = sumhorB + horvalueB;
                            // sumhorG = sumhorG + horvalueG;
                            // sumhorR = sumhorR + horvalueR;
                            // sumvertB = sumvertB + vertvalueB;
                            // sumvertG = sumvertG + vertvalueG;
                            // sumvertR = sumvertR + vertvalueR;
                        }
                    }
                }
            }
            int blue = round(sqrt( sumhorB*sumhorB + sumvertB*sumhorB ));
            int green = round(sqrt( sumhorG*sumhorG + sumvertG*sumhorG ));
            int red = round(sqrt( sumhorR*sumhorR + sumvertR*sumhorR ));
            // Cap at 255
            if (red > 255)
            {
                red = 255;
            }
            if (green > 255)
            {
                green = 255;
            }
            if (blue > 255)
            {
                blue = 255;
            }
            // Assign new values to pixels
            tempstore[i][j].rgbtRed = red;
            tempstore[i][j].rgbtGreen = green;
            tempstore[i][j].rgbtBlue = blue;
        }
    }
    for (int i = 0; i < height; i = i + 1)
    {
        for (int j = 0; j < width; j = j + 1)
        {
            image[i][j].rgbtBlue = tempstore[i][j].rgbtBlue;
            image[i][j].rgbtGreen = tempstore[i][j].rgbtGreen;
            image[i][j].rgbtRed = tempstore[i][j].rgbtRed;
        }
    }
    return;
}
Ahh... The problem is that I was multiplying the wrong things. Sorry about that; I was multiplying sumvert by sumhor:
int blue = round(sqrt( (sumhorB*sumhorB) + (sumvertB*sumhorB) ));
int green = round(sqrt( (sumhorG*sumhorG) + (sumvertG*sumhorG) ));
int red = round(sqrt( (sumhorR*sumhorR) + (sumvertR*sumhorR) ));
Should be
int blue = round(sqrt( (sumhorB*sumhorB) + (sumvertB*sumvertB) ));
int green = round(sqrt( (sumhorG*sumhorG) + (sumvertG*sumvertG) ));
int red = round(sqrt( (sumhorR*sumhorR) + (sumvertR*sumvertR) ));

GLSL ES does not behave the same way with for loop and unrolled loop

It seems GLSL ES 3.0 does not execute my code properly.
I wrote the same code twice, first unrolled and then with a for loop:
// Unrolled version:
float minDistance = 1000000.0;
vec3 finalColor1 = vec3(0.0);
int index1 = 0;
float distance = colorDifferenceCIE94FromRGB(pixel, colors[0]);
if(distance < minDistance) {
    finalColor1 = colors[0];
    minDistance = distance;
    index1 = 0;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[1]);
if(distance < minDistance) {
    finalColor1 = colors[1];
    minDistance = distance;
    index1 = 1;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[2]);
if(distance < minDistance) {
    finalColor1 = colors[2];
    minDistance = distance;
    index1 = 2;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[3]);
if(distance < minDistance) {
    finalColor1 = colors[3];
    minDistance = distance;
    index1 = 3;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[4]);
if(distance < minDistance) {
    finalColor1 = colors[4];
    minDistance = distance;
    index1 = 4;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[5]);
if(distance < minDistance) {
    finalColor1 = colors[5];
    minDistance = distance;
    index1 = 5;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[6]);
if(distance < minDistance) {
    finalColor1 = colors[6];
    minDistance = distance;
    index1 = 6;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[7]);
if(distance < minDistance) {
    finalColor1 = colors[7];
    minDistance = distance;
    index1 = 7;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[8]);
if(distance < minDistance) {
    finalColor1 = colors[8];
    minDistance = distance;
    index1 = 8;
}
// For Loop version:
int index2 = 0;
vec3 finalColor2 = pixel;
minDistance = 100000.0;
for(int i=0 ; i<9 ; i++) {
    distance = colorDifferenceCIE94FromRGB(pixel, colors[i]);
    if(distance < minDistance) {
        finalColor2 = colors[i];
        minDistance = distance;
        index2 = i;
    }
}
gl_FragColor = vec4((uv.x < 0.5 ? finalColor1 : finalColor2), 1.0);
The left half of the screen should be exactly the same as the right half, but it is not (at least on my MacBook Pro Unibody 2012, OS X 10.14.4). Why?
I made a snippet available at the bottom of the question (and a demo project) to show the bug.
Code explanation
The code uses Paper.js to draw a red circle on a canvas, then gives this canvas to a Three.js CanvasTexture, which is applied to a fullscreen quad (a plane mesh).
The fragment shader computes the distance between each pixel color of the given texture and a set of colors, then renders the closest color. This operation is performed twice, once unrolled/sequentially and once with a for loop. The result of the first version is displayed on the left of the screen, the second on the right.
The result should be exactly the same but strangely it is not. Why?
You can see that by uncommenting line 157:
colors[8] = vec3(0.8, 0.4, 0.1);
(or lines 148 to 152): with that change, both versions execute the same.
Here is what I get on my computer:
The white square on the left is the paper.js canvas, the black square on the right is the three.js scene; the second circle should be entirely red as well.
Code snippet
<!DOCTYPE html>
<html>
<head>
<title>Dynamic array glsl test</title>
<meta charset="UTF-8" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/103/three.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/paper.js/0.12.0/paper-full.min.js"></script>
<style>
#three,
#paper {
width: 100px;
height: 100px;
}
#three {
/*display: none;*/
}
#paper {
/*display: none;*/
}
</style>
</head>
<body>
<canvas id="paper"></canvas>
<canvas id="three"></canvas>
<script type="application/glsl" id="fragmentShader2">
varying vec2 vUV;
uniform sampler2D textureSampler;
uniform vec2 screenResolution;
uniform vec2 textureResolution;
void main(void) {
vec2 uv = gl_FragCoord.xy / screenResolution.xy - 0.5; // [-0.5, 0.5]
float screenRatio = screenResolution.x / screenResolution.y;
float textureRatio = textureResolution.x / textureResolution.y;
vec2 textureUV = textureRatio > screenRatio ? vec2(uv.x, uv.y * textureRatio / screenRatio) : vec2(uv.x / textureRatio * screenRatio, uv.y);
gl_FragColor = texture2D(textureSampler, textureUV + 0.5);
// gl_FragColor = vec4(1.0, 0.0, 0.0, 1.0);
}
</script>
<script type="application/glsl" id="fragmentShader">
precision highp float;
varying vec2 vUV;
uniform sampler2D textureSampler;
uniform vec2 screenResolution;
uniform vec2 textureResolution;
uniform float hueRotation;
vec3 colors[9];
#define PI 3.1415926535897932384626433832795
vec3 hsv2rgb(vec3 c) {
vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0);
vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www);
return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y);
}
vec4 hsv2rgb(vec4 c) {
return vec4(hsv2rgb(c.xyz), c.w);
}
vec3 rgb2hsv(vec3 c) {
vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0);
vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g));
vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r));
float d = q.x - min(q.w, q.y);
float e = 1.0e-10;
return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x);
}
vec4 rgb2hsv(vec4 c) {
return vec4(rgb2hsv(c.xyz), c.w);
}
vec4 rotateHue(vec4 c, float angle) {
vec4 chsv = rgb2hsv(c);
chsv.x = mod(chsv.x + angle, 1.0);
return hsv2rgb(chsv);
}
vec3 mixColors(vec3 c1, vec3 c2) {
return sqrt(0.5 * c1 * c1 + 0.5 * c2 * c2);
}
vec3 mixColors(vec3 c1, vec3 c2, vec3 c3) {
return sqrt( (c1 * c1 / 3.0) + (c2 * c2 / 3.0) + (c3 * c3 / 3.0));
}
vec3 rgb2xyz(vec3 rgb) {
rgb.r = rgb.r > 0.04045 ? pow( ( rgb.r + 0.055 ) / 1.055, 2.4) : rgb.r / 12.92;
rgb.g = rgb.g > 0.04045 ? pow( ( rgb.g + 0.055 ) / 1.055, 2.4) : rgb.g / 12.92;
rgb.b = rgb.b > 0.04045 ? pow( ( rgb.b + 0.055 ) / 1.055, 2.4) : rgb.b / 12.92;
rgb *= 100.0;
return vec3(rgb.r * 0.4124 + rgb.g * 0.3576 + rgb.b * 0.1805,
rgb.r * 0.2126 + rgb.g * 0.7152 + rgb.b * 0.0722,
rgb.r * 0.0193 + rgb.g * 0.1192 + rgb.b * 0.9505);
}
vec3 xyz2lab(vec3 xyz) {
xyz = xyz / vec3(94.811, 100.000, 107.304);
xyz = vec3( xyz.r > 0.008856 ? pow( xyz.r, 1.0/3.0) : (7.787 * xyz.r) + (16.0 / 116.0),
xyz.g > 0.008856 ? pow( xyz.g, 1.0/3.0) : (7.787 * xyz.g) + (16.0 / 116.0),
xyz.b > 0.008856 ? pow( xyz.b, 1.0/3.0) : (7.787 * xyz.b) + (16.0 / 116.0));
return vec3( (116.0 * xyz.y) - 16.0, 500.0 * (xyz.x - xyz.y), 200.0 * (xyz.y - xyz.z) );
}
vec3 rgb2lab(in vec3 rgb) {
vec3 xyz = rgb2xyz(rgb);
vec3 lab = xyz2lab(xyz);
return(lab);
}
float colorDifferenceCIE94FromLab(vec3 cieLab1, vec3 cieLab2) {
// Just to make it more readable
float cL1 = cieLab1.r;
float ca1 = cieLab1.g;
float cb1 = cieLab1.b;
float cL2 = cieLab2.r;
float ca2 = cieLab2.g;
float cb2 = cieLab2.b;
float c1 = sqrt(ca1 * ca1 + cb1 * cb1);
float c2 = sqrt(ca2 * ca2 + cb2 * cb2);
float dL = cL2 - cL1;
float dC = c2 - c1;
float dE = sqrt( (cL1 - cL2) * (cL1 - cL2) + (ca1 - ca2) * (ca1 - ca2) + (cb1 - cb2) * (cb1 - cb2) );
float dH = (dE * dE) - (dL * dL) - (dC * dC);
dH = dH > 0.0 ? sqrt(dH) : 0.0;
float kL = 1.0;
float kC = 1.0;
float kH = 1.0;
float k1 = 0.045;
float k2 = 0.015;
float sL = 1.0;
float sC = 1.0 + ( k1 * c1 ); // sX
float sH = 1.0 + ( k2 * c1 ); // sH
float dLw = dL / (kL * sL);
float dCw = dC / (kC * sC);
float dHw = dH / (kH * sH);
float deltaE94 = sqrt(dLw * dLw + dCw * dCw + dHw * dHw);
return deltaE94;
}
float colorDifferenceCIE94FromRGB(vec3 rgb1, vec3 rgb2) {
vec3 lab1 = rgb2lab(rgb1);
vec3 lab2 = rgb2lab(rgb2);
return colorDifferenceCIE94FromLab(lab1, lab2);
}
// float colorDifferenceCIE94FromRGB(vec3 rgb1, vec3 rgb2) {
// return abs(rgb2.g - rgb1.g);
// }
void main()
{
vec2 uv = gl_FragCoord.xy / screenResolution.xy;
vec3 pixel = texture2D(textureSampler, uv).xyz;
vec3 white = vec3(1.0);
vec3 black = vec3(0.0);
vec3 c1 = rotateHue(vec4(1.0, 0.0, 0.0, 1.0), hueRotation).xyz;
vec3 c2 = rotateHue(vec4(0.0, 1.0, 0.0, 1.0), hueRotation).xyz;
vec3 c3 = rotateHue(vec4(0.0, 0.0, 1.0, 1.0), hueRotation).xyz;
/*
vec3 c1 = vec3(1.0, 0.0, 0.0);
vec3 c2 = vec3(0.0, 1.0, 0.0);
vec3 c3 = vec3(0.0, 0.0, 1.0);
*/
vec3 c12 = mixColors(c1, c2);
vec3 c13 = mixColors(c1, c3);
vec3 c23 = mixColors(c2, c3);
vec3 c123 = mixColors(c1, c2, c3);
colors[0] = white;
colors[1] = black;
colors[2] = c1;
colors[3] = c2;
colors[4] = c3;
colors[5] = c12;
colors[6] = c13;
colors[7] = c23;
colors[8] = c123;
// colors[8] = vec3(0.8, 0.4, 0.1);
/*
colors[0] = white;
colors[1] = black;
colors[2] = vec3(1.0, 0.0, 0.8);
colors[3] = vec3(0.0, 0.7, 0.4);
colors[4] = vec3(0.0, 0.8, 0.9);
colors[5] = vec3(0.8, 0.4, 0.1);
colors[6] = vec3(0.4, 0.9, 0.0);
colors[7] = vec3(0.1, 0.2, 7.0);
colors[8] = vec3(0.9, 0.1, 0.0);
*/
float minDistance = 1000000.0;
vec3 finalColor1 = vec3(0.0);
int index1 = 0;
float distance = colorDifferenceCIE94FromRGB(pixel, colors[0]);
if(distance < minDistance) {
finalColor1 = colors[0];
minDistance = distance;
index1 = 0;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[1]);
if(distance < minDistance) {
finalColor1 = colors[1];
minDistance = distance;
index1 = 1;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[2]);
if(distance < minDistance) {
finalColor1 = colors[2];
minDistance = distance;
index1 = 2;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[3]);
if(distance < minDistance) {
finalColor1 = colors[3];
minDistance = distance;
index1 = 3;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[4]);
if(distance < minDistance) {
finalColor1 = colors[4];
minDistance = distance;
index1 = 4;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[5]);
if(distance < minDistance) {
finalColor1 = colors[5];
minDistance = distance;
index1 = 5;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[6]);
if(distance < minDistance) {
finalColor1 = colors[6];
minDistance = distance;
index1 = 6;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[7]);
if(distance < minDistance) {
finalColor1 = colors[7];
minDistance = distance;
index1 = 7;
}
distance = colorDifferenceCIE94FromRGB(pixel, colors[8]);
if(distance < minDistance) {
finalColor1 = colors[8];
minDistance = distance;
index1 = 8;
}
int index2 = 0;
vec3 finalColor2 = pixel;
minDistance = 100000.0;
for(int i=0 ; i<9 ; i++) {
distance = colorDifferenceCIE94FromRGB(pixel, colors[i]);
if(distance < minDistance) {
finalColor2 = colors[i];
minDistance = distance;
index2 = i;
}
}
vec3 green = vec3(0.0, 1.0, 0.0);
vec3 red = vec3(1.0, 0.0, 0.0);
/*
// Display colors:
if(uv.x < 0.1) {
float y = uv.y;
finalColor1 = colors[int(floor(y * 8.9))];
}
*/
// gl_FragColor = vec4(index1 == index2 ? green : red,1.0);
gl_FragColor = vec4((uv.x < 0.5 ? finalColor1 : finalColor2), 1.0);
}
</script>
<script type="application/glsl" id="vertexShader">
varying vec2 vUv;
void main() {
vUv = uv;
gl_Position = projectionMatrix * modelViewMatrix * vec4(position, 1.0 );
}
</script>
<!-- <script src="src/index.js"></script> -->
<script type="application/javascript">
//import * as vertexShader from "./vertex-shader";
//import * as fragmentShader from "./fragment-shader";
let fragmentShader = document.getElementById("fragmentShader").textContent;
let vertexShader = document.getElementById("vertexShader").textContent;
let screenWidth = window.innerWidth;
let screenHeight = window.innerHeight;
// let screenWidth = document.body.clientWidth;
// let screenHeight = document.body.clientHeight;
let uniforms = {};
let scene = null;
let parameters = {
hueRotation: 0
};
let renderer, camera, texture, raster;
var paperCanvas = document.getElementById("paper");
paper.setup(paperCanvas);
function initialize() {
let canvas = $("#three").get(0);
let context = canvas.getContext("webgl2");
renderer = new THREE.WebGLRenderer({
context: context,
canvas: canvas,
antialias: true,
preserveDrawingBuffer: true
});
renderer.setSize(screenWidth, screenHeight);
scene = new THREE.Scene();
camera = new THREE.OrthographicCamera(
screenWidth / -2,
screenWidth / 2,
screenHeight / 2,
screenHeight / -2,
1,
1000
);
texture = new THREE.CanvasTexture(paper.view.element, THREE.UVMapping);
texture.needsUpdate = true;
window.texture = texture;
uniforms = {
screenResolution: {
type: "v2",
value: new THREE.Vector2(screenWidth, screenHeight)
},
textureSampler: { value: texture },
textureResolution: new THREE.Uniform(
new THREE.Vector2(
canvas ? canvas.height / 2 : 0,
canvas ? canvas.width / 2 : 0
)
),
hueRotation: { type: "f", value: parameters.hueRotation }
};
let material = new THREE.ShaderMaterial({
uniforms: uniforms,
// extensions: { derivatives: true },
vertexShader: vertexShader.trim(),
fragmentShader: fragmentShader.trim(),
side: THREE.DoubleSide
});
let mesh = new THREE.Mesh(
new THREE.PlaneGeometry(screenWidth, screenHeight),
material
);
// mesh = new THREE.Mesh(geometry, material);
mesh.position.z = -1;
window.mesh = mesh;
scene.add(mesh);
//raster = new paper.Raster("./Velo.jpg");
//raster.onLoad = rasterLoaded;
let circle = new paper.Path.Circle(paper.view.bounds.center, 25);
circle.fillColor = "red";
setTimeout(() => { texture.needsUpdate = true }, 100);
}
$(document).ready(()=> {
let paperCanvas = $("#paper").get(0);
screenWidth = paperCanvas.clientWidth;
screenHeight = paperCanvas.clientHeight;
initialize();
})
//document.addEventListener("DOMContentLoaded", initialize);
function rasterLoaded() {
console.log("raster loaded");
// raster.fitBounds(paper.view.bounds);
console.log("texture:", texture);
if (texture) {
console.log("texture.needsUpdate");
texture.needsUpdate = true;
}
setTimeout(() => updateUniforms(parameters), 100);
}
function updateUniforms(parameters) {
if (uniforms == null) {
return;
}
uniforms.hueRotation.value = parameters.hueRotation;
texture.needsUpdate = true;
}
function animate() {
requestAnimationFrame(animate);
if (renderer) {
renderer.render(scene, camera);
}
}
animate();
</script>
</body>
</html>

OpenGL ortho projection is broken

So I just added ortho projection to my rendering and everything stopped rendering... If I remove it, it works again. This is my matrix code:
#include <stdlib.h>
#include <stdlib.h>
#include <math.h>
matrix4x4 init_matrix4x4() {
matrix4x4 m = calloc(16, sizeof(float));
m[0] = 1; m[1] = 0; m[2] = 0; m[3] = 0;
m[4] = 0; m[5] = 1; m[6] = 0; m[7] = 0;
m[8] = 0; m[9] = 0; m[10] = 1; m[11] = 0;
m[12] = 0; m[13] = 0; m[14] = 0; m[15] = 1;
return m;
}
void translate_matrix4x4(matrix4x4* matrix, float x, float y, float z) {
matrix4x4 m = (*matrix);
m[12] = m[0] * x + m[4] * y + m[8] * z + m[12];
m[13] = m[1] * x + m[5] * y + m[9] * z + m[13];
m[14] = m[2] * x + m[6] * y + m[10] * z + m[14];
m[15] = m[3] * x + m[7] * y + m[11] * z + m[15];
}
void ortho_matrix4x4(matrix4x4* matrix, float left, float right, float bottom, float top, float near, float far) {
matrix4x4 m = (*matrix);
m[0] = 2 / (right-left);
m[1] = 0;
m[2] = 0;
m[3] = 0;
m[4] = 0;
m[5] = 2 / (top - bottom);
m[6] = 0;
m[7] = 0;
m[8] = 0;
m[9] = 0;
m[10] = 1 / (far - near);
m[11] = 0;
m[12] = (left + right) / (left - right);
m[13] = (top + bottom) / (bottom - top);
m[14] = near / (near - far);
m[15] = 1;
}
void mat4_identity(matrix4x4* matrix) {
matrix4x4 out = (*matrix);
out[0] = 1;
out[1] = 0;
out[2] = 0;
out[3] = 0;
out[4] = 0;
out[5] = 1;
out[6] = 0;
out[7] = 0;
out[8] = 0;
out[9] = 0;
out[10] = 1;
out[11] = 0;
out[12] = 0;
out[13] = 0;
out[14] = 0;
out[15] = 1;
}
void mat4_lookAtf(matrix4x4* matrix, float eye[3], float center[3], float up[3]) {
matrix4x4 out = (*matrix);
float x0, x1, x2, y0, y1, y2, z0, z1, z2, len,
eyex = eye[0],
eyey = eye[1],
eyez = eye[2],
upx = up[0],
upy = up[1],
upz = up[2],
centerx = center[0],
centery = center[1],
centerz = center[2];
if (fabs(eyex - centerx) < 0.000001 &&
fabs(eyey - centery) < 0.000001 &&
fabs(eyez - centerz) < 0.000001) {
mat4_identity(&out);
return;
}
z0 = eyex - centerx;
z1 = eyey - centery;
z2 = eyez - centerz;
len = 1 / sqrt/*f*/(z0 * z0 + z1 * z1 + z2 * z2);
z0 *= len;
z1 *= len;
z2 *= len;
x0 = upy * z2 - upz * z1;
x1 = upz * z0 - upx * z2;
x2 = upx * z1 - upy * z0;
len = sqrt(x0 * x0 + x1 * x1 + x2 * x2);
if (!len) {
x0 = 0;
x1 = 0;
x2 = 0;
} else {
len = 1 / len;
x0 *= len;
x1 *= len;
x2 *= len;
}
y0 = z1 * x2 - z2 * x1;
y1 = z2 * x0 - z0 * x2;
y2 = z0 * x1 - z1 * x0;
len = sqrt(y0 * y0 + y1 * y1 + y2 * y2);
if (!len) {
y0 = 0;
y1 = 0;
y2 = 0;
} else {
len = 1 / len;
y0 *= len;
y1 *= len;
y2 *= len;
}
out[0] = x0;
out[1] = y0;
out[2] = z0;
out[3] = 0;
out[4] = x1;
out[5] = y1;
out[6] = z1;
out[7] = 0;
out[8] = x2;
out[9] = y2;
out[10] = z2;
out[11] = 0;
out[12] = -(x0 * eyex + x1 * eyey + x2 * eyez);
out[13] = -(y0 * eyex + y1 * eyey + y2 * eyez);
out[14] = -(z0 * eyex + z1 * eyey + z2 * eyez);
out[15] = 1;
};
And here is main.c, where I render things:
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <stdio.h>
#include <stdlib.h>
#include "include/matrix.h"
#include "include/io.h"
const int WIDTH = 640;
const int HEIGHT = 480;
// called when user resizes window
void framebuffer_size_callback(GLFWwindow* window, int width, int height) {
glViewport(0, 0, width, height);
}
// called when we receive input
void processInput(GLFWwindow *window) {
if(glfwGetKey(window, GLFW_KEY_ESCAPE) == GLFW_PRESS)
glfwSetWindowShouldClose(window, 1);
}
GLuint get_checker_texture() {
unsigned char texDat[64];
for (int i = 0; i < 64; ++i)
texDat[i] = ((i + (i / 8)) % 2) * 128 + 127;
//upload to GPU texture
GLuint tex;
glGenTextures(1, &tex);
glBindTexture(GL_TEXTURE_2D, tex);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RED, 8, 8, 0, GL_RED, GL_UNSIGNED_BYTE, texDat);
glGenerateMipmap(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, 0);
return tex;
}
//void render_box(renderable* this, unsigned int vbo, unsigned int vao, unsigned int ebo) {
// draw_texture(this->texture, this->x, this->y, this->z, vbo, vao, ebo);
//}
int main(int argc, char* argv[]) {
glfwInit();
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
#ifdef __APPLE__
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // only on MACOS
#endif
// creating the window
GLFWwindow* window = glfwCreateWindow(WIDTH, HEIGHT, "OpenGL App", NULL, NULL);
if (window == NULL) {
printf("Failed to create GLFW window");
glfwTerminate();
return -1;
}
glfwMakeContextCurrent(window);
// hook on window resize
glfwSetFramebufferSizeCallback(window, framebuffer_size_callback);
if (!gladLoadGLLoader((GLADloadproc)glfwGetProcAddress)) {
printf("Failed to initialize GLAD");
return -1;
}
printf("OpenGL %d.%d\n", GLVersion.major, GLVersion.minor);
glEnable(GL_DEPTH_TEST);
glViewport(0, 0, WIDTH, HEIGHT);
unsigned int tex = get_checker_texture();
const char* vertex_shader_src = read_file("res/shaders/textured_and_positioned.vs.glsl");
unsigned int vertex_shader;
vertex_shader = glCreateShader(GL_VERTEX_SHADER);
glShaderSource(vertex_shader, 1, &vertex_shader_src, NULL);
glCompileShader(vertex_shader);
int success;
char infoLog[512];
glGetShaderiv(vertex_shader, GL_COMPILE_STATUS, &success);
if (!success) {
glGetShaderInfoLog(vertex_shader, 512, NULL, infoLog);
printf("%s\n", infoLog);
}
const char* fragment_shader_src = read_file("res/shaders/textured_and_positioned.fs.glsl");
unsigned int fragment_shader;
fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(fragment_shader, 1, &fragment_shader_src, NULL);
glCompileShader(fragment_shader);
int success0;
char infoLog0[512];
glGetShaderiv(fragment_shader, GL_COMPILE_STATUS, &success0);
if (!success0) {
glGetShaderInfoLog(fragment_shader, 512, NULL, infoLog0);
printf("%s\n", infoLog0);
}
unsigned int shaderProgram;
shaderProgram = glCreateProgram();
glAttachShader(shaderProgram, vertex_shader);
glAttachShader(shaderProgram, fragment_shader);
glLinkProgram(shaderProgram);
unsigned uniform_sampler_ourTexture = glGetUniformLocation(shaderProgram, "ourTexture");
unsigned uniform_mat4_model = glGetUniformLocation(shaderProgram, "model");
unsigned uniform_mat4_view = glGetUniformLocation(shaderProgram, "view");
unsigned uniform_mat4_perspective = glGetUniformLocation(shaderProgram, "perspective");
int success1;
char infoLog1[512];
glGetProgramiv(shaderProgram, GL_LINK_STATUS, &success1);
if(!success1) {
glGetProgramInfoLog(shaderProgram, 512, NULL, infoLog1);
printf("%s\n", infoLog1);
}
float vertices[] = {
// positions // colors // texture coords
0.1f, 0.1f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, // top right
0.1f, -0.1f, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, // bottom right
-0.1f, -0.1f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, // bottom left
-0.1f, 0.1f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0 // top left
};
unsigned elements[] = {
0, 1, 2, // triangle
2, 3, 0 // triangle
};
unsigned int vao;
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
matrix4x4 model = init_matrix4x4();
matrix4x4 view = init_matrix4x4();
translate_matrix4x4(&view, 0.0f, 0.0f, 0.0f);
float x = 0.0f;
float y = 0.0f;
float z = 0.0f;
matrix4x4 perspective = calloc(16, sizeof(float));
ortho_matrix4x4(&perspective, 0.0f, 640.0f, 0.0f, 480.0f, 0.1f, 100.0f);
unsigned int vbo;
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
// positions
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 8 * sizeof(float), (void*)(0 * sizeof(float)));
glEnableVertexAttribArray(0);
// colors
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 8 * sizeof(float), (void*)(3 * sizeof(float)));
glEnableVertexAttribArray(1);
// texture coords
glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, 8 * sizeof(float), (void*)(6 * sizeof(float)));
glEnableVertexAttribArray(2);
unsigned int ebo;
glGenBuffers(1, &ebo);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(elements), elements, GL_STATIC_DRAW);
glUseProgram(shaderProgram);
glUniformMatrix4fv(uniform_mat4_view, 1, GL_FALSE, view);
glUniformMatrix4fv(uniform_mat4_perspective, 1, GL_FALSE, perspective);
// render loop
while(!glfwWindowShouldClose(window)) {
processInput(window);
// render here
glClearColor(
0, 0, 0, 0
);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(uniform_sampler_ourTexture, 0);
translate_matrix4x4(&model, x, y, z);
glUniformMatrix4fv(uniform_mat4_model, 1, GL_FALSE, model);
//x += 0.0001f;
//y += 0.0001f;
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
glfwSwapBuffers(window);
glfwPollEvents();
}
glfwTerminate();
return 0;
}
Here is the vertex shader:
#version 330 core
layout (location = 0) in vec3 aPos;
layout (location = 1) in vec3 aColor;
layout (location = 2) in vec2 aTexCoord;
uniform mat4 model;
uniform mat4 view;
uniform mat4 perspective;
out vec3 ourColor;
out vec2 TexCoord;
void main()
{
gl_Position = perspective * view * model * vec4(aPos, 1.0);
ourColor = aColor;
TexCoord = aTexCoord;
}
Here is the fragment shader:
#version 330 core
out vec4 FragColor;
in vec3 ourColor;
in vec2 TexCoord;
uniform sampler2D ourTexture;
void main()
{
FragColor = vec4(vec3(texture(ourTexture, TexCoord).r), 1.);
}
Now, if I remove the perspective matrix (the ortho matrix) from the shader, the checkered texture renders as it should.
What is wrong here?
Is it my shader or is it the matrix ortho function?
Your matrices are stored in row-major order. If you submit them to the uniforms without transposing, you have to do the calculations left-associatively (vector on the left) in the shader.
You can either
store the matrices in column-major order,
or
transpose them when loading them into the uniform,
or
switch to left-associative multiplication in the shader.
Each has the same effect.
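For illustration, a sketch of the second option against the main.c above (not compiled here): glUniformMatrix4fv can transpose on upload when its third argument is GL_TRUE, so the row-major matrices stay as they are and the shader keeps its right-to-left multiplication order.
// Option 2: let OpenGL transpose the row-major data while uploading it.
glUniformMatrix4fv(uniform_mat4_view, 1, GL_TRUE, view);
glUniformMatrix4fv(uniform_mat4_perspective, 1, GL_TRUE, perspective);
// ... and inside the render loop:
glUniformMatrix4fv(uniform_mat4_model, 1, GL_TRUE, model);
The third option would instead leave the uploads at GL_FALSE and change the vertex shader to gl_Position = vec4(aPos, 1.0) * model * view * perspective;.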

How to optimize chgemm (int = char*char) matrix multiplication using avx2 intrinsics?

It seems there is little discussion about chgemm (int = char*char) matrix multiplication. Let's assume M%8=0, N%8=0, K%8=0, and that B is transposed. I recall that a CPU supporting AVX2 has only 16 ymm registers, so I tried a blocked 2x8 kernel to make maximal use of the registers. However, I couldn't find any better solution (e.g. modifying the algorithm to move the loads of pb to an outer loop). Another issue I am worried about is the latency of the sum reduction (permute, srli, add).
I also tried 4x8 and 8x8; 8x8 seems to severely degrade performance.
Could anyone help me to further optimize this code? Thanks!
void _chgemm_mm_u_c_N_T_2x8(
size_t M, size_t N, size_t K, float scaleAB,
unsigned char *A, size_t lda, signed char *B, size_t ldb,
float scaleT, int *C, size_t ldc)
{
int h = M;
int w = N;
int d = K;
int i, j, k;
__m256i tmp_short = _mm256_set1_epi16(1);
for (i = 0; i < h; i += 2) {
__m256i pc0, pc1, pc2, pc3;
for (j = 0; j < w; j += 8 ) {
unsigned char *pa0 = A + i * lda;
unsigned char *pa1 = pa0 + 1*lda;
signed char *pb0 = (signed char*)B + j*ldb;
signed char *pb1 = pb0 + 1*ldb;
signed char *pb2 = pb0 + 2*ldb;
signed char *pb3 = pb0 + 3*ldb;
signed char *pb4 = pb0 + 4*ldb;
signed char *pb5 = pb0 + 5*ldb;
signed char *pb6 = pb0 + 6*ldb;
signed char *pb7 = pb0 + 7*ldb;
int *pc = (int*)C + i * ldc + j;
__m256i ma0, ma1; //ma2, ma3, ma4, ma5, ma6, ma7;
__m256i mb0, mb1, mb2, mb3, mb4, mb5, mb6, mb7;
__m256i mc0, mc1; //mc2, mc3, mc4, mc5, mc6, mc7;
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
__m256i sum4 = _mm256_setzero_si256();
__m256i sum5 = _mm256_setzero_si256();
__m256i sum6 = _mm256_setzero_si256();
__m256i sum7 = _mm256_setzero_si256();
__m256i sum8 = _mm256_setzero_si256();
__m256i sum9 = _mm256_setzero_si256();
__m256i sum10 = _mm256_setzero_si256();
__m256i sum11 = _mm256_setzero_si256();
__m256i sum12 = _mm256_setzero_si256();
__m256i sum13 = _mm256_setzero_si256();
__m256i sum14 = _mm256_setzero_si256();
__m256i sum15 = _mm256_setzero_si256();
for (k = 0; k < d; k += 32) {
//__m128i low0, low1, low2, low3;
//__m128i hi0, hi1, hi2, hi3;
ma0 = _mm256_loadu_si256((__m256i*)pa0);
ma1 = _mm256_loadu_si256((__m256i*)pa1);
mb0 = _mm256_loadu_si256((__m256i*)pb0);
mb1 = _mm256_loadu_si256((__m256i*)pb1);
mb2 = _mm256_loadu_si256((__m256i*)pb2);
mb3 = _mm256_loadu_si256((__m256i*)pb3);
mb4 = _mm256_loadu_si256((__m256i*)pb4);
mb5 = _mm256_loadu_si256((__m256i*)pb5);
mb6 = _mm256_loadu_si256((__m256i*)pb6);
mb7 = _mm256_loadu_si256((__m256i*)pb7);
_mm_prefetch((char *)pa0 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pa1 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb0 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb1 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb2 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb3 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb4 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb5 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb6 + 32, _MM_HINT_T0);
_mm_prefetch((char *)pb7 + 32, _MM_HINT_T0);
mc0 = _mm256_maddubs_epi16(ma0, mb0);
sum0 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum0);
mc0 = _mm256_maddubs_epi16(ma0, mb1);
sum1 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum1);
mc0 = _mm256_maddubs_epi16(ma0, mb2);
sum2 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum2);
mc0 = _mm256_maddubs_epi16(ma0, mb3);
sum3 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum3);
mc0 = _mm256_maddubs_epi16(ma0, mb4);
sum4 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum4);
mc0 = _mm256_maddubs_epi16(ma0, mb5);
sum5 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum5);
mc0 = _mm256_maddubs_epi16(ma0, mb6);
sum6 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum6);
mc0 = _mm256_maddubs_epi16(ma0, mb7);
sum7 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum7);
//
mc0 = _mm256_maddubs_epi16(ma1, mb0);
sum8 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum8);
mc0 = _mm256_maddubs_epi16(ma1, mb1);
sum9 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum9);
mc0 = _mm256_maddubs_epi16(ma1, mb2);
sum10 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum10);
mc0 = _mm256_maddubs_epi16(ma1, mb3);
sum11 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum11);
mc0 = _mm256_maddubs_epi16(ma1, mb4);
sum12 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum12);
mc0 = _mm256_maddubs_epi16(ma1, mb5);
sum13 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum13);
mc0 = _mm256_maddubs_epi16(ma1, mb6);
sum14 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum14);
mc0 = _mm256_maddubs_epi16(ma1, mb7);
sum15 = _mm256_add_epi32(_mm256_madd_epi16(mc0, tmp_short), sum15);
//
pa0+=32; pa1+=32; //pa2+=32; pa3+=32;
pb0+=32; pb1+=32; pb2+=32; pb3+=32;
pb4+=32; pb5+=32; pb6+=32; pb7+=32;
}
sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81));
sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8));
sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4));
sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81));
sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8));
sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4));
sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, sum2, 0x81));
sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 8));
sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 4));
sum3 = _mm256_add_epi32(sum3, _mm256_permute2x128_si256(sum3, sum3, 0x81));
sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 8));
sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 4));
sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, sum4, 0x81));
sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 8));
sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 4));
sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, sum5, 0x81));
sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 8));
sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 4));
sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, sum6, 0x81));
sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 8));
sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 4));
sum7 = _mm256_add_epi32(sum7, _mm256_permute2x128_si256(sum7, sum7, 0x81));
sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 8));
sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 4));
sum8 = _mm256_add_epi32(sum8, _mm256_permute2x128_si256(sum8, sum8, 0x81));
sum8 = _mm256_add_epi32(sum8, _mm256_srli_si256(sum8, 8));
sum8 = _mm256_add_epi32(sum8, _mm256_srli_si256(sum8, 4));
sum9 = _mm256_add_epi32(sum9, _mm256_permute2x128_si256(sum9, sum9, 0x81));
sum9 = _mm256_add_epi32(sum9, _mm256_srli_si256(sum9, 8));
sum9 = _mm256_add_epi32(sum9, _mm256_srli_si256(sum9, 4));
sum10 = _mm256_add_epi32(sum10, _mm256_permute2x128_si256(sum10, sum10, 0x81));
sum10 = _mm256_add_epi32(sum10, _mm256_srli_si256(sum10, 8));
sum10 = _mm256_add_epi32(sum10, _mm256_srli_si256(sum10, 4));
sum11 = _mm256_add_epi32(sum11, _mm256_permute2x128_si256(sum11, sum11, 0x81));
sum11 = _mm256_add_epi32(sum11, _mm256_srli_si256(sum11, 8));
sum11 = _mm256_add_epi32(sum11, _mm256_srli_si256(sum11, 4));
sum12 = _mm256_add_epi32(sum12, _mm256_permute2x128_si256(sum12, sum12, 0x81));
sum12 = _mm256_add_epi32(sum12, _mm256_srli_si256(sum12, 8));
sum12 = _mm256_add_epi32(sum12, _mm256_srli_si256(sum12, 4));
sum13 = _mm256_add_epi32(sum13, _mm256_permute2x128_si256(sum13, sum13, 0x81));
sum13 = _mm256_add_epi32(sum13, _mm256_srli_si256(sum13, 8));
sum13 = _mm256_add_epi32(sum13, _mm256_srli_si256(sum13, 4));
sum14 = _mm256_add_epi32(sum14, _mm256_permute2x128_si256(sum14, sum14, 0x81));
sum14 = _mm256_add_epi32(sum14, _mm256_srli_si256(sum14, 8));
sum14 = _mm256_add_epi32(sum14, _mm256_srli_si256(sum14, 4));
sum15 = _mm256_add_epi32(sum15, _mm256_permute2x128_si256(sum15, sum15, 0x81));
sum15 = _mm256_add_epi32(sum15, _mm256_srli_si256(sum15, 8));
sum15 = _mm256_add_epi32(sum15, _mm256_srli_si256(sum15, 4));
pc[0] = _mm256_extract_epi32(sum0, 0);
pc[1] = _mm256_extract_epi32(sum1, 0);
pc[2] = _mm256_extract_epi32(sum2, 0);
pc[3] = _mm256_extract_epi32(sum3, 0);
pc[4] = _mm256_extract_epi32(sum4, 0);
pc[5] = _mm256_extract_epi32(sum5, 0);
pc[6] = _mm256_extract_epi32(sum6, 0);
pc[7] = _mm256_extract_epi32(sum7, 0);
pc[ldc+0] = _mm256_extract_epi32(sum8, 0);
pc[ldc+1] = _mm256_extract_epi32(sum9, 0);
pc[ldc+2] = _mm256_extract_epi32(sum10, 0);
pc[ldc+3] = _mm256_extract_epi32(sum11, 0);
pc[ldc+4] = _mm256_extract_epi32(sum12, 0);
pc[ldc+5] = _mm256_extract_epi32(sum13, 0);
pc[ldc+6] = _mm256_extract_epi32(sum14, 0);
pc[ldc+7] = _mm256_extract_epi32(sum15, 0);
}
}
}
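One observation on the reduction-latency worry (a sketch only, not benchmarked against the kernel above, so treat it as an assumption): the eight accumulators of each output row can be reduced together with _mm256_hadd_epi32 instead of running a permute/srli/add chain per register, which also replaces the eight _mm256_extract_epi32 calls per row with a single vector store. A possible helper:
#include <immintrin.h>

// Reduce eight __m256i accumulators to one vector holding their horizontal
// sums: result = { hsum(s0), hsum(s1), ..., hsum(s7) }.
static inline __m256i hsum8_epi32(__m256i s0, __m256i s1, __m256i s2, __m256i s3,
                                  __m256i s4, __m256i s5, __m256i s6, __m256i s7)
{
    // Two hadd levels: the low lane of t0123 holds the 4-element partial sums
    // of elements 0..3 of s0..s3, the high lane those of elements 4..7.
    __m256i t01   = _mm256_hadd_epi32(s0, s1);
    __m256i t23   = _mm256_hadd_epi32(s2, s3);
    __m256i t45   = _mm256_hadd_epi32(s4, s5);
    __m256i t67   = _mm256_hadd_epi32(s6, s7);
    __m256i t0123 = _mm256_hadd_epi32(t01, t23);
    __m256i t4567 = _mm256_hadd_epi32(t45, t67);
    // Fold the high 128-bit lane onto the low one to finish each sum.
    __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(t0123),
                               _mm256_extracti128_si256(t0123, 1));
    __m128i hi = _mm_add_epi32(_mm256_castsi256_si128(t4567),
                               _mm256_extracti128_si256(t4567, 1));
    // { hsum(s0), ..., hsum(s3), hsum(s4), ..., hsum(s7) }
    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
With something like this, the epilogue of the 2x8 kernel could become two calls plus two _mm256_storeu_si256 stores to pc and pc + ldc instead of sixteen scalar extracts.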
