How to convert this assembly code to intrinsic code?

How to convert this assembly code to intrinsic code? - c

The code below looks like it could be written with intrinsics, but I am not familiar with intrinsic functions. Please help me convert the real code. testFunc() in particular is unclear to me.
I guess it also computes a dot product of two float vectors, but the labels Lrep and Lexit confuse me.
Please explain clearly what it does.
Also, are intrinsics available on mobile processors?
// Computes the matrix product C = A * B in single precision.
//
// All matrices are dense and row-major:
//   A : M x K   (input)
//   B : K x N   (input)
//   C : M x N   (output, every element is overwritten)
//
// B is first copied into a transposed scratch buffer so that each output
// element becomes a dot product of two contiguous K-element vectors. The dot
// product is accumulated 8 floats at a time by an MSVC inline-assembly block
// that uses 32-bit x86 registers and the vfmadd231ps instruction, so this
// function only builds with MSVC for x86 and needs a CPU with AVX and FMA.
void testFunc(int M, int N, int K, float* A, float* B, float* C)
{
    float *a;
    float *b = new float[K*N];      // scratch: B transposed (N rows of K floats)
    float *pointb = B;
    float *bb;
    float *answer = C;
    float c[8];                     // the 8 partial sums stored by the asm block

    // Transpose B into b: b[k*K + j] = B[j*N + k].
    for (int j = 0, k; j < K; j++) {
        bb = b + j;
        for (k = N / 8; k > 0; k--) {           // manually unrolled by 8
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
            *bb = *pointb++; bb += K;
        }
        for (k = N / 8 * 8; k < N; k++) {       // remaining N % 8 elements of the row
            *bb = *pointb++; bb += K;
        }
    }

    int K8 = K / 8 * 8;             // K rounded down to a multiple of 8
    for (int i = 0; i < M; i++) for (int k = 0; k < N; k++) {
        a = A + i * K;              // row i of A
        bb = b + k * K;             // column k of B (row k of the transposed copy)
        __asm {
            mov esi, K8;            // esi = (K8 - 8) * 4: byte offset of the last full 8-float chunk
            sub esi, 8;
            shl esi, 2;
            xor edi, edi;           // edi = current byte offset, starts at 0
            mov edx, a;
            mov ebx, bb;
            vxorps ymm3, ymm3, ymm3;    // ymm3 = 8 partial sums, all zero
        Lrep:
            cmp edi, esi;
            jg Lexit;               // signed compare: leaves at once when K < 8 (esi is negative)
            vmovups ymm0, ymmword ptr[edx + edi];
            vfmadd231ps ymm3, ymm0, ymmword ptr[ebx + edi];   // ymm3 += a[...] * bb[...]
            add edi, 32;            // advance by 8 floats
            jmp Lrep;
        Lexit:
            vmovups ymmword ptr[c], ymm3;   // spill the 8 partial sums to c[0..7]
        }
        // Scalar tail: the K % 8 trailing products are folded into c[0].
        for (int j = K8; j < K; ) {
            *c += *(a + j) * *(bb + j); j++;
        }
        // Horizontal sum of the partial sums gives C[i*N + k].
        *answer = (c[0] + c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7]);
        answer++;
    }

    // Fix: the transposed scratch buffer used to be leaked on every call.
    delete[] b;
}
and
// Fragment: accumulates C += A * B for row-major matrices (A is M x K, B is
// K x N, C is M x N, as implied by the pointer arithmetic below). For each
// (k, i) pair it adds A[i][k] * B[k][0..N-1] to row i of C.
// The declarations of k, i, j, pA, pB, pC, A, B, C, M, N and K are not part
// of this fragment.
pA = A;
for (k = 0; k < K; k++) {
// pC walks over all of C once per k; each i iteration advances it by N.
pC = C;
for (i = 0; i < M; i++) {
// pA -> A[i][k] (the scalar that is broadcast), pB -> start of row k of B.
pA = A + i * K + k;
pB = B + k * N;
// Main loop: 32 floats per iteration, written as four identical 8-float
// inline-asm blocks.
for (j = N / 32; j > 0; j--) {
_asm {
// ymm0 = 8 floats of C, ymm4 = *pA replicated 8 times, ymm2 = 8 floats of B;
// then C[0..7] = ymm0 + ymm4 * ymm2 is written back to the same place.
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
// Second 8-float block (same instructions as the first).
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
// Third 8-float block.
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
// Fourth 8-float block.
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
}
// Scalar tail: the remaining N % 32 elements of the row, one float at a time.
for (j = N / 32 * 32; j < N; j++) {
*pC += *pA * *pB;
pC += 1; pB += 1;
}
}
}

In intrinsics, it's this code repeated 4 times.
{
// vmovups ymm0, ymmword ptr[eax];
__m256 tempC = _mm256_loadu_ps((float*)pC);
// vmovss xmm1, dword ptr[ebx];
// vbroadcastss ymm4, xmm1;
__m256 tempA = _mm256_set1_ps(*pA);
// vmovups ymm2, ymmword ptr[ecx];
__m256 tempB = _mm256_loadu_ps((float*)pB);
// vfmadd231ps ymm0, ymm4, ymm2;
__m256 result = _mm256_fmadd_ps(tempA, tempB, tempC);
// vmovups ymmword ptr[eax], ymm0;
_mm256_storeu_ps(pC, result);
}
pC += 8; pB += 8;
Constantly broadcasting the same value from pA seems a bit redundant though.

2 vector loads (from the same position in 2 arrays) feeding an FMA into a vector accumulator smells like a dot-product to me.
I didn't check the asm reference manual to see that the destination operand was the sum rather than 1 of the multiplicands, but that's the way that makes sense.
The triple-nested loop looks like a matrix multiplication. It broadcasts 1 input while doing a vector load from the other to feed an FMA, so probably it's generating a SIMD vector of results for an output row.
Using MSVC inline asm syntax for this is pretty bad; it can only accept inputs via memory operands so it forces a reload + store between each block of asm. If you're going to unroll, use one big asm statement and use displacements in the addressing modes.
IDK why the dot-product loop is written inefficiently (with both a conditional and unconditional branch inside the loop), and not unrolled with multiple accumulators. Pretty much defeats the purpose of hand-coding in asm. See Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? for how to use multiple accumulators to hide FMA latency. Or let clang do it for you when unrolling+vectorizing a pure C loop.
I also don't know why it doesn't horizontal-sum the result, but instead just stores it to memory with vmovups [c], ymm3. Seems pointless. I guess the caller has to reload from memory and sum, or you could declare the function as returning a __m256 vector and ignore the store.
Anyway, you can obviously write a dot-product in scalar C code, perhaps using fma(a[i], b[i], sum) from math.h to replicate the asm's behaviour of not rounding the temporary result.
Or copy the manual vectorization with intrinsics like sum = _mm256_fmadd_ps(_mm256_loadu_ps(a[i]), _mm256_loadu_ps(b[i]), sum); or something. (See Intel's intrinsics guide).

I'll do the first couple of lines to get you started, but really, if you can't read the assembly you'll need to refer to the Intel CPU manual to be able to decipher it.
mov esi, K8;
sub esi, 8;
shl esi, 2;
xor edi, edi;
mov edx, a;
mov ebx, bb;
mov esi, K8: copy the contents of K8 into esi
sub esi, 8: subtract 8 from the value in esi
shl esi, 2: shift esi left by 2 bits and store the result back in esi
xor edi, edi: XOR edi with itself (the result is always 0; the reason becomes clear once you understand binary and how registers work)
mov edx, a: copy the contents of a into edx
mov ebx, bb: copy the contents of bb into ebx
From here on, depending on your current level of knowledge, you will need to familiarise yourself with binary arithmetic, basic CPU architecture, and the assembly-language instructions and operands relevant to your problem. Once you can read each line, you can decipher the blocks, and finally the whole program.

Related

Using AVX to improve performance of float subtract, divide, truncate to int32

Trying to use AVX to improve performance of the following
__declspec(dllexport) void __cdecl calculate_quantized_vertical_values(long length, float min, float step, float* source, unsigned long* destination)
{
for (long i = 0; i < length; i++)
{
destination[i] = (source[i] - min) / step;
}
}
by replacing it with
__declspec(dllexport) void __cdecl calculate_quantized_vertical_values_avx(long length, float min, float step, float* source, unsigned long* destination)
{
long multiple8end = ((long)(length / 8)) * 8;
__m256 min256 = _mm256_broadcast_ss((const float*)&min);
__m256 step256 = _mm256_broadcast_ss((const float*)&step);
for (long i = 0; i < multiple8end; i+=8)
{
__m256 value256 = _mm256_load_ps((const float*)(source + i));
__m256 offset256 = _mm256_sub_ps(value256, min256);
__m256 floatres256 = _mm256_div_ps(offset256, step256);
__m256i long256 = _mm256_cvttps_epi32(floatres256);
_mm256_store_si256((__m256i*)(destination + i), long256);
}
for (long i = multiple8end; i < length; i ++)
{
destination[i] = (source[i] - min) / step;
}
}
The original loop takes around 330ms with my 55M element source array and the contents of the loop compile to
loc_180001050:
movss xmm0, dword ptr [r10+rcx-4]
subss xmm0, xmm3
divss xmm0, xmm2
cvttss2si rax, xmm0
mov [rcx-4], eax
movss xmm1, dword ptr [r10+rcx]
subss xmm1, xmm3
divss xmm1, xmm2
cvttss2si rax, xmm1
mov [rcx], eax
movss xmm0, dword ptr [r10+rcx+4]
subss xmm0, xmm3
divss xmm0, xmm2
cvttss2si rax, xmm0
mov [rcx+4], eax
movss xmm1, dword ptr [r10+rcx+8]
subss xmm1, xmm3
divss xmm1, xmm2
cvttss2si rax, xmm1
mov [rcx+8], eax
add rcx, 10h
sub r8, 1
jnz short loc_180001050
The AVX loop takes around 170ms over the same 55M element source array and the contents of the (main) loop compile to:
loc_180001160:
vmovups ymm0, ymmword ptr [r8+rdx]
lea rdx, [rdx+20h]
vsubps ymm1, ymm0, ymm6
vdivps ymm2, ymm1, ymm7
vcvttps2dq ymm3, ymm2
vmovdqu ymmword ptr [rdx-20h], ymm3
sub rax, 1
jnz short loc_180001160
So there IS a performance improvement with AVX but I wonder if it's possible to get a more significant performance improvement or this is about the limit for this particular calculation
Edit: I should also mention that I'm calling these DLL functions from a .NET app if it makes any difference.
Edit: I would ideally want unsigned char array for destination but sticking with int32 for now because I've not found a way to do the float -> unsigned char conversion with AVX
Also multiplication by 1.f/step instead of division by step should be fine for me if it improves performance

If you scale by 1/step instead of dividing by step you should be significantly faster, unless you are limited by memory-throughput. If you factor out the subtraction of min, you are also able to use FMA instructions, if they are available:
void calculate_quantized_vertical_values_avx(size_t length, float min, float step, float* source, uint32_t* destination)
{
size_t multiple8end = ((length / 8)) * 8;
const float scale = 1.f/step;
const float offset = -min * scale;
const __m256 scale256 = _mm256_set1_ps(scale);
const __m256 offset256 = _mm256_set1_ps(offset);
for (size_t i = 0; i < multiple8end; i+=8)
{
__m256 value256 = _mm256_load_ps((const float*)(source + i));
#ifdef __FMA__
__m256 floatres256 = _mm256_fmadd_ps(value256, scale256, offset256);
#else
__m256 floatres256 = _mm256_add_ps(_mm256_mul_ps(value256, scale256), offset256);
#endif
__m256i long256 = _mm256_cvttps_epi32(floatres256);
_mm256_store_si256((__m256i*)(destination + i), long256);
}
for (size_t i = multiple8end; i < length; i ++)
{
destination[i] = (source[i] * scale) + offset;
}
}
If you want to convert the result to uint8, have a look at _mm256_packus_epi32 and _mm256_packus_epi16 (or _mm_packus_epi32 and _mm_packus_epi16 if you don't have AVX2).

Convert 32 bit GDI bitmap to Grayscale by manipulating bmBits

I program only on windows (AutoHotkey) and mainly use its win32 api.
I want to convert a GDI Bitmap (32bit DIB) to grayscale.
I use GetObject() to obtain a BITMAP structure and pass bmBITS and it size to following function
int ToGrayscale(unsigned char * s, int n)
{
float b, g, r, y;
for (int i=0; i<n; i+=4)
{
b = (float) s[i+0];
g = (float) s[i+1];
r = (float) s[i+2];
y = (0.299 * r) + (0.587 * g) + (0.114 * b);
s[i+0] = s[i+1] = s[i+2] = (unsigned char)y;
}
return 1;
}
The above is the full code. I compile it to an .obj with Pelles C, extract the machine code from .obj
and call/use that machine code from AutoHotkey language.
The machine code when called gives me access violation error.
s[i+0] = s[i+1] = s[i+2] = 0;
works fine and fills the image to black, but I want to convert the pixels with y, the grayscale value.
What is the correct way of doing this?.
I'm not sure if I've provided enough info. Please ask/suggest and I will update it.
EDIT 1:
I added one more working function ToBlack() to the c file.
/*
 * Blackens a buffer of 4-byte pixels in place.
 *
 * s : pixel bytes; n : number of bytes to process.
 * For every 4-byte step below n, bytes 0..2 of the pixel are set to 0 and
 * byte 3 is left untouched. Always returns 1.
 */
int ToBlack(unsigned char * s, int n)
{
    int offset = 0;

    while (offset < n)
    {
        s[offset + 2] = 0;
        s[offset + 1] = 0;
        s[offset + 0] = 0;
        offset += 4;
    }

    return 1;
}
int ToGrayscale(unsigned char * s, int n)
{
float b, g, r, y;
for (int i=0; i<n; i+=4)
{
b = (float) s[i+0];
g = (float) s[i+1];
r = (float) s[i+2];
y = (0.299 * r) + (0.587 * g) + (0.114 * b);
s[i+0] = s[i+1] = s[i+2] = (unsigned char)y;
}
return 1;
}
The source bitmap is a 2x2 32bit opaque bitmap with single ARGB color FFAABBCC
The bmBITS size is 16 bytes and the hex representation of data is
CCBBAAFFCCBBAAFFCCBBAAFFCCBBAAFF
ToBlack() works fine and when I inspect bmBITS after running the machine code,
I get following results - correctly.
ToBlack(bmBITS, 16) Result: 000000FF000000FF000000FF000000FF
ToBlack(bmBITS, 12) Result: 000000FF000000FF000000FFCCBBAAFF
ToBlack(bmBITS, 8) Result: 000000FF000000FFCCBBAAFFCCBBAAFF
ToBlack(bmBITS, 4) Result: 000000FFCCBBAAFFCCBBAAFFCCBBAAFF
With ToGrayscale(bmBITS, 16) data is unchanged.
I'm guessing that the crash is occurring when y is being assigned.

I found the source for the Access violation error.
The machine code isn't portable.
For the following c code
int ToGrayscale(unsigned char * s, int n)
{
float b, g, r, y;
for (int i=0; i<n; i+=4)
{
b = (float) s[i+0];
g = (float) s[i+1];
r = (float) s[i+2];
y = (0.299 * r) + (0.587 * g) + (0.114 * b);
s[i+0] = s[i+1] = s[i+2] = (unsigned char)y;
}
return 1;
}
the .obj dump is as follows:
Dump of D:\AhkScripts\AHK-003\ToGrayscale.obj
File type: OBJ
_ToGrayscale:
[00000000] 55 push ebp
[00000001] 89E5 mov ebp,esp
[00000003] 83EC10 sub esp,10
[00000006] 56 push esi
[00000007] 8B5508 mov edx,dword ptr [ebp+8]
[0000000A] 8B4D0C mov ecx,dword ptr [ebp+C]
[0000000D] 85C9 test ecx,ecx
[0000000F] 7E6C jle 0000007D
[00000011] 31F6 xor esi,esi
[00000013] 0FB60432 movzx eax,byte ptr [edx+esi]
[00000017] 50 push eax
[00000018] DB0424 fild dword ptr [esp]
[0000001B] 58 pop eax
[0000001C] D95DFC fstp dword ptr [ebp-4]
[0000001F] 0FB6443201 movzx eax,byte ptr [edx+esi+1]
[00000024] 50 push eax
[00000025] DB0424 fild dword ptr [esp]
[00000028] 58 pop eax
[00000029] D95DF8 fstp dword ptr [ebp-8]
[0000002C] 0FB6443202 movzx eax,byte ptr [edx+esi+2]
[00000031] 50 push eax
[00000032] DB0424 fild dword ptr [esp]
[00000035] 58 pop eax
[00000036] D95DF4 fstp dword ptr [ebp-C]
[00000039] D945F4 fld dword ptr [ebp-C]
[0000003C] DC0D00000000 fmul qword ptr [#37]
[00000042] D945F8 fld dword ptr [ebp-8]
[00000045] DC0D00000000 fmul qword ptr [#38]
[0000004B] DEC1 faddp st(1),st
[0000004D] D945FC fld dword ptr [ebp-4]
[00000050] DC0D00000000 fmul qword ptr [#39]
[00000056] DEC1 faddp st(1),st
[00000058] D95DF0 fstp dword ptr [ebp-10]
[0000005B] D945F0 fld dword ptr [ebp-10]
[0000005E] E800000000 call ___ftouc
[00000063] 88443202 mov byte ptr [edx+esi+2],al
[00000067] D945F0 fld dword ptr [ebp-10]
[0000006A] E800000000 call ___ftouc
[0000006F] 88443201 mov byte ptr [edx+esi+1],al
[00000073] 880432 mov byte ptr [edx+esi],al
[00000076] 83C604 add esi,4
[00000079] 39CE cmp esi,ecx
[0000007B] 7C96 jl 00000013
[0000007D] B801000000 mov eax,1
[00000082] 5E pop esi
[00000083] 89EC mov esp,ebp
[00000085] 5D pop ebp
[00000086] C3 ret
SUMMARY
28 .drectve
18 .rdata
87 .text
There seems to be calls to a non-existent function named ___ftouc which I suppose
is Float to Unsigned Char, and it will be available only when linked.
(Please correct me if I'm wrong)
Solution? Do the math in a way that doesn't require a floating-point type cast.
The following code works just fine in x86 / x64.
/*
 * Converts a buffer of 4-byte pixels to grayscale in place using integer
 * arithmetic only (no float-to-integer conversion helper is needed).
 *
 * s : pixel bytes; bytes 0, 1, 2 of each pixel are weighted 114, 587, 299
 *     (out of 1000) and all three are replaced by the weighted average;
 *     byte 3 is left untouched.
 * n : number of bytes in s.
 *
 * Only complete 4-byte pixels are processed: a trailing partial pixel
 * (n not a multiple of 4) is left unchanged instead of being read and
 * written past the end of the buffer.
 */
void ToGrayscale(unsigned char * s, int n)
{
    /* n - i >= 4 guarantees s[i] .. s[i+3] all lie inside the buffer */
    for (int i = 0; n - i >= 4; i += 4)
    {
        /* the sum is at most 255 * 1000, so the quotient always fits in a byte */
        unsigned char y = (unsigned char)(( (s[i+0]*114) + (s[i+1]*587) + (s[i+2]*299) ) / 1000);
        s[i+0] = s[i+1] = s[i+2] = y;
    }
}

Reviewing IF conditional code to save CPU cycles

I am reviewing the usage of if condition in my program, in there, I have lines like the following:
if(count > 4) count = 4;
Would it be a good idea to write the above if conditional statement as the following non-branched one?
count = 4*(count> 4) + count*(count<= 4);
I also have the following snippet there:
for (j=0, i=0; j<NCARD_PER_SUIT && i<CARDS_PER_PLAYER+CARDS_ON_BOARD; ++j) {
if (card_cfg.hearts & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = HEART;
}
if (card_cfg.spades & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = SPADE;
}
if (card_cfg.clubs & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = CLUB;
}
if (card_cfg.diamonds & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = DIAMOND;
}
}
and wondering if there is good (non-branched) way to write the above, any suggestions?
EDIT: Based on some feedback below, i compared the assembly instructions (using MSVS2015 for Windows 10) and got the following:
; 718 : count = 4*(count> 4) + count*(count<= 4);
xor ebx, ebx
cmp edx, 4
setle bl
xor ecx, ecx
imul ebx, edx
cmp edx, 4
mov edx, 4
cmovg ecx, edx
add ebx, ecx
And if I revert to the if statement, I get the following, which contains no jump instruction and has only about two thirds as many instructions as the above:
; 718 : if( count >4) count = 4;
mov eax, DWORD PTR _i$6$[ebp]
cmp edx, edi
mov ebx, DWORD PTR _player$GSCopy$1$[ebp]
cmovg edx, edi
mov edi, DWORD PTR _count$1$[ebp]
mov DWORD PTR _count$4$[ebp], edx
EDIT #2: Based on the tip from the comments below, i went ahead and created a
union
typedef union {
struct cfg {
unsigned short hearts;
unsigned short spades;
unsigned short clubs;
unsigned short diamonds;
} suit;
unsigned long long allsuits;
} card_cfg_t;
And with the help of this union, I was able to rewrite the second snippet of the OP as follows, which seems to save a lot of time (20% in my case) if I build it for a 64-bit machine, but takes more time (an extra 40%) if I build it for a 32-bit machine:
/*
 * Fills player_hand[] from the four 16-bit suit masks packed in
 * card_cfg.allsuits, without an if per suit: every (face j, suit k) slot is
 * written unconditionally and the write index i only advances when the card
 * is actually present, so a slot written for an absent card is overwritten
 * by the next candidate.
 *
 * NOTE(review): storing k as the suit assumes the suit constants are numbered
 * in the same order as the fields of the union's struct (hearts, spades,
 * clubs, diamonds), and testing bit group k of allsuits assumes a
 * little-endian layout of the union -- confirm both.
 *
 * Fixes relative to the previous version:
 *  - the mask is widened to 64 bits before shifting; shifting it by 32 or 48
 *    in a narrower type is undefined / loses the bits for clubs and diamonds;
 *  - the inner loop also stops once the hand is full, so the unconditional
 *    store can no longer write one element past the last valid slot.
 */
for (j=0, i=0; j<NCARD_PER_SUIT && i<CARDS_PER_PLAYER+CARDS_ON_BOARD; ++j) {
    for (int k=0; k<4 && i<CARDS_PER_PLAYER+CARDS_ON_BOARD; ++k) {
        /* present = 1 if face j of suit k is set, else 0 */
        present = (int)((card_cfg.allsuits & ((unsigned long long)cfg_mask[j] << (16*k))) != 0);
        player_hand[i].card.face = j*present;
        player_hand[i].card.suit = k;
        i = i + present;
    }
}

These micro-optimisations do not make much sense, but if you want to compare (you will see the difference between your version and mine - switch on optimisation - the compiler is really good at it):
int count;
void foo()
{
count = 4*(count> 4) + count*(count <= 4);
}
void foo1()
{
count = count > 4 ? 4 : count;
}
void foo4()
{
if(count> 4) count = 4;
}
foo:
mov edx, DWORD PTR count[rip]
xor ecx, ecx
cmp edx, 4
setle al
setg cl
movzx eax, al
imul eax, edx
lea eax, [rax+rcx*4]
mov DWORD PTR count[rip], eax
ret
foo1:
cmp DWORD PTR count[rip], 4
mov eax, 4
cmovle eax, DWORD PTR count[rip]
mov DWORD PTR count[rip], eax
ret
foo4:
cmp DWORD PTR count[rip], 4
jle .L6
mov DWORD PTR count[rip], 4
.L6:
rep ret

The answer to the second loop must be something like:
pushcards(player, popcards(dealer));

Why _mm256_load_pd compiled to MOVUPD instead of MOVAPD?

Why the following code results unaligned AVX instructions ( MOVUPD instead of MOVAPD)? I compiled this on Visual Studio 2015. How can I tell the compiler that my data is indeed aligned?
const size_t ALIGN_SIZE = 64;
const size_t ARRAY_SIZE = 1024;
double __declspec(align(ALIGN_SIZE)) a[ARRAY_SIZE];
double __declspec(align(ALIGN_SIZE)) b[ARRAY_SIZE];
//Calculate the dotproduct
__m256d ymm0 = _mm256_set1_pd(0.0);
for (int i = 0; i < ARRAY_SIZE; i += 8)
{
__m256d ymm1 = _mm256_load_pd(a + i);
__m256d ymm2 = _mm256_load_pd(b + i);
__m256d ymm3 = _mm256_mul_pd(ymm1, ymm2);
ymm0 = _mm256_add_pd(ymm3, ymm0);
__m256d ymm4 = _mm256_load_pd(a + i + 4);
__m256d ymm5 = _mm256_load_pd(b + i + 4);
__m256d ymm6 = _mm256_mul_pd(ymm4, ymm5);
ymm0 = _mm256_add_pd(ymm6, ymm0);
}
Assembly of the loop:
00007FF7AC7A1400 vmovupd ymm1,ymmword ptr [rbp+rax*8+2020h]
00007FF7AC7A1409 vmulpd ymm3,ymm1,ymmword ptr [rbp+rax*8+20h]
00007FF7AC7A140F vmovupd ymm2,ymmword ptr [rbp+rax*8]
00007FF7AC7A1415 vmulpd ymm0,ymm2,ymmword ptr b[rax*8]
00007FF7AC7A141E add r8d,8
00007FF7AC7A1422 movsxd rax,r8d
00007FF7AC7A1425 vaddpd ymm1,ymm0,ymm4
00007FF7AC7A1429 vaddpd ymm4,ymm1,ymm3
00007FF7AC7A142D cmp rax,400h
00007FF7AC7A1433 jb main+70h (07FF7AC7A1400h)

There is a way to solve this problem (it allows the instruction VMOVDQA (an analogue of MOVAPD) to be used instead of MOVUPD):
inline __m256d Load(const double * p)
{
#ifdef _MSC_VER
return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p));
#else
return _mm256_load_pd(p);
#endif
}
Analogous solution for float type:
inline __m256 Load(const float * p)
{
#ifdef _MSC_VER
return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p));
#else
return _mm256_load_ps(p);
#endif
}
But in order to cheat Visual Studio compiler you have to use dynamically allocated pointers. Otherwise compiler doesn't use VMOVDQA instruction.
#include <immintrin.h>
int main()
{
float * ps = (float*)_mm_malloc(40, 32);
double * pd = (double*)_mm_malloc(40, 32);
__m256 s = Load(ps);
//00007FF79FF81325 vmovdqa ymm1,ymmword ptr [rdi]
__m256d d = Load(pd);
//00007FF79FF8132F vmovdqa ymm0,ymmword ptr [rax]
_mm256_storeu_ps(ps, s);
_mm256_storeu_pd(pd, d);
_mm_free(ps);
_mm_free(pd);
}

Failure of -mavx optimization with gcc?

EDIT partial solution below (EDIT 2), but I have still one question (see at the end)
I am trying to compile the following C program with gcc-4.9.2, on Windows 7, 32 bits, running on a Pentium G3220 (according to Windows System Information). If I understand correctly, this processor does not have AVX extensions, so it's quite natural that something happens, I am just unsure about what exactly. Initially, I was playing with optimizations with gcc, and I tried -mavx rather "accidentally".
The following program computes permutations of numbers 0 ... n-1 (with n given as an argument) in lexicographic order, and also rank of each permutation (its position in this sequential order), and "unrank" (recover permutation from rank), and checks that all of these are correct. It should only print "OK" or "Error" in the end.
With gcc -O3, the program runs correctly with all integer input I checked (1 <= n <= 11).
With gcc -O3 -mavx, it runs correctly for 1 <= n <= 7, and for n >= 8, it prints nothing, and actually it does nothing (almost no delay before exiting). I get no message from the program or from Windows (I would have expected maybe a crash with an unknown instruction, but it didn't happen).
(On another computer with Windows 7 64 bits, on a core-i5, and the same gcc-4.9.2, the program seems to run fine with or without -mavx, when compiled either in 32 or 64 bits)
What I don't understand is why it runs correctly for some input values, and fails for other ones. Does anybody have some hint about this?
Here is the full program, followed by a shorter one with the same problem.
#include <stdlib.h>
#include <stdio.h>
#define SWAP(a,b) {int c; c = a; a = b; b = c;}
int next_perm(int n, int a[n]) {
int i, j, k;
for(i = n - 1; i > 0 && a[i - 1] > a[i]; i--);
for(j = i, k = n - 1; j < k; j++, k--) SWAP(a[j], a[k]);
if(i == 0) return 0;
for(j = i--; a[j] < a[i]; j++);
SWAP(a[i], a[j]);
return 1;
}
#undef SWAP
void copyvec(int n, int dst[n], int src[n]) {
int i;
for(i = 0; i < n; i++) {
dst[i] = src[i];
}
}
int eqvec(int n, int a[n], int b[n]) {
int i;
for(i = 0; i < n; i++) {
if(a[i] != b[i]) return 0;
}
return 1;
}
int rank(int n, int a[n]) {
int v[n], i, j, r;
v[n - 1] = 1;
for(j = n - 2; j >= 0; j--) v[j] = v[j + 1]*(n - 1 - j);
for(r = i = 0; ; i++) {
for(j = i; j < n; j++) {
if(a[j] > j) goto cont;
}
return r;
cont:
i = j;
r += v[i]*(a[i] - i);
for(j = i + 1; j < n; j++) {
if(a[j] < a[i]) a[j]++;
}
}
}
/*
 * Writes into a[0..n-1] the permutation of {0 ... n-1} that has rank p in
 * lexicographic order (rank 0 is the identity). p is first reduced modulo n!.
 *
 * n : number of elements (must be > 0)
 * a : output array of n ints
 * p : rank of the wanted permutation
 */
void unrank(int n, int a[n], int p) {
    int v[n], i, j, r, s;
    /* v[i] = (n - 1 - i)!  -- the weight of position i in the factorial number system */
    v[n - 1] = 1;
    for(i = n - 2; i >= 0; i--) v[i] = v[i + 1]*(n - 1 - i);
    p %= n*v[0];                       /* n * (n-1)! = n! */
    for(i = 0; i < n; i++) a[i] = i;   /* start from the identity permutation */
    for(i = 0; p > 0; i++) {
        for(; v[i] > p; i++);          /* skip positions that keep their element */
        r = p/v[i];
        p %= v[i];
        /* Move a[i + r] to position i, shifting a[i .. i+r-1] one step right.
           The loop stops at j > i: with j >= i it also executed
           a[i] = a[i - 1], which read a[-1] when i == 0 and was overwritten
           by the assignment below anyway. */
        for(s = a[j = i + r]; j > i; j--) a[j] = a[j - 1];
        a[i] = s;
    }
}
int main(int argc, char **argv) {
int n, i, r, s = 0, q = 0;
int *a = NULL, *b = NULL, *c = NULL;
if(argc == 2 && (n = strtol(argv[1], NULL, 0)) > 0) {
a = malloc(n*sizeof(int));
b = malloc(n*sizeof(int));
c = malloc(n*sizeof(int));
if(!a || !b || !c) {
puts("Unable to allocate memory");
goto end;
} else {
for(i = 0; i < n; i++) a[i] = i;
do {
copyvec(n, b, a);
r = rank(n, b);
unrank(n, c, r);
q |= s++ != r || !eqvec(n, a, c);
} while(next_perm(n, a));
puts(q?"Error":"OK");
}
} else {
puts("perm n - Check all permutations of {0 ... n - 1}, with n > 0");
}
end:
if(a) free(a);
if(b) free(b);
if(c) free(c);
return 0;
}
EDIT
Following Brian Cain's comment, here is a shorter program with the same problem. I removed all checks on input value, all the rank/unrank stuff, and I replaced the malloc/free with an array of size 20 (only one now, since b and c are not used anymore). Now the program only computes the permutations with the while(next_perm(n, a)); loop, and does nothing with them. It should still print "OK" in the end, though, because the value of q does not change after the initial q=0.
#include <stdlib.h>
#include <stdio.h>
#define SWAP(a,b) {int c; c = a; a = b; b = c;}
int next_perm(int n, int a[n]) {
int i, j, k;
for(i = n - 1; i > 0 && a[i - 1] > a[i]; i--);
for(j = i, k = n - 1; j < k; j++, k--) SWAP(a[j], a[k]);
if(i == 0) return 0;
for(j = i--; a[j] < a[i]; j++);
SWAP(a[i], a[j]);
return 1;
}
#undef SWAP
int main(int argc, char **argv) {
int n, i, r, s = 0, q = 0, a[20];
n = strtol(argv[1], NULL, 0);
for(i = 0; i < n; i++) a[i] = i;
while(next_perm(n, a));
puts(q?"Error":"OK");
return 0;
}
EDIT 2: explanation of the assembly output
I add also the disassembly output of gcc (in Intel syntax), found with gcc -O3 -mavx -S -masm=intel and gcc-4.9.2 (see link above for the actual binary files of the compiler). However, it needs some work, because as is, gcc will inline the call to next_perm, and it's less readable. I also remove the CFI directives and alignment and actually all other directives, to improve readability:
_next_perm:
LFB0:
push ebp
push edi
push esi
push ebx
mov ecx, DWORD PTR [esp+20]
mov edx, DWORD PTR [esp+24]
lea eax, [ecx-1]
test eax, eax
jle L12
mov edi, DWORD PTR [edx-4+ecx*4]
cmp DWORD PTR [edx-8+ecx*4], edi
mov ecx, eax
jg L5
jmp L11
L28:
mov esi, DWORD PTR [edx+ecx*4]
cmp DWORD PTR [edx-4+ecx*4], esi
jle L27
L5:
sub ecx, 1
jne L28
L4:
mov ebx, ecx
L7:
mov esi, DWORD PTR [edx+ebx*4]
mov edi, DWORD PTR [edx+eax*4]
mov DWORD PTR [edx+ebx*4], edi
mov DWORD PTR [edx+eax*4], esi
add ebx, 1
sub eax, 1
cmp ebx, eax
jl L7
L2:
xor eax, eax
test ecx, ecx
je L23
L11:
sal ecx, 2
lea esi, [edx+ecx]
lea ebp, [edx-4+ecx]
mov ebx, DWORD PTR [esi]
mov edi, DWORD PTR [ebp+0]
cmp edi, ebx
jle L9
lea eax, [edx+4+ecx]
L10:
mov esi, eax
add eax, 4
mov ebx, DWORD PTR [eax-4]
cmp ebx, edi
jl L10
L9:
mov DWORD PTR [ebp+0], ebx
mov eax, 1
mov DWORD PTR [esi], edi
L23:
pop ebx
pop esi
pop edi
pop ebp
ret
L27:
cmp eax, ecx
jg L4
jmp L11
L12:
mov ecx, eax
jmp L2
The assembly output is the same with or without -mavx, apart from label numbers: there is no AVX instruction, which means the problem actually lies in main.
This can be checked by adding some puts in main:
int main(int argc, char **argv) {
int n, i, q = 0, a[20];
puts("X");
n = strtol(argv[1], NULL, 0);
puts("Y");
for(i = 0; i < n; i++) a[i] = i;
puts("Z");
while(next_perm(n, a));
puts(q?"Error":"OK");
return 0;
}
Then, the programs prints only X and Y when it fails, hence the problem comes from the AVX instructions used to build 'a' in the for loop between Y and Z.
Here is the assembly output of main, again without directives (LC2 points to "Y", and LC3 to "Z"). The only AVX instructions in the assembly output of main are between those two puts, and they are used for the for loop that builds the initial 'a', that is the array {0, 1, ..., n-1}. What happens actually, is that AVX instructions are used to build several elements of 'a' at a time (4 I guess), and if the length of 'a' is not a multiple of 4, then there is an additional step (between L4 and L9), before calling the puts("Z") at L9, then the while(next_perm(n, a)); at L3. Thus, the problem is very simple: if n is small enough, then the AVX loop is actually not run, and there is no error. Here the maximum valid n is 4, but it varies between different runs of gcc, it's a bit randomized it seems (I got 8 yesterday).
The LC0 and LC4 labels point to two arrays of 4 elements that are used by the AVX instructions: LC0 is {0,1,2,3}, and LC4 is {4,4,4,4}. No wonder why they are here, even without deep knowledge of AVX, it smells like an unrolled loop :-)
_main:
push ebp
mov ebp, esp
push edi
push esi
push ebx
and esp, -16
sub esp, 96
call ___main
mov DWORD PTR [esp], OFFSET FLAT:LC1
call _puts
mov eax, DWORD PTR [ebp+12]
mov DWORD PTR [esp+8], 0
mov DWORD PTR [esp+4], 0
mov eax, DWORD PTR [eax+4]
mov DWORD PTR [esp], eax
call _strtol
mov DWORD PTR [esp], OFFSET FLAT:LC2
mov ebx, eax
call _puts
test ebx, ebx
jle L17
lea edx, [ebx-4]
lea ecx, [ebx-1]
shr edx, 2
add edx, 1
cmp ecx, 3
lea eax, [0+edx*4]
jbe L10
vmovdqa xmm1, XMMWORD PTR LC4
lea esi, [esp+16]
xor ecx, ecx
vmovdqa xmm0, XMMWORD PTR LC0
L5:
mov edi, ecx
add ecx, 1
sal edi, 4
cmp edx, ecx
vmovaps XMMWORD PTR [esi+edi], xmm0
vpaddd xmm0, xmm0, xmm1
ja L5
cmp ebx, eax
je L9
L4:
lea edx, [eax+1]
mov DWORD PTR [esp+16+eax*4], eax
cmp ebx, edx
jle L9
mov DWORD PTR [esp+16+edx*4], edx
lea edx, [eax+2]
cmp ebx, edx
jle L9
add eax, 3
mov DWORD PTR [esp+16+edx*4], edx
cmp ebx, eax
jle L9
mov DWORD PTR [esp+16+eax*4], eax
L9:
mov DWORD PTR [esp], OFFSET FLAT:LC3
call _puts
L3:
mov DWORD PTR [esp+4], esi
mov DWORD PTR [esp], ebx
call _next_perm
test eax, eax
jne L3
mov DWORD PTR [esp], OFFSET FLAT:LC5
call _puts
lea esp, [ebp-12]
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
L10:
xor eax, eax
lea esi, [esp+16]
jmp L4
L17:
lea esi, [esp+16]
jmp L9
Now, I understand what actually happens, but one question remains: why is there no error message whatsoever when the program tries to run an AVX instruction? It simply exits, or it's killed, but without any hint that something went wrong.

This code always results in:
where parameter = n
a[] = {0,0,2, 3, ...,n-2,n-1}
b[] = {n-1, n-1, ... , n-1}
c[] = {n-1, n-2, ... , 0}
when it reaches the above conditions,
then it exits with "OK"
the amount of time spent executing the code
climbs at an exponential rate
as the value of the parameter is increased

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

How to convert this assembly code to intrinsic code? - c

Related

Using AVX to improve performance of float subtract, divide, truncate to int32

Convert 32 bit GDI bitmap to Grayscale by manipulating bmBits

Reviewing IF conditional code to save CPU cycles

Why _mm256_load_pd compiled to MOVUPD instead of MOVAPD?

Failure of -mavx optimization with gcc?

Categories

Resources