enabling paging leads to triple fault - c

In case this helps somebody in the future:
check if the global descriptor table is working
check if the elements of page table entry struct are ordered correctly
This code worked for me:
gdt.s
; Flat-model GDT: a null descriptor followed by one code and one data
; descriptor, both with base 0 and a 20-bit limit of 0xFFFFF.
section .data
gdt:
.null:
dq 0 ; selector 0x00: all-zero null descriptor
.code: ; selector 0x08
dw 0xFFFF ; limit bits 0-15
dw 0x0000 ; base bits 0-15
db 0x00 ; base bits 16-23
db 0x9A ; access byte: present, ring 0, code segment, readable
db 0xCF ; high nibble: 4 KiB granularity + 32-bit; low nibble: limit bits 16-19
db 0x00 ; base bits 24-31
.data: ; selector 0x10
dw 0xFFFF ; limit bits 0-15
dw 0x0000 ; base bits 0-15
db 0x00 ; base bits 16-23
db 0x92 ; access byte: present, ring 0, data segment, writable
db 0xCF ; high nibble: 4 KiB granularity + 32-bit; low nibble: limit bits 16-19
db 0x00 ; base bits 24-31
; Operand for lgdt: 16-bit table limit followed by the 32-bit table address.
gdtr:
dw $-gdt-1 ; size of the table in bytes, minus one
dd gdt
section .text
global init_global_descriptor_table
; Loads the table above and reloads every segment register so the new
; descriptors take effect. Takes no arguments; overwrites AX.
init_global_descriptor_table:
lgdt [gdtr]
jmp 0x08:.reload_cs ; far jump: the only way to reload CS with the code selector
.reload_cs:
mov ax, 0x10 ; data selector for all remaining segment registers
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ss, ax
ret
paging.c
/* Bit masks for the low nine bits of a page-directory / page-table entry. */
enum flags {
    Present             = 0x001,
    ReadWrite           = 0x002,
    AccessAll           = 0x004,
    WriteThroughCashing = 0x008,
    DisableCashing      = 0x010,
    Accessed            = 0x020,
    Dirty               = 0x040, /* only for page-table-entries */
    MPages              = 0x080,
    Global              = 0x100, /* only for page-table-entries */
};
/*
 * One page-directory or page-table entry (9 + 3 + 20 = 32 bits).
 * The declaration order of the fields is what the post identifies as critical.
 * NOTE(review): bit-field placement is implementation-defined; this presumably
 * relies on the first field landing in the lowest bits - confirm for the
 * compiler in use.
 */
struct entry {
unsigned int flags : 9; /* combination of enum flags values */
unsigned int available : 3;
unsigned int addr : 20; /* init_paging stores an address shifted right by 12 here */
};
extern void load_paging_directory(int *ptr);

/*
 * Builds a page directory at 0x00105000 and a single page table at
 * 0x00106000, identity-maps pages 0..262 through that table, and passes the
 * directory to load_paging_directory().
 *
 * Every slot that is not explicitly mapped is cleared. The page table lives in
 * raw memory, so leaving slots 263..1023 untouched would expose whatever bits
 * happened to be there (possibly with Present set) as live mappings.
 */
void init_paging() {
    struct entry *dir = (struct entry *)0x00105000;
    struct entry *t1 = (struct entry *)0x00106000;
    for (int i = 0; i < 1024; i++) {
        dir[i] = (struct entry){0};
        if (i <= 262)
            t1[i] = (struct entry){Present | ReadWrite, 0, i}; /* page i -> frame i */
        else
            t1[i] = (struct entry){0}; /* not present */
    }
    /* Directory slot 0 points at the page table (address stored as frame number). */
    dir[0] = (struct entry){Present, 0, (int)t1 >> 12};
    load_paging_directory((int *)dir);
}
paging_asm.s
global load_paging_directory
; void load_paging_directory(int *ptr)
; In:    [ebp + 8] = ptr, the value to load into CR3 (first stack argument)
; Out:   nothing
; Clobb: eax, flags
; Loads CR3 with ptr, then sets bits 31 and 0 of CR0.
load_paging_directory:
    push ebp                    ; stray trailing comma removed: it does not assemble
    mov ebp, esp
    mov eax, [ebp + 8]          ; eax = ptr
    mov cr3, eax
    mov eax, cr0
    or eax, 0x80000001          ; bit 31 = PG (paging), bit 0 = PE (protected mode)
    mov cr0, eax
    mov esp, ebp
    pop ebp
    ret

The question has been solved using this struct:
/*
 * Page-directory / page-table entry layout that resolved the question:
 * 9 flag bits, 3 available bits, 20 address bits, declared in that order.
 * NOTE(review): bit-field placement is implementation-defined - confirm the
 * first field lands in the lowest bits on the compiler in use.
 */
struct entry {
unsigned int flags : 9;
unsigned int available : 3;
unsigned int addr : 20;
};

Related

Reviewing IF conditional code to save CPU cycles

I am reviewing the usage of if condition in my program, in there, I have lines like the following:
if(count > 4) count = 4;
Would it be a good idea to write the above if conditional statement as the following non-branched one?
count = 4*(count> 4) + count*(count<= 4);
I also have the following snippet there:
/*
 * For each face j, append one card to player_hand for every suit whose mask
 * has the cfg_mask[j] bit set; i is the next free slot in player_hand.
 * NOTE(review): the i bound is tested only once per j, while up to four cards
 * can be appended inside one iteration - confirm the masks never carry more
 * than CARDS_PER_PLAYER+CARDS_ON_BOARD set bits in total.
 */
for (j=0, i=0; j<NCARD_PER_SUIT && i<CARDS_PER_PLAYER+CARDS_ON_BOARD; ++j) {
if (card_cfg.hearts & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = HEART; /* post-increment: same slot as the face store above */
}
if (card_cfg.spades & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = SPADE;
}
if (card_cfg.clubs & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = CLUB;
}
if (card_cfg.diamonds & cfg_mask[j]) {
player_hand[i].card.face = j;
player_hand[i++].card.suit = DIAMOND;
}
}
and wondering if there is good (non-branched) way to write the above, any suggestions?
EDIT: Based on some feedback below, i compared the assembly instructions (using MSVS2015 for Windows 10) and got the following:
; 718 : count = 4*(count> 4) + count*(count<= 4);
xor ebx, ebx
cmp edx, 4
setle bl
xor ecx, ecx
imul ebx, edx
cmp edx, 4
mov edx, 4
cmovg ecx, edx
add ebx, ecx
And if I revert to the if statement, I get the following: there is no jump instruction, and the total number of instructions is about two-thirds of the above:
; 718 : if( count >4) count = 4;
mov eax, DWORD PTR _i$6$[ebp]
cmp edx, edi
mov ebx, DWORD PTR _player$GSCopy$1$[ebp]
cmovg edx, edi
mov edi, DWORD PTR _count$1$[ebp]
mov DWORD PTR _count$4$[ebp], edx
EDIT #2: Based on the tip from the comments below, i went ahead and created a
union
/*
 * Overlays the four 16-bit per-suit masks with one 64-bit value so that all
 * suits can be tested through allsuits.
 * NOTE(review): which 16-bit lane of allsuits each suit occupies depends on
 * byte order and struct layout - presumably hearts is bits 0-15 on a
 * little-endian target; confirm.
 */
typedef union {
struct cfg {
unsigned short hearts;
unsigned short spades;
unsigned short clubs;
unsigned short diamonds;
} suit;
unsigned long long allsuits;
} card_cfg_t;
And with the help of this union, I was able to rewrite the second snippet of the OP as follows, which seems to save a lot (20% in my case) if I build it for a 64-bit machine and takes more time (an extra 40%) if I build it for a 32-bit machine:
/*
 * Branch-free variant: for face j and suit lane k, "present" is 1 when the
 * corresponding bit of card_cfg.allsuits is set. The slot player_hand[i] is
 * written unconditionally and only kept (i advanced) when present is 1.
 */
for (j=0, i=0; j<NCARD_PER_SUIT && i<CARDS_PER_PLAYER+CARDS_ON_BOARD; ++j) {
    /*
     * The bound is repeated here because the store below is unconditional:
     * once the hand is full, a further k iteration would write to
     * player_hand[CARDS_PER_PLAYER+CARDS_ON_BOARD].
     */
    for (int k=0; k<4 && i<CARDS_PER_PLAYER+CARDS_ON_BOARD; ++k) {
        /* Widen before shifting: 16*k reaches 48, too far for a 32-bit operand. */
        present = (int)((card_cfg.allsuits & ((unsigned long long)cfg_mask[j] << 16*k)) != 0);
        player_hand[i].card.face = j*present;
        player_hand[i].card.suit = k; /* NOTE(review): assumes suit values equal the lane index k - confirm */
        i = i + present;
    }
}
These micro-optimisations do not make much sense, but if you want to compare (you will see the difference between your version and mine - switch on optimisation - the compiler is really good at it):
/* Global clamped by each of the three functions below: values above 4 become 4. */
int count;
/* Arithmetic form: exactly one of the two comparisons evaluates to 1. */
void foo()
{
count = 4*(count> 4) + count*(count <= 4);
}
/* Conditional-expression form; always stores to count. */
void foo1()
{
count = count > 4 ? 4 : count;
}
/* Plain if form; stores to count only when it exceeds 4. */
void foo4()
{
if(count> 4) count = 4;
}
foo:
mov edx, DWORD PTR count[rip]
xor ecx, ecx
cmp edx, 4
setle al
setg cl
movzx eax, al
imul eax, edx
lea eax, [rax+rcx*4]
mov DWORD PTR count[rip], eax
ret
foo1:
cmp DWORD PTR count[rip], 4
mov eax, 4
cmovle eax, DWORD PTR count[rip]
mov DWORD PTR count[rip], eax
ret
foo4:
cmp DWORD PTR count[rip], 4
jle .L6
mov DWORD PTR count[rip], 4
.L6:
rep ret
The answer to the second loop must be something like:
pushcards(player, popcards(dealer));

How do you use the buffer? [intel based assembly]

I am having trouble figuring out how to use the buffer in my code. The code is supposed to loop through and count the number of each char is in the array. I haven't written the function for lower case and upper case chars yet but for the int part it outputs "0 - +0" every time no matter what combination of integers or letters I input. I'm not sure if this issue has to do with the buffer or if its from another part of my code. Any suggestions, tips, or explanations are welcome. I am compiling on Visual Studios 2015 with the kip irvine library.
.data
buffer       BYTE  1064 DUP(?)      ; input text, NUL-terminated by ReadString
sentence     DWORD ?                ; number of characters read (ReadString's EAX)
prompt1      BYTE  "Enter the sentence you would like to count: ", 0
intEnd       DWORD 57               ; last digit code, '9' (inclusive)
intIndex     DWORD 48               ; digit currently being counted, starts at '0'
upcharIndex  DWORD 65               ; 'A' - reserved for the upper-case pass
upcharEnd    DWORD 90               ; 'Z'
lowCharIndex DWORD 97               ; 'a' - reserved for the lower-case pass
lowcharEnd   DWORD 122              ; 'z'
dash         BYTE  " - ", 0         ; separator; NUL added so WriteString stops here
count        DWORD 0                ; occurrences of the current character

.code
;-----------------------------------------------------------------------
; main - reads one line and prints "<digit> - <count>" for '0'..'9'.
; Uses Irvine32: WriteString, ReadString, WriteChar, WriteInt, Crlf.
;
; Fixes relative to the earlier version:
;   * the scan walks buffer (the text), not sentence (the length)
;   * the scan pointer and the counter restart for every digit
;   * the terminator is tested before the character comparison
;   * the pointer advances by 1 byte per character, not 8
;   * the full 32-bit counter is incremented, not just DL
;   * the digit loop ends after intEnd (ja), so '9' is included
;   * the DumpRegs debugging calls are gone; each result gets its own line
;-----------------------------------------------------------------------
main PROC
        mov     edx, OFFSET prompt1
        call    WriteString

        mov     edx, OFFSET buffer
        mov     ecx, SIZEOF buffer
        call    ReadString              ; EAX = number of characters read
        mov     sentence, eax

checkint:
        mov     eax, intIndex           ; AL = character being counted
        cmp     eax, intEnd
        ja      done                    ; stop only after '9' has been handled

        mov     ebx, OFFSET buffer      ; restart at the beginning of the text
        xor     edx, edx                ; EDX = running count for this character
L1:
        cmp     BYTE PTR [ebx], 0       ; end of string?
        je      none
        cmp     al, [ebx]
        jne     no_inc
        inc     edx
no_inc:
        inc     ebx                     ; one byte per character
        jmp     L1

none:
        mov     count, edx              ; save: EDX is needed for WriteString
        call    WriteChar               ; prints AL, still the current digit
        mov     edx, OFFSET dash
        call    WriteString
        mov     eax, count
        call    WriteInt
        call    Crlf
        inc     intIndex                ; next digit
        jmp     checkint

done:
        exit
main ENDP
END main
C++ code:
#include<iostream>
#include<string>
using namespace std;
int main()
{
int startint = 48;
int endint = 57;
int startupper = 65;
int endupper = 90;
int startlower = 97;
int endlower = 122;
int count = 0;
cout << "Enter the string you would like to count: ";
string sentence;
getline(cin, sentence);
cout << endl;
for (int i = startint; i < endint; i++)
{
for (char a : sentence)
{
if (a == i)
count++;
}
if(count !=0)
cout << (char)i << " - " << count<<endl;
count = 0;
}
for (int i = startupper; i < endupper; i++)
{
for (char a : sentence)
{
if (a == i)
count++;
}
if(count !=0)
cout << (char)i << " - " << count<<endl;
count = 0;
}
for (int i = startlower; i < endlower; i++)
{
for (char a : sentence)
{
if (a == i)
count++;
}
if(count !=0)
cout << (char)i << " - " << count<<endl;
count = 0;
}
return 0;
}
call ReadString
mov sentence, eax
mov ebx, OFFSET sentence
mov ecx, [LENGTHOF sentence]
The ReadString function returns the size of the input in the EAX register. Yet you use this as an address!
call ReadString
mov sentence, eax ;This is a length !
mov ebx, OFFSET buffer
mov ecx, sentence
cmp al, [ebx]
jne no_inc
cmp al, 00
je none
Check for the end of the string before doing anything else:
cmp byte ptr [ebx], 0
je none
cmp al, [ebx]
jne no_inc
Since count is a dword, increment EDX rather than DL.
Since strings use 1 byte per character, add just 1 to EBX instead of 8.
checkint:
mov eax, intIndex
cmp eax, intEnd
je done
This way you'll miss the final iteration! Only jump to done when EAX is above intEnd:
checkint:
mov eax, intIndex
cmp eax, intEnd
ja done
I hope this helps to get the main code working...

Assembler string of numbers into int

How do I convert a string of number (decimals) to a integer (binary) using IA-32 assembler in c++?
Here is a shell of what I need.
#include <stdio.h>
/*
 * Skeleton for a decimal-string-to-integer conversion written in inline
 * assembly: pcInp points at the text argument, iOut receives the result.
 */
int main(int argc, char** argv) {
    int iOut = 0;
    char* pcInp;
    if (argc < 2) {
        printf("Missing parameter: number\n");
        return(0);
    }
    pcInp = argv[1];
    _asm {
        push eax    // was "aex", which is not a register name and does not assemble
        push ebx
        push ecx
        push edx
        //code here
        pop edx     // restore in reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
    }
    printf("Number was processed as %d\n", iOut);
}
Solved, maybe someone else will need this.
#include <iostream>
#include <stdio.h>
#include <string.h>
using namespace std;
// Converts the first four characters of argv[1], taken as decimal digits,
// into an integer using inline assembly, and prints the total together with
// the four positional pieces (thousands, hundreds, tens, ones).
int main(int argc, char** argv)
{
	int Lenght;
	int pirmasSkaicius = 0;     // first piece:  digit 0 * 1000
	int antrasSkaicius = 0;     // second piece: digit 1 * 100
	int treciasSkaicius = 0;    // third piece:  digit 2 * 10
	int ketvirtasSkaicius = 0;  // fourth piece: digit 3
	int rez = 0;                // sum of the four pieces
	if (argc < 2) {
		printf("No parameter: number\n");
		return(0);
	}
	// Point straight at the argument; allocating a buffer first and then
	// overwriting the pointer only leaked the allocation.
	char * numbEntered = argv[1];
	Lenght = (int)strlen(numbEntered);
	cout << "Lenght: " << Lenght << endl;
	// The assembly below reads exactly four characters; with a shorter
	// argument it would run past the terminating NUL.
	if (Lenght < 4) {
		printf("Parameter must have at least 4 digits\n");
		return(0);
	}
	__asm {
		push eax
		push ebx
		push ecx
		push edx
		// The two adds of not-yet-initialised EAX/EDX into the piece variables
		// were dropped: both variables are overwritten below anyway.
		mov ecx, numbEntered        // ecx walks the input string
		// digit 0 -> thousands
		xor eax, eax
		mov al, byte ptr[ecx]
		sub eax, 48                 // ASCII '0'
		mov ebx, 1000
		imul eax, ebx
		mov pirmasSkaicius, eax
		inc ecx
		// digit 1 -> hundreds
		xor eax, eax
		mov al, byte ptr[ecx]
		sub eax, 48
		mov ebx, 100
		imul eax, ebx
		mov antrasSkaicius, eax
		inc ecx
		// digit 2 -> tens
		xor eax, eax
		mov al, byte ptr[ecx]
		sub eax, 48
		mov ebx, 10
		imul eax, ebx
		mov treciasSkaicius, eax
		inc ecx
		// digit 3 -> ones
		xor eax, eax
		mov al, byte ptr[ecx]
		sub eax, 48
		mov ketvirtasSkaicius, eax
		// total
		xor edx, edx
		add edx, pirmasSkaicius
		add edx, antrasSkaicius
		add edx, treciasSkaicius
		add edx, ketvirtasSkaicius
		mov rez, edx
		pop edx
		pop ecx
		pop ebx
		pop eax
	}
	cout << "Processed: " << rez << endl;
	cout << "Pieces as: " << pirmasSkaicius << " " << antrasSkaicius << " " << treciasSkaicius << " " << ketvirtasSkaicius << endl;
	system("pause");
	return 0;
}

Failure of -mavx optimization with gcc?

EDIT partial solution below (EDIT 2), but I have still one question (see at the end)
I am trying to compile the following C program with gcc-4.9.2, on Windows 7, 32 bits, running on a Pentium G3220 (according to Windows System Information). If I understand correctly, this processor does not have AVX extensions, so it's quite natural that something happens, I am just unsure about what exactly. Initially, I was playing with optimizations with gcc, and I tried -mavx rather "accidentally".
The following program computes permutations of numbers 0 ... n-1 (with n given as an argument) in lexicographic order, and also rank of each permutation (its position in this sequential order), and "unrank" (recover permutation from rank), and checks that all of these are correct. It should only print "OK" or "Error" in the end.
With gcc -O3, the program runs correctly with all integer input I checked (1 <= n <= 11).
With gcc -O3 -mavx, it runs correctly for 1 <= n <= 7, and for n >= 8, it prints nothing, and actually it does nothing (almost no delay before exiting). I get no message from the program or from Windows (I would have expected maybe a crash with an unknown instruction, but it didn't happen).
(On another computer with Windows 7 64 bits, on a core-i5, and the same gcc-4.9.2, the program seems to run fine with or without -mavx, when compiled either in 32 or 64 bits)
What I don't understand is why it runs correctly for some input values, and fails for other ones. Does anybody have some hint about this?
Here is the full program, followed by a shorter one with the same problem.
#include <stdlib.h>
#include <stdio.h>
/* Swaps two int lvalues through a block-local temporary. */
#define SWAP(a,b) {int c; c = a; a = b; b = c;}
/*
 * Advances a[0..n-1] to its next permutation in lexicographic order.
 * Returns 1 when a next permutation exists. Returns 0 when a was entirely
 * descending; in that case a is left reversed, i.e. ascending.
 */
int next_perm(int n, int a[n]) {
int i, j, k;
/* Walk left while the neighbours are descending: i ends at the start of the descending suffix. */
for(i = n - 1; i > 0 && a[i - 1] > a[i]; i--);
/* Reverse that suffix in place, turning it ascending. */
for(j = i, k = n - 1; j < k; j++, k--) SWAP(a[j], a[k]);
if(i == 0) return 0; /* the whole array was the suffix: no next permutation */
/* j starts at the suffix, i steps back to the element just before it; stop at the first a[j] not below a[i]. */
for(j = i--; a[j] < a[i]; j++);
SWAP(a[i], a[j]);
return 1;
}
#undef SWAP
/* Copies src[0..n-1] into dst[0..n-1], lowest index first. */
void copyvec(int n, int dst[n], int src[n]) {
    int k = 0;
    while (k < n) {
        dst[k] = src[k];
        ++k;
    }
}
/* Returns 1 when a[0..n-1] and b[0..n-1] hold the same values, 0 otherwise. */
int eqvec(int n, int a[n], int b[n]) {
    int k = 0;
    /* Advance past the common prefix; reaching n means no mismatch was found. */
    while (k < n && a[k] == b[k]) k++;
    return k >= n;
}
/*
 * Returns the rank of permutation a[0..n-1]; per the surrounding description
 * this is its position in lexicographic order.
 * Modifies a in place (see the a[j]++ below), which is why main passes a copy.
 */
int rank(int n, int a[n]) {
int v[n], i, j, r;
v[n - 1] = 1;
for(j = n - 2; j >= 0; j--) v[j] = v[j + 1]*(n - 1 - j); /* v[j] = (n-1-j)! */
for(r = i = 0; ; i++) {
/* Find the next position, from i on, whose value exceeds its own index. */
for(j = i; j < n; j++) {
if(a[j] > j) goto cont;
}
return r; /* none left: nothing more to add */
cont:
i = j;
r += v[i]*(a[i] - i); /* weight of position i times its offset from the index */
/* Bump later values that are smaller than a[i]. */
for(j = i + 1; j < n; j++) {
if(a[j] < a[i]) a[j]++;
}
}
}
/*
 * Writes into a[0..n-1] the permutation of {0 ... n-1} whose rank is p
 * (p is first reduced modulo n!). Inverse of rank().
 *
 * n  number of elements, n > 0
 * a  output array of n ints
 * p  rank to decode
 */
void unrank(int n, int a[n], int p) {
    int v[n], i, j, r, s;
    v[n - 1] = 1;
    for(i = n - 2; i >= 0; i--) v[i] = v[i + 1]*(n - 1 - i); /* v[i] = (n-1-i)! */
    p %= n*v[0]; /* n*v[0] = n! */
    for(i = 0; i < n; i++) a[i] = i;
    for(i = 0; p > 0; i++) {
        /* Skip positions whose weight is larger than what is left of p. */
        for(; v[i] > p; i++);
        r = p/v[i];
        p %= v[i];
        /*
         * Rotate a[i..i+r] right by one: the element at i+r moves to i.
         * The shift stops at j > i; running it down to j == i read a[i - 1],
         * which is a[-1] when i == 0, only to overwrite a[i] straight after.
         */
        for(s = a[j = i + r]; j > i; j--) a[j] = a[j - 1];
        a[i] = s;
    }
}
/*
 * perm n: walks every permutation of {0 ... n-1} via next_perm and checks, for
 * each one, that rank() yields the running counter and that unrank() of that
 * rank reproduces the permutation. Prints "OK" or "Error" at the end.
 */
int main(int argc, char **argv) {
int n, i, r, s = 0, q = 0; /* s = expected rank of the current permutation, q = error flag */
int *a = NULL, *b = NULL, *c = NULL;
if(argc == 2 && (n = strtol(argv[1], NULL, 0)) > 0) {
a = malloc(n*sizeof(int));
b = malloc(n*sizeof(int));
c = malloc(n*sizeof(int));
if(!a || !b || !c) {
puts("Unable to allocate memory");
goto end; /* free whichever allocations did succeed */
} else {
for(i = 0; i < n; i++) a[i] = i;
do {
copyvec(n, b, a); /* rank() modifies its argument, so work on a copy */
r = rank(n, b);
unrank(n, c, r);
q |= s++ != r || !eqvec(n, a, c); /* s++ runs on every iteration, even once q is set */
} while(next_perm(n, a));
puts(q?"Error":"OK");
}
} else {
puts("perm n - Check all permutations of {0 ... n - 1}, with n > 0");
}
end:
if(a) free(a);
if(b) free(b);
if(c) free(c);
return 0;
}
EDIT
Following Brian Cain's comment, here is a shorter program with the same problem. I removed all checks on input value, all the rank/unrank stuff, and I replaced the malloc/free with an array of size 20 (only one now, since b and c are not used anymore). Now the program only computes the permutations with the while(next_perm(n, a)); loop, and does nothing with them. It should still print "OK" in the end, though, because the value of q does not change after the initial q=0.
#include <stdlib.h>
#include <stdio.h>
/* Swaps two int lvalues through a block-local temporary. */
#define SWAP(a,b) {int c; c = a; a = b; b = c;}
/*
 * Advances a[0..n-1] to its next permutation in lexicographic order.
 * Returns 1 when a next permutation exists. Returns 0 when a was entirely
 * descending; in that case a is left reversed, i.e. ascending.
 */
int next_perm(int n, int a[n]) {
int i, j, k;
/* Walk left while the neighbours are descending: i ends at the start of the descending suffix. */
for(i = n - 1; i > 0 && a[i - 1] > a[i]; i--);
/* Reverse that suffix in place, turning it ascending. */
for(j = i, k = n - 1; j < k; j++, k--) SWAP(a[j], a[k]);
if(i == 0) return 0; /* the whole array was the suffix: no next permutation */
/* j starts at the suffix, i steps back to the element just before it; stop at the first a[j] not below a[i]. */
for(j = i--; a[j] < a[i]; j++);
SWAP(a[i], a[j]);
return 1;
}
#undef SWAP
/*
 * Reduced reproduction: fills a[0..n-1] with 0..n-1, steps through every
 * permutation, then prints "OK" (q is never changed from 0).
 * NOTE(review): input checks were removed on purpose for the reproduction -
 * argv[1] is read without testing argc and n is not checked against the
 * 20-element array; r and s are unused here.
 */
int main(int argc, char **argv) {
int n, i, r, s = 0, q = 0, a[20];
n = strtol(argv[1], NULL, 0);
for(i = 0; i < n; i++) a[i] = i;
while(next_perm(n, a));
puts(q?"Error":"OK");
return 0;
}
EDIT 2: explanation of the assembly output
I add also the disassembly output of gcc (in Intel syntax), found with gcc -O3 -mavx -S -masm=intel and gcc-4.9.2 (see link above for the actual binary files of the compiler). However, it needs some work, because as is, gcc will inline the call to next_perm, and it's less readable. I also remove the CFI directives and alignment and actually all other directives, to improve readability:
_next_perm:
LFB0:
push ebp
push edi
push esi
push ebx
mov ecx, DWORD PTR [esp+20]
mov edx, DWORD PTR [esp+24]
lea eax, [ecx-1]
test eax, eax
jle L12
mov edi, DWORD PTR [edx-4+ecx*4]
cmp DWORD PTR [edx-8+ecx*4], edi
mov ecx, eax
jg L5
jmp L11
L28:
mov esi, DWORD PTR [edx+ecx*4]
cmp DWORD PTR [edx-4+ecx*4], esi
jle L27
L5:
sub ecx, 1
jne L28
L4:
mov ebx, ecx
L7:
mov esi, DWORD PTR [edx+ebx*4]
mov edi, DWORD PTR [edx+eax*4]
mov DWORD PTR [edx+ebx*4], edi
mov DWORD PTR [edx+eax*4], esi
add ebx, 1
sub eax, 1
cmp ebx, eax
jl L7
L2:
xor eax, eax
test ecx, ecx
je L23
L11:
sal ecx, 2
lea esi, [edx+ecx]
lea ebp, [edx-4+ecx]
mov ebx, DWORD PTR [esi]
mov edi, DWORD PTR [ebp+0]
cmp edi, ebx
jle L9
lea eax, [edx+4+ecx]
L10:
mov esi, eax
add eax, 4
mov ebx, DWORD PTR [eax-4]
cmp ebx, edi
jl L10
L9:
mov DWORD PTR [ebp+0], ebx
mov eax, 1
mov DWORD PTR [esi], edi
L23:
pop ebx
pop esi
pop edi
pop ebp
ret
L27:
cmp eax, ecx
jg L4
jmp L11
L12:
mov ecx, eax
jmp L2
The assembly output is the same with or without -mavx, apart from label numbers: there is no AVX instruction, which means the problem actually lies in main.
This can be checked by adding some puts in main:
/*
 * Same reduced reproduction with progress markers: "X" before parsing the
 * argument, "Y" before the fill loop, "Z" after it.
 * NOTE(review): as in the reproduction above, argv[1] and n are not validated.
 */
int main(int argc, char **argv) {
int n, i, q = 0, a[20];
puts("X");
n = strtol(argv[1], NULL, 0);
puts("Y");
for(i = 0; i < n; i++) a[i] = i; /* the loop between the "Y" and "Z" markers */
puts("Z");
while(next_perm(n, a));
puts(q?"Error":"OK");
return 0;
}
Then, the programs prints only X and Y when it fails, hence the problem comes from the AVX instructions used to build 'a' in the for loop between Y and Z.
Here is the assembly output of main, again without directives (LC2 points to "Y", and LC3 to "Z"). The only AVX instructions in the assembly output of main are between those two puts, and they are used for the for loop that builds the initial 'a', that is the array {0, 1, ..., n-1}. What actually happens is that AVX instructions are used to build several elements of 'a' at a time (4 I guess), and if the length of 'a' is not a multiple of 4, then there is an additional step (between L4 and L9), before calling the puts("Z") at L9, then the while(next_perm(n, a)); at L3. Thus, the problem is very simple: if n is small enough, then the AVX loop is actually not run, and there is no error. Here the maximum valid n is 4, but it varies between different runs of gcc, it's a bit randomized it seems (I got 8 yesterday).
The LC0 and LC4 labels point to two arrays of 4 elements that are used by the AVX instructions: LC0 is {0,1,2,3}, and LC4 is {4,4,4,4}. No wonder why they are here, even without deep knowledge of AVX, it smells like an unrolled loop :-)
_main:
push ebp
mov ebp, esp
push edi
push esi
push ebx
and esp, -16
sub esp, 96
call ___main
mov DWORD PTR [esp], OFFSET FLAT:LC1
call _puts
mov eax, DWORD PTR [ebp+12]
mov DWORD PTR [esp+8], 0
mov DWORD PTR [esp+4], 0
mov eax, DWORD PTR [eax+4]
mov DWORD PTR [esp], eax
call _strtol
mov DWORD PTR [esp], OFFSET FLAT:LC2
mov ebx, eax
call _puts
test ebx, ebx
jle L17
lea edx, [ebx-4]
lea ecx, [ebx-1]
shr edx, 2
add edx, 1
cmp ecx, 3
lea eax, [0+edx*4]
jbe L10
vmovdqa xmm1, XMMWORD PTR LC4
lea esi, [esp+16]
xor ecx, ecx
vmovdqa xmm0, XMMWORD PTR LC0
L5:
mov edi, ecx
add ecx, 1
sal edi, 4
cmp edx, ecx
vmovaps XMMWORD PTR [esi+edi], xmm0
vpaddd xmm0, xmm0, xmm1
ja L5
cmp ebx, eax
je L9
L4:
lea edx, [eax+1]
mov DWORD PTR [esp+16+eax*4], eax
cmp ebx, edx
jle L9
mov DWORD PTR [esp+16+edx*4], edx
lea edx, [eax+2]
cmp ebx, edx
jle L9
add eax, 3
mov DWORD PTR [esp+16+edx*4], edx
cmp ebx, eax
jle L9
mov DWORD PTR [esp+16+eax*4], eax
L9:
mov DWORD PTR [esp], OFFSET FLAT:LC3
call _puts
L3:
mov DWORD PTR [esp+4], esi
mov DWORD PTR [esp], ebx
call _next_perm
test eax, eax
jne L3
mov DWORD PTR [esp], OFFSET FLAT:LC5
call _puts
lea esp, [ebp-12]
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
L10:
xor eax, eax
lea esi, [esp+16]
jmp L4
L17:
lea esi, [esp+16]
jmp L9
Now, I understand what actually happens, but one question remains: why is there no error message whatsoever when the program tries to run an AVX instruction? It simply exits, or it's killed, but without any hint that something went wrong.
This code always results in:
where parameter = n
a[] = {0,0,2, 3, ...,n-2,n-1}
b[] = {n-1, n-1, ... , n-1}
c[] = {n-1, n-2, ... , 0}
when it reaches the above conditions,
then it exits with "OK"
the amount of time spent executing the code
climbs at an exponential rate
as the value of the parameter is increased

Operator * and + produces wrong result in digital mars

I am trying to calculate a number which produce Longest Collatz sequence. But here is a strange problem. 3n+1 become 38654705674 when n is 3. I do not see an error. here is the full code:
/* 6.c -- calculates Longest Collatz sequence */
#include <stdio.h>
long long get_collatz_length(long long);
/*
 * Searches 2..999999 for the start value with the longest Collatz sequence
 * and prints that value followed by the sequence length. On ties the larger
 * start value wins.
 */
int main(void)
{
    long long best_start = 1;
    long long best_len = 1;
    long long n = 2;

    while (n < 1000000)
    {
        /* Reported symptom: fine for n == 2, never returns for n == 3. */
        long long len = get_collatz_length(n);
        if (len >= best_len)
        {
            best_start = n;
            best_len = len;
        }
        n++;
    }
    printf("%lld %lld\n", best_start, best_len);
    return 0;
}
/*
 * Returns the number of terms in the Collatz sequence that starts at num and
 * ends at 1, printing every term except the final 1, followed by a blank line.
 */
long long get_collatz_length(long long num)
{
    long long terms = 1;

    for (; num != 1; ++terms)
    {
        printf("%lld\n", num);
        /* The odd branch (num*3+1) is the expression the question is about. */
        num = (num % 2) ? num * 3 + 1 : num / 2;
    }
    puts("");
    return terms;
}
It's seems to be bug in dmc compiler, that fails to handle long long type correctly. Here is narrowed test-case:
#include <stdio.h>
/*
 * Multiplies a long long 3 by 3 and dumps the eight bytes of the result in
 * memory order, in hex, to expose what the compiler actually stored.
 */
int main(void)
{
    long long num = 3LL;
    /*printf("%lld\n", num);*/
    num = num * 3LL;
    /*
     * View the bytes as unsigned char: through plain (possibly signed) char a
     * byte of 0x80 or above is sign-extended when promoted and %x would print
     * it as ffffff80 and so on, which would distort the dump.
     */
    unsigned char *t = (unsigned char *) &num;
    for (int i = 0; i < 8; i++)
        printf("%x\t", t[i]);
    putchar('\n');
    /*printf("%lld\n", num);*/
    return 0;
}
It produces (little endian, so 0x900000009 == 38 654 705 673):
9 0 0 0 9 0 0 0
From the disassembly it looks like it stores the 64-bit integer in two 32-bit registers:
.data:0x000000be 6bd203 imul edx,edx,0x3
.data:0x000000c1 6bc803 imul ecx,eax,0x3
.data:0x000000c4 03ca add ecx,edx
.data:0x000000c6 ba03000000 mov edx,0x3
.data:0x000000cb f7e2 mul edx
.data:0x000000cd 03d1 add edx,ecx
.data:0x000000cf 31c0 xor eax,eax
I additionally tested it with the objconv tool, which just confirms my initial diagnosis:
#include <stdio.h>
/* Multiplies the long long constant 5 by 3 and prints the product in hex. */
void mul(void)
{
long long a;
long long c;
a = 5LL;
c = a * 3LL; /* the 64-bit multiplication under test */
printf("%llx\n", c);
}
/* Calls mul() once and exits. */
int main(void)
{
mul();
return 0;
}
disassembly (single section):
>objconv.exe -fmasm ..\dm\bin\check.obj
_mul PROC NEAR
mov eax, 5 ; 0000 _ B8, 00000005
cdq ; 0005 _ 99
imul edx, edx, 3 ; 0006 _ 6B. D2, 03
imul ecx, eax, 3 ; 0009 _ 6B. C8, 03
add ecx, edx ; 000C _ 03. CA
mov edx, 3 ; 000E _ BA, 00000003
mul edx ; 0013 _ F7. E2
add edx, ecx ; 0015 _ 03. D1
push edx ; 0017 _ 52
push eax ; 0018 _ 50
push offset FLAT:?_001 ; 0019 _ 68, 00000000(segrel)
call _printf ; 001E _ E8, 00000000(rel)
add esp, 12 ; 0023 _ 83. C4, 0C
ret ; 0026 _ C3
_mul ENDP
Note that mul edx operates implicitly on eax. The result is stored in both registers: the higher part (in this case 0) is stored in edx, and the lower part in eax.

Resources