Cache size estimation on your system? (C)

I got this program from this link (https://gist.github.com/jiewmeng/3787223). I have been searching the web with the idea of gaining a better understanding of processor caches (L1 and L2). I want to be able to write a program that would let me guess the size of the L1 and L2 caches on my new laptop (just for learning purposes; I know I could check the spec).
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define KB 1024
#define MB 1024 * 1024

int main() {
    unsigned int steps = 256 * 1024 * 1024;
    static int arr[4 * 1024 * 1024];
    int lengthMod;
    unsigned int i;
    double timeTaken;
    clock_t start;

    int sizes[] = {
        1 * KB, 4 * KB, 8 * KB, 16 * KB, 32 * KB, 64 * KB, 128 * KB, 256 * KB,
        512 * KB, 1 * MB, 1.5 * MB, 2 * MB, 2.5 * MB, 3 * MB, 3.5 * MB, 4 * MB
    };
    int results[sizeof(sizes)/sizeof(int)];
    int s;

    /* for each size to test for ... */
    for (s = 0; s < sizeof(sizes)/sizeof(int); s++)
    {
        lengthMod = sizes[s] - 1;
        start = clock();
        for (i = 0; i < steps; i++)
        {
            arr[(i * 16) & lengthMod] *= 10;
            arr[(i * 16) & lengthMod] /= 10;
        }
        timeTaken = (double)(clock() - start)/CLOCKS_PER_SEC;
        printf("%d, %.8f \n", sizes[s] / 1024, timeTaken);
    }
    return 0;
}
The output of the program on my machine is as follows. How do I interpret the numbers? What does this program tell me?
1, 1.07000000
4, 1.04000000
8, 1.06000000
16, 1.13000000
32, 1.14000000
64, 1.17000000
128, 1.20000000
256, 1.21000000
512, 1.19000000
1024, 1.23000000
1536, 1.23000000
2048, 1.46000000
2560, 1.21000000
3072, 1.45000000
3584, 1.47000000
4096, 1.94000000

You need direct access to the memory
I do not mean DMA transfer by this. The memory must of course be accessed by the CPU (otherwise you are not measuring the caches), but as directly as possible, so the measurements will probably not be very accurate on Windows/Linux because services and other processes can disturb the caches during the run. Measure many times and average (or take the fastest time, or filter the results). For the best accuracy use DOS and asm, for example
rep + movsb,movsw,movsd
rep + stosb,stosw,stosd
so that you measure the memory transfer itself and not something else, as in your code!
Measure the raw transfer times and plot a graph:
the x axis is the transfer block size,
the y axis is the transfer speed.
Zones with the same transfer rate correspond to the matching cache layer (a rough portable sketch of the idea follows below).
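As a rough, portable illustration of the idea (my own sketch, not part of the DOS/asm approach described below): sweep power-of-two block sizes, keep the fastest of a few memcpy runs for each size, and look for the block sizes where the rate drops.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int main(void)
{
    size_t max = 8u << 20;                 /* largest block: 8 MiB */
    unsigned char *src = malloc(max), *dst = malloc(max);
    if (!src || !dst) return 1;
    memset(src, 1, max);
    for (size_t size = 1u << 10; size <= max; size <<= 1)
    {
        size_t reps = (64u << 20) / size;  /* move ~64 MiB in total per size */
        double best = 1e30;
        for (int run = 0; run < 5; run++)  /* keep the fastest of 5 runs */
        {
            clock_t t0 = clock();
            for (size_t r = 0; r < reps; r++)
                memcpy(dst, src, size);
            double sec = (double)(clock() - t0) / CLOCKS_PER_SEC;
            if (sec < 1e-9) sec = 1e-9;    /* guard against a coarse clock() */
            if (sec < best) best = sec;
        }
        printf("%7zu KB  %10.1f MB/s\n",
               size >> 10, (double)size * reps / (1024.0 * 1024.0) / best);
    }
    free(src); free(dst);
    return 0;
}

Note that memcpy touches roughly twice the block size (source plus destination), so the knees in the curve show up near half of each cache size; the asm code below avoids this by copying a block onto itself.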
[Edit1] I could not find my old source code for this, so I put something together just now in C++ for Windows:
Time measurement:
//---------------------------------------------------------------------------
double performance_Tms=-1.0, // counter period [ms]
       performance_tms= 0.0; // measured time [ms]
//---------------------------------------------------------------------------
void tbeg()
{
LARGE_INTEGER i;
if (performance_Tms<=0.0) { QueryPerformanceFrequency(&i); performance_Tms=1000.0/double(i.QuadPart); }
QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart);
}
//---------------------------------------------------------------------------
double tend()
{
LARGE_INTEGER i;
QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart)-performance_tms; performance_tms*=performance_Tms;
return performance_tms;
}
//---------------------------------------------------------------------------
Benchmark (32bit app):
//---------------------------------------------------------------------------
DWORD sizes[]= // used transfer block sizes
{
1<<10, 2<<10, 3<<10, 4<<10, 5<<10, 6<<10, 7<<10, 8<<10, 9<<10,
10<<10, 11<<10, 12<<10, 13<<10, 14<<10, 15<<10, 16<<10, 17<<10, 18<<10,
19<<10, 20<<10, 21<<10, 22<<10, 23<<10, 24<<10, 25<<10, 26<<10, 27<<10,
28<<10, 29<<10, 30<<10, 31<<10, 32<<10, 48<<10, 64<<10, 80<<10, 96<<10,
112<<10,128<<10,192<<10,256<<10,320<<10,384<<10,448<<10,512<<10, 1<<20,
2<<20, 3<<20, 4<<20, 5<<20, 6<<20, 7<<20, 8<<20, 9<<20, 10<<20,
11<<20, 12<<20, 13<<20, 14<<20, 15<<20, 16<<20, 17<<20, 18<<20, 19<<20,
20<<20, 21<<20, 22<<20, 23<<20, 24<<20, 25<<20, 26<<20, 27<<20, 28<<20,
29<<20, 30<<20, 31<<20, 32<<20,
};
const int N=sizeof(sizes)>>2; // number of used sizes
double pmovsd[N]; // measured transfer rate rep MOVSD [MB/sec]
double pstosd[N]; // measured transfer rate rep STOSD [MB/sec]
//---------------------------------------------------------------------------
void measure()
{
int i;
BYTE *dat; // pointer to used memory
DWORD adr,siz,num; // local variables for asm
double t,t0;
HANDLE hnd; // process handle
// enable priority change (huge difference)
#define measure_priority
// enable critical sections (no difference)
// #define measure_lock
for (i=0;i<N;i++) pmovsd[i]=0.0;
for (i=0;i<N;i++) pstosd[i]=0.0;
dat=new BYTE[sizes[N-1]+4]; // last DWORD +4 Bytes (should be 3 but i like 4 more)
if (dat==NULL) return;
#ifdef measure_priority
hnd=GetCurrentProcess(); if (hnd!=NULL) { SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS); CloseHandle(hnd); }
Sleep(200); // wait for the change to take effect
#endif
#ifdef measure_lock
CRITICAL_SECTION lock; // lock handle
InitializeCriticalSectionAndSpinCount(&lock,0x00000400);
EnterCriticalSection(&lock);
#endif
adr=(DWORD)(dat);
for (i=0;i<N;i++)
{
siz=sizes[i]; // siz = actual block size
num=(8<<20)/siz; // compute n (times to repeat the measurement)
if (num<4) num=4;
siz>>=2; // size / 4 because of 32bit transfer
// measure overhead
tbeg(); // start time measurement
asm {
push esi
push edi
push ecx
push ebx
push eax
mov ebx,num
mov al,0
loop0: mov esi,adr
mov edi,adr
mov ecx,siz
// rep movsd // es,ds already set by C++
// rep stosd // es already set by C++
dec ebx
jnz loop0
pop eax
pop ebx
pop ecx
pop edi
pop esi
}
t0=tend(); // stop time measurement
// measurement 1
tbeg(); // start time measurement
asm {
push esi
push edi
push ecx
push ebx
push eax
mov ebx,num
mov al,0
loop1: mov esi,adr
mov edi,adr
mov ecx,siz
rep movsd // es,ds already set by C++
// rep stosd // es already set by C++
dec ebx
jnz loop1
pop eax
pop ebx
pop ecx
pop edi
pop esi
}
t=tend(); // stop time measurement
t-=t0; if (t<1e-6) t=1e-6; // remove overhead and avoid division by zero
t=double(siz<<2)*double(num)/t; // Byte/ms
pmovsd[i]=t/(1.024*1024.0); // MByte/s
// measurement 2
tbeg(); // start time measurement
asm {
push esi
push edi
push ecx
push ebx
push eax
mov ebx,num
mov al,0
loop2: mov esi,adr
mov edi,adr
mov ecx,siz
// rep movsd // es,ds already set by C++
rep stosd // es already set by C++
dec ebx
jnz loop2
pop eax
pop ebx
pop ecx
pop edi
pop esi
}
t=tend(); // stop time measurement
t-=t0; if (t<1e-6) t=1e-6; // remove overhead and avoid division by zero
t=double(siz<<2)*double(num)/t; // Byte/ms
pstosd[i]=t/(1.024*1024.0); // MByte/s
}
#ifdef measure_lock
LeaveCriticalSection(&lock);
DeleteCriticalSection(&lock);
#endif
#ifdef measure_priority
hnd=GetCurrentProcess(); if (hnd!=NULL) { SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS); CloseHandle(hnd); }
#endif
delete[] dat; // must match new[]
}
//---------------------------------------------------------------------------
The arrays pmovsd[] and pstosd[] hold the measured 32-bit transfer rates [MByte/sec]. You can configure the code by enabling/commenting out the two defines at the start of the measure function.
Graphical output (plot of the measured transfer rates omitted):
To maximize accuracy you can change the process priority class to maximum. You could also create the measuring thread with max priority (I tried it, but it actually messed things up) and add a critical section to it so the test is not interrupted by the OS as often (there was no visible difference with and without threads). If you want to use byte transfers, take into account that only 16-bit registers are used, so you need to add loop and address iterations.
PS.
If you try this on a notebook, you should first warm the CPU up to be sure you are measuring at top CPU/memory speed, so no Sleeps. Some dumb loops before the measurement will do it, but they should run for at least a few seconds. You can also synchronize this by measuring the CPU frequency and looping while it is still rising; stop once it saturates...
The asm instruction RDTSC is best for this (but beware: its meaning has changed slightly on newer architectures).
If you are not under Windows, change the functions tbeg, tend to your OS equivalents.
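A crude sketch of the warm-up idea from the PS above (my own illustration; the RDTSC-based saturation check is left out): burn CPU time for a few seconds before calling the benchmark so a throttled-down notebook CPU ramps up to its full clock first.

#include <time.h>

/* Spin for roughly the requested number of seconds so power management
   raises the CPU clock before the actual measurement starts. */
static void warm_up_cpu(double seconds)
{
    volatile unsigned long sink = 1;
    clock_t start = clock();
    while ((double)(clock() - start) / CLOCKS_PER_SEC < seconds)
        sink = sink * 1664525UL + 1013904223UL;   /* keep the core busy */
    (void)sink;
}

Call it as warm_up_cpu(3.0); right before measure().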
[edit2] further improvements of accuracy
After finally solving a problem with the VCL affecting measurement accuracy (which I discovered thanks to this question, with more about it here), you can do the following prior to the benchmark to improve accuracy:
set process priority class to realtime
set process affinity to single CPU
so you measure just a single CPU on a multi-core machine
flush DATA and Instruction CACHEs
For example:
// before mem benchmark
DWORD process_affinity_mask=0;
DWORD system_affinity_mask =0;
HANDLE hnd=GetCurrentProcess();
if (hnd!=NULL)
{
// priority
SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS);
// affinity
GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
process_affinity_mask=1;
SetProcessAffinityMask(hnd,process_affinity_mask);
GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
}
// flush CACHEs
for (DWORD i=0;i<sizes[N-1];i+=7)
{
dat[i]+=i;
dat[i]*=i;
dat[i]&=i;
}
// after mem benchmark
if (hnd!=NULL)
{
SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS);
SetProcessAffinityMask(hnd,system_affinity_mask);
}
So the more accurate measurement looks like this (plot omitted):

Your lengthMod variable doesn't do what you think it does. You want it to limit the size of your data set, but you have two problems there:
Doing a bitwise AND with a power of 2 masks off all bits except the single one that's set. If, for example, lengthMod is 1k (0x400), then all indices below 0x400 (meaning i = 1 to 63) simply map to index 0, so you'll always hit the cache. That's probably why the results are so fast. Instead, use lengthMod - 1 to create a correct mask (0x400 --> 0x3ff, which masks off just the upper bits and leaves the lower ones intact).
Some of the values for lengthMod are not a power of 2, so lengthMod - 1 isn't going to work for them, as some of the mask bits would still be zero. Either remove them from the list, or use a modulo operation instead of the mask altogether. See also my answer here for a similar case.
Another issue is that 16B jumps are probably not enough to skip a cache line, as most common CPUs work with 64-byte cache lines, so you get only one miss for every 4 iterations. Use (i * 64) instead.
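For illustration, here is a minimal standalone version of the question's program with both fixes applied: only power-of-two sizes are kept (so size - 1 is a valid AND mask) and the stride is 64 elements as suggested above. The step count and size list are arbitrary choices of mine, not anything from the original code.

#include <stdio.h>
#include <time.h>

#define KB 1024
#define MB (1024 * 1024)

static int arr[4 * MB];   /* enough ints for the largest mask below */

int main(void)
{
    /* power-of-two working-set sizes only, so (size - 1) is a valid AND mask */
    int sizes[] = { 1*KB, 4*KB, 8*KB, 16*KB, 32*KB, 64*KB, 128*KB,
                    256*KB, 512*KB, 1*MB, 2*MB, 4*MB };
    unsigned int steps = 64 * 1024 * 1024;
    for (unsigned s = 0; s < sizeof(sizes)/sizeof(sizes[0]); s++)
    {
        int mask = sizes[s] - 1;
        clock_t start = clock();
        for (unsigned i = 0; i < steps; i++)
        {
            arr[(i * 64) & mask] *= 10;   /* 64-element stride per iteration */
            arr[(i * 64) & mask] /= 10;
        }
        printf("%d, %.8f\n", sizes[s] / 1024,
               (double)(clock() - start) / CLOCKS_PER_SEC);
    }
    return 0;
}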

Related

Get EDID info in C (UEFI): read the ES:DI register?

I am developing an OS and I want to get the EDID from the monitor. I found some asm code (https://wiki.osdev.org/EDID) to get the EDID into ES:DI:
mov ax, 0x4f15
mov bl, 0x01
xor cx, cx
xor dx, dx
int 0x10
;AL = 0x4F if function supported
;AH = status (0 is success, 1 is fail)
;ES:DI contains the EDID
How can I get the AL, AH, and ES:DI values in a C file?
Actually I am developing a 64-bit UEFI OS.
LoadGDT:
lgdt [rdi]
mov ax, 0x10
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ss, ax
pop rdi
mov rax, 0x08
push rax
push rdi
retfq
GLOBAL LoadGDT
I am able to run the above asm code and call it from C using global functions.
That page on osdev.org contains code intended to be run when the CPU is in 16-bit real mode.
You can tell not only from the registers involved but also from the fact that int 10h is used.
This is a well-known BIOS interrupt service that is written in 16-bit real-mode code.
If you target UEFI, then your bootloader is actually a UEFI application, which is a PE32(+) image.
If the CPU is 64-bit capable, the firmware will switch into long mode (64-bit mode) and load your bootloader.
Otherwise, it will switch into protected mode (32-bit mode).
In any case, real mode is never used in UEFI.
You can call 16-bit code from protected/long mode with the use of a 16-bit code segment in the GDT/LDT, but you cannot call real-mode code (i.e. code written to work with real-mode segmentation) because segmentation works completely differently between the modes.
Plus, in real mode interrupts are dispatched through the IVT and not the IDT, so you would need to get the original entry point for interrupt 10h.
UEFI protocol EFI_EDID_DISCOVERED_PROTOCOL
Luckily, UEFI has a replacement for most basic services offered by the legacy BIOS interface.
In this case, you can use the EFI_EDID_DISCOVERED_PROTOCOL and eventually apply any override from the platform firmware with the use of EFI_EDID_OVERRIDE_PROTOCOL.
The EFI_EDID_DISCOVERED_PROTOCOL is straightforward to use, it's just a (Size, Data) pair.
typedef struct _EFI_EDID_DISCOVERED_PROTOCOL {
    UINT32 SizeOfEdid;
    UINT8  *Edid;
} EFI_EDID_DISCOVERED_PROTOCOL;
(from gnu-efi)
The format of the buffer Edid can be found in the VESA specification or even on Wikipedia.
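As an aside (not part of this answer's code), the 128-byte EDID base block is easy to sanity-check before parsing: it starts with a fixed 8-byte header and all 128 bytes sum to zero modulo 256.

#include <stdint.h>
#include <stddef.h>

/* Basic sanity check of a 128-byte EDID base block: fixed header
   00 FF FF FF FF FF FF 00 plus a checksum byte that makes the whole
   block sum to 0 (mod 256). */
static int edid_looks_valid(const uint8_t *edid, size_t size)
{
    static const uint8_t magic[8] =
        { 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00 };
    uint8_t sum = 0;
    if (size < 128)
        return 0;
    for (size_t i = 0; i < 8; i++)
        if (edid[i] != magic[i])
            return 0;
    for (size_t i = 0; i < 128; i++)
        sum = (uint8_t)(sum + edid[i]);
    return sum == 0;
}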
As an example, I wrote a simple UEFI application with gnu-efi and x86_64-w64-mingw32 (a version of GCC and tools that target PEs).
I avoided using efilib.h in order to use gnu-efi just for the definitions of the UEFI-related structures.
The code sucks: it assumes at most 10 handles support the EDID protocol, and I wrote only a partial structure for the EDID data (because I got bored).
But this should be enough to get the idea.
NOTE that my VM didn't return any EDID information, so the code is not completely tested!
#include <efi.h>
//You are better off using this lib
//#include <efilib.h>
EFI_GUID gEfiEdidDiscoveredProtocolGuid = EFI_EDID_DISCOVERED_PROTOCOL_GUID;
EFI_SYSTEM_TABLE* gST = NULL;
typedef struct _EDID14 {
UINT8 Signature[8];
UINT16 ManufacturerID;
UINT16 ManufacturerCode;
UINT32 Serial;
UINT8 Week;
UINT8 Year;
UINT8 Major;
UINT8 Minor;
UINT32 InputParams;
UINT8 HSize;
UINT8 VSize;
UINT8 Gamma;
//...Omitted...
} EDID14_RAW;
VOID Print(CHAR16* string)
{
gST->ConOut->OutputString(gST->ConOut, string);
}
VOID PrintHex(UINT64 number)
{
CHAR16* digits = L"0123456789abcdef";
CHAR16 buffer[2] = {0, 0};
for (INTN i = 64-4; i >= 0; i-=4)
{
buffer[0] = digits[(number >> i) & 0xf];
Print(buffer);
}
}
VOID PrintDec(UINT64 number)
{
CHAR16 buffer[21] = {0};
UINTN i = 19;
do
{
buffer[i--] = L'0' + (number % 10);
number = number / 10;
}
while (number && i >= 0);
Print(buffer + i + 1);
}
#define MANUFACTURER_DECODE_LETTER(x) ( L'A' + ( (x) & 0x1f ) - 1 )
EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE* SystemTable)
{
EFI_STATUS Status = EFI_SUCCESS;
EFI_HANDLE EDIDHandles[10];
UINTN Size = sizeof(EFI_HANDLE) * 10;
EFI_EDID_DISCOVERED_PROTOCOL* EDID;
gST = SystemTable;
if ( EFI_ERROR( (Status = SystemTable->BootServices->LocateHandle(ByProtocol, &gEfiEdidDiscoveredProtocolGuid, NULL, &Size, EDIDHandles)) ) )
{
Print(L"Failed to get EDID handles: "); PrintHex(Status); Print(L"\r\n");
return Status;
}
for (INTN i = 0; i < Size/sizeof(EFI_HANDLE); i++)
{
if (EFI_ERROR( (SystemTable->BootServices->OpenProtocol(
EDIDHandles[i], &gEfiEdidDiscoveredProtocolGuid, (VOID**)&EDID, ImageHandle, NULL, EFI_OPEN_PROTOCOL_GET_PROTOCOL)) ) )
{
Print(L"Failed to get EDID info for handle "); PrintDec(i); Print(L": "); PrintHex(Status); Print(L"\r\n");
return Status;
}
if (EDID->SizeOfEdid == 0 || EDID->Edid == NULL)
{
Print(L"No EDID data for handle "); PrintDec(i); Print(L"\r\n");
continue;
}
/*
THIS CODE IS NOT TESTED!
! ! ! D O N O T U S E ! ! !
*/
EDID14_RAW* EdidData = (EDID14_RAW*)EDID->Edid;
CHAR16 Manufacturer[4] = {0};
Manufacturer[0] = MANUFACTURER_DECODE_LETTER(EdidData->ManufacturerID >> 10);
Manufacturer[1] = MANUFACTURER_DECODE_LETTER(EdidData->ManufacturerID >> 5);
Manufacturer[2] = MANUFACTURER_DECODE_LETTER(EdidData->ManufacturerID);
Print(L"Manufacturer ID: "); Print(Manufacturer); Print(L"\r\n");
Print(L"Resolution: "); PrintDec(EdidData->HSize); Print(L"X"); PrintDec(EdidData->VSize); Print(L"\r\n");
}
return Status;
}
ACPI
If you don't want to use these UEFI protocols you can use ACPI. Each display output device has a _DDC method that is documented in the ACPI specification and can be used to return the EDID data (either as a buffer of 128 or 256 bytes).
This method is conceptually simple but in practice it requires writing a full-blown ACPI parser (including the AML VM) which is a lot of work.
However, ACPI is necessary for modern OSes and so you can use it, later on, to get the EDID data without having to worry about UEFI protocols.

Floating point numbers and the effect on 8-bit microcontrollers memory

I am currently working on a project that includes bare-metal programming on an STM8 microcontroller using the SDCC compiler on Linux. The memory in the chip is quite low, so I'm trying to keep things really lean. I have gotten by with 8-bit and 16-bit variables and things have gone well, but recently I ran into a problem where I really needed a float variable. So I wrote a function that takes a 16-bit value, converts it to a float, does the math I need and returns an 8-bit number. This caused my final compiled code on the MCU to go from 1198 bytes to 3462 bytes. Now I understand that using floating point is memory intensive and that many functions may need to be called to handle it, but it seems crazy to increase the size of the program by that much. I would like some help understanding why this is and what exactly happened.
Specs: MCU stm8151f2
Compiler: SDCC with --opt_code_size option
int roundNo(uint16_t bit_input)
{
    float num = (((float)bit_input) - ADC_MIN) / 124.0;
    return num < 0 ? num - 0.5 : num + 0.5;
}
To determine why the code is so large on your particular tool chain, you would need to look at the generated assembly code, and see what FP support calls it makes, then look at the map file to determine the size of each of those functions.
As an example on Godbolt for AVR, using GCC 5.4.0 with -Os (Godbolt does not support STM8 or SDCC, so this is for comparison as an 8-bit architecture), your code generates 6364 bytes compared to 4081 bytes for an empty function. So the additional code required for the function body is 2283 bytes. Accounting for the fact that you are using both a different compiler and a different architecture, these numbers are not that different from your results. See in the generated code (below) the rcalls to subroutines such as __divsf3 - these are where the bulk of the code will be, and I suspect FP division is by far the larger contributor.
roundNo(unsigned int):
push r12
push r13
push r14
push r15
mov r22,r24
mov r23,r25
ldi r24,0
ldi r25,0
rcall __floatunsisf
ldi r18,0
ldi r19,0
ldi r20,0
ldi r21,lo8(69)
rcall __subsf3
ldi r18,0
ldi r19,0
ldi r20,lo8(-8)
ldi r21,lo8(66)
rcall __divsf3
mov r12,r22
mov r13,r23
mov r14,r24
mov r15,r25
ldi r18,0
ldi r19,0
ldi r20,0
ldi r21,0
rcall __ltsf2
ldi r18,0
ldi r19,0
ldi r20,0
ldi r21,lo8(63)
sbrs r24,7
rjmp .L6
mov r25,r15
mov r24,r14
mov r23,r13
mov r22,r12
rcall __subsf3
rjmp .L7
.L6:
mov r25,r15
mov r24,r14
mov r23,r13
mov r22,r12
rcall __addsf3
.L7:
rcall __fixsfsi
mov r24,r22
mov r25,r23
pop r15
pop r14
pop r13
pop r12
ret
You need to perform the same analysis on the code generated by your tool chain to answer your question. No doubt SDCC is capable of generating an assembly listing and a map file which will allow you to determine exactly what code and FP support is being generated and linked.
Ultimately though your use of FP in this case is entirely unnecessary:
int roundNo(uint16_t bit_input)
{
    int s = (bit_input - ADC_MIN);
    s += s < 0 ? -62 : 62;
    return s / 124;
}
At Godbolt this is 2283 bytes compared to an empty function. Still somewhat large, but the issue there is most likely that the AVR lacks a DIV instruction, so it calls __divmodhi4. The STM8 has a DIV for a 16-bit dividend and an 8-bit divisor, so it will likely be significantly smaller (and faster) on your target.
OK, a version of fixed point that actually works:
#include <stdint.h>
#include <stdio.h>

// Assume a 28.4 format for math. 12.4 can be used, but roundoff may occur.
// Input should be a literal float. (Note that the multiply here will be handled by
// the compiler and will not generate FP asm code.)
#define TO_FIXED(x) ((int)((x) * 16))
// Takes a fixed and converts to an int - should turn into a right shift by 4.
#define TO_INT(x)   ((int)((x) / 16))

typedef int FIXED;

const uint16_t ADC_MIN = 32768;

int roundNo(uint16_t bit_input)
{
    FIXED num = TO_FIXED(bit_input - ADC_MIN) / 124;
    num += num < 0 ? TO_FIXED(-0.5) : TO_FIXED(0.5);
    return TO_INT(num);
}

int main()
{
    printf("%d", roundNo(0));
    return 0;
}
Note we are using some 32-bit values here so it will be bigger than your current values. With care though, it could possibly convert back to a 12.4 (16-bit int) instead if round off and overflow can be managed carefully.
Or go grab a better full feature Fixed Point library from the web :)
(Update) After writing this, I noticed that @Clifford mentioned that your microcontroller supports the DIV instruction natively, in which case doing this is redundant. Anyway, I will leave it as a concept which can be applied in cases where DIV is implemented as an extern call, or where DIV takes too many cycles and the goal is to make the calculation faster.
Anyway, shifting and adding is likely to be faster than division, if you ever need to squeeze some extra cycles. So if you start from the fact that 124 is almost equal to 4096/33 (the error factor is 0.00098, i.e. 0.098%, so less than 1 in 1000), you can implement the division with a single multiplication with 33 and a shift by 12 bits (division by 4096). Furthermore, 33 is 32+1, meaning multiplying by 33 is equal to shifting left by 5 and adding the input again.
Example: you want to divide 5000 by 124, and 5000/124 is approx. 40.323. What we will be doing is:
5,000 << 5 = 160,000
160,000 + 5,000 = 165,000
165,000 >> 12 = 40
Note that this only works for positive numbers. Also note that, if you're really doing lots of multiplications all over the code, then having a single extern mul or div function might result in smaller overall code in the long run, especially if the compiler is not particularly good at optimizing. And if the compiler can just emit a DIV instruction here, then the only thing you can get is a tiny bit of speed improvement, so don't bother with this.
#include <stdint.h>

#define ADC_MIN 2048

uint16_t roundNo(uint16_t bit_input)
{
    // input too low, return zero
    if (bit_input < ADC_MIN)
        return 0;
    bit_input -= (ADC_MIN - 62);
    uint32_t x = bit_input;
    // this gets us x = x * 33
    x <<= 5;
    x += bit_input;
    // this gets us x = x / 4096
    x >>= 12;
    return (uint16_t)x;
}
GCC AVR with size optimizations produces this, i.e. all calls to extern mul or div functions are gone, but it seems AVR doesn't support shifting by multiple bits in a single instruction (it emits loops which shift 5 times and 12 times respectively). I have no clue what your compiler will do.
If you also need to handle the bit_input < ADC_MIN case, I would handle this part separately, i.e.:
#include <stdint.h>
#include <stdbool.h>

#define ADC_MIN 2048

int16_t roundNo(uint16_t bit_input)
{
    // if subtraction would result in a negative value,
    // handle it properly
    bool negative = (bit_input < ADC_MIN);
    bit_input = negative ? (ADC_MIN - bit_input) : (bit_input - ADC_MIN);
    // we are always positive from this point on
    bit_input += 62;    // add the rounding offset (half of 124)
    uint32_t x = bit_input;
    x <<= 5;
    x += bit_input;
    x >>= 12;
    return negative ? -(int16_t)x : (int16_t)x;
}

Delphi Optimize IndexOf Function

Could someone help me speed up my Delphi function
to find a value in a byte array without using binary search?
I call this function thousands of times; is it possible to optimize it with assembly?
Thank you so much.
function IndexOf(const List: TArray<Byte>; const Value: Byte): integer;
var
  I: integer;
begin
  for I := Low(List) to High(List) do
  begin
    if List[I] = Value then
      Exit(I);
  end;
  Result := -1;
end;
The length of the array is about 15 items.
Well, let's think. At first, please edit this line:
For I := Low( List ) to High( List ) do
(you forgot 'do' at the end). When we compile it without optimization, here is the assembly code for this loop:
Unit1.pas.29: If List [I] = Value then
005C5E7A 8B45FC mov eax,[ebp-$04]
005C5E7D 8B55F0 mov edx,[ebp-$10]
005C5E80 8A0410 mov al,[eax+edx]
005C5E83 3A45FB cmp al,[ebp-$05]
005C5E86 7508 jnz $005c5e90
Unit1.pas.30: Exit (I);
005C5E88 8B45F0 mov eax,[ebp-$10]
005C5E8B 8945F4 mov [ebp-$0c],eax
005C5E8E EB0F jmp $005c5e9f
005C5E90 FF45F0 inc dword ptr [ebp-$10]
Unit1.pas.28: For I := Low (List) to High (List) do
005C5E93 FF4DEC dec dword ptr [ebp-$14]
005C5E96 75E2 jnz $005c5e7a
This code is far from optimal: the local variable i really is a local variable, that is, it is stored in RAM, on the stack (you can see this from the [ebp-$10] addresses; ebp is the stack frame pointer).
So at each new iteration we load the address of the array into the eax register (mov eax, [ebp-$04]),
then we load i from the stack into the edx register (mov edx, [ebp-$10]),
then at last we load List[i] into the al register, which is the low byte of eax (mov al, [eax+edx]),
after which we compare it with the argument Value, taken again from memory, not from a register!
This implementation is extremely slow.
But let's turn optimization on at last! It's done in Project options -> Compiling -> Code generation. Let's look at the new code:
Unit1.pas.29: If List [I] = Value then
005C5E5A 3A1408 cmp dl,[eax+ecx]
005C5E5D 7504 jnz $005c5e63
Unit1.pas.30: Exit (I);
005C5E5F 8BC1 mov eax,ecx
005C5E61 5E pop esi
005C5E62 C3 ret
005C5E63 41 inc ecx
Unit1.pas.28: For I := Low (List) to High (List) do
005C5E64 4E dec esi
005C5E65 75F3 jnz $005c5e5a
Now there are just 4 lines of code which get repeated over and over.
Value is stored inside dl register (lower byte of edx register),
address of 0-th element of array is stored in eax register,
i is stored in ecx register.
So the line 'if List[i] = Value' converts into just 1 assembly line:
005C5E5A 3A1408 cmp dl,[eax+ecx]
The next line is a conditional jump, the 3 lines after that are executed just once or never (when the condition is true), and at the end there is the increment of i
and the decrement of the loop counter (it's easier to compare it with zero than with anything else).
So there is little we can do that the Delphi compiler with the optimizer didn't already do!
If it's permitted by your program, you can try to reverse the direction of the search, from the last element to the first:
For I := High( List ) downto Low( List ) do
This way the compiler is happy to compare i with zero to detect that we have checked everything (this operation is free: when we decrement i and reach zero, the CPU zero flag is set!)
But with such an implementation the behaviour may differ: if several entries are equal to Value, you'll get not the first one, but the last one!
Another very easy thing is to declare this IndexOf function as inline: that way there will probably be no function call at all; the code will be inserted at each place where you call it. Function calls are rather slow.
There are also some crazy methods described in Knuth on how to search a simple array as fast as possible. He introduces a 'dummy' last element of the array which equals your Value; that way you don't have to check the boundary (the scan will always find something before going out of range), so there is just 1 condition inside the loop instead of 2 (see the sketch below). Another method is 'unrolling' the loop: you write out 2 or 3 or more iterations inside the loop body, so there are fewer jumps per check, but this has even more downsides: it is beneficial only for rather large arrays and may make things even slower for arrays with 1 or 2 elements.
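To illustrate the sentinel idea (sketched in C for brevity; the same shape works in Delphi, and it assumes the caller reserves one spare slot past the data that may be overwritten):

/* Sentinel search: write the sought value into the spare slot after the data
   so the scan always terminates, leaving a single comparison inside the loop. */
int index_of_sentinel(unsigned char *list, int len, unsigned char value)
{
    list[len] = value;          /* sentinel; needs one extra slot after the data */
    int i = 0;
    while (list[i] != value)
        i++;
    return (i < len) ? i : -1;  /* stopped at the sentinel -> not found */
}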
As others said: the biggest improvement would come from understanding what kind of data you store. Does it change frequently or stay the same for a long time? Do you look for random elements, or are there some 'leaders' which get most of the attention? Must the elements stay in the same order you put them in, or is it allowed to rearrange them as you wish? Then you can choose the data structure accordingly. If you look for the same 1 or 2 entries all the time and they can be rearranged, a simple 'move-to-front' method would be great: you don't just return the index, you first move the element to the first place, so it will be found very quickly the next time.
If your arrays are long, you can use the x86 built in string scan REP SCAS.
It is coded in microcode and has a moderate start-up time, but it is
heavily optimized in the CPU and runs fast given long enough data structures (>= 100 bytes).
In fact on a modern CPU it frequently outperforms very clever RISC code.
If your arrays are short, then no amount of optimization of this routine will help, because then your problem is in code not shown in the question, so there is no answer I can give you.
See: http://docwiki.embarcadero.com/RADStudio/Tokyo/en/Internal_Data_Formats_(Delphi)
function IndexOf({$ifndef RunInSeperateThread} const {$endif} List: TArray<byte>; const Value: byte): integer;
//Lock the array if you run this in a separate thread.
{$ifdef CPUX64}
asm
//RCX = List
//DL = byte.
mov r8,[rcx-8] //3 - get the length ASAP.
push rdi //0 - hidden in mov r,m
mov eax,edx //0 - rename
mov rdi,rcx //0 - rename
mov rcx,r8 //0 - rename
mov rdx,r8 //0 - remember the length
//8 cycles setup
repne scasb //2n - repeat until byte found.
pop rdi //1
neg rcx //0
lea rax,[rdx+rcx] //1 result = length - bytes left.
end;
{$ENDIF}
{$ifdef CPUX86}
asm
//EAX = List
//DL = byte.
push edi
mov edi,eax
mov ecx,[eax-4] //get the length
mov eax,edx
mov edx,ecx //remember the length
repne scasb //repeat until byte found.
pop edi
neg ecx
lea eax,[edx+ecx] //result = length - bytes left.
end;
{$endif}
Timings
On my laptop, using a 1 KB array with the target byte at the end, this gives the following timings (lowest time over 100,000 runs):
Code                           | CPU cycles
                               | Len=1024 | Len=16
-------------------------------+----------+--------
Your code, optimizations off   |     5775 |    146
Your code, optimizations on    |     4540 |     93
X86 my code                    |     2726 |     60
X64 my code                    |     2733 |     69
The speed-up is OK(ish), but hardly worth the effort.
If your arrays are short, this code will not help you and you'll have to resort to other options to optimize your code.
Speed up possible when using binary search
Binary search is an O(log n) operation, vs O(n) for a naive search.
Using the same array, this will find your data in about log2(1024) * (cycles per step) = 10 * 20 ≈ 200 cycles: a 10+ times speed-up over my optimized code.
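For completeness, the O(log n) lookup itself is tiny (shown in C for brevity); the real cost is that the caller must keep the array sorted, which also changes which index you get back when a value occurs more than once.

/* Binary search over a sorted byte array; returns an index of value or -1. */
int index_of_sorted(const unsigned char *list, int len, unsigned char value)
{
    int lo = 0, hi = len - 1;
    while (lo <= hi)
    {
        int mid = lo + (hi - lo) / 2;
        if (list[mid] == value)
            return mid;
        if (list[mid] < value)
            lo = mid + 1;
        else
            hi = mid - 1;
    }
    return -1;
}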

INT 13 Extension Read in C

I can use the extended read functions of BIOS int 13h from assembly just fine,
with the code below:
; *************************************************************************
; Setup DISK ADDRESS PACKET
; *************************************************************************
jmp strtRead
DAPACK :
db 010h ; Packet Size
db 0 ; Always 0
blkcnt:
dw 1 ; Sectors Count
db_add :
dw 07e00h ; Transfer Offset
dw 0 ; Transfer Segment
d_lba :
dd 1 ; Starting LBA(0 - n)
dd 0 ; Bios 48 bit LBA
; *************************************************************************
; Start Reading Sectors using INT13 Func 42
; *************************************************************************
strtRead:
mov si, OFFSET DAPACK; Load DPACK offset to SI
mov ah, 042h ; Function 42h
mov dl, 080h ; Drive ID
int 013h; Call INT13h
I want to convert this into a C-callable function, but I have no idea how to transfer the parameters from C to asm, like the drive ID, sector count, buffer segment:offset, etc.
I am using MSVC and MASM and working with nothing except BIOS functions.
Can anyone help?
Update:
I have tried the function below, but nothing is ever loaded into the buffer.
void read_sector()
{
    static unsigned char currentMBR[512] = { 0 };
    struct disk_packet  //needed for int13 42h
    {
        byte size_pack;    //size of packet must be 16 or 16+
        byte reserved1;    //reserved
        byte no_of_blocks; //nof blocks for transfer
        byte reserved2;    //reserved
        word offset;       //offset address
        word segment;      //segment address
        dword lba1;
        dword lba2;
    } disk_pack;

    disk_pack.size_pack = 16;                 //set size to 16
    disk_pack.no_of_blocks = 1;               //1 block ie read one sector
    disk_pack.reserved1 = 0;                  //reserved word
    disk_pack.reserved2 = 0;                  //reserved word
    disk_pack.segment = 0;                    //segment of buffer
    disk_pack.offset = (word)&currentMBR[0];  //offset of buffer
    disk_pack.lba1 = 0;                       //lba first 32 bits
    disk_pack.lba2 = 0;                       //last 32 bit address
    _asm
    {
        mov dl, 080h;
        mov [disk_pack.segment], ds;
        mov si, disk_pack;
        mov ah, 42h;
        int 13h
        ; jc NoError;     //No error, ignore error code
        ; mov bError, ah; // Error, get the error code
    NoError:
    }
}
Sorry to post this as "answer"; I want to post this as "comment" but it is too long...
Different compilers have a different syntax of inline assembly. This means that the correct syntax of the following lines:
mov[disk_pack.segment], ds;
mov si, disk_pack;
... depends on the compiler used. Unfortunately I do not use 16-bit C compilers, so I cannot help you on this point.
The next thing I see in your program is the following one:
disk_pack.segment = 0; //segment of buffer
disk_pack.offset = (word)&currentMBR[0]; //offset of buffer
With a 99% chance this will lead to a problem. Instead I would do the following:
struct disk_packet //needed for int13 42h
{
    byte size_pack;
    byte reserved;
    word no_of_blocks;  // note that this is 16-bit!
    void far *data;     // <- This line is the main change!
    dword lba1;
    dword lba2;
} disk_pack;
...
disk_pack.size_pack = 16;
disk_pack.no_of_blocks = 1;
disk_pack.reserved = 0;
disk_pack.data = &currentMBR[0]; // also note the change here
disk_pack.lba1 = 0;
disk_pack.lba2 = 0;
...
Note that some compilers name the keyword "_far" or "__far" instead of "far".
A third problem is that some (buggy) BIOSes require ES to be equal to the segment value from the disk_pack, and a fourth one is that many compilers require the inline assembly code not to modify any registers (AX, CX and DX are normally OK).
These two could be solved the following way:
push ds;
push es;
push si;
mov dl, 080h;
// TODO here: Set ds:si to disk_pack in a compiler-specific way
mov es,[si+6];
mov ah, 42h;
int 13h;
...
pop si;
pop es;
pop ds;
In my opinion the "#pragma pack" should not be necessary because all elements in the structure are properly aligned.
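One more option for filling the packet without touching segment registers in asm: most 16-bit DOS compilers (Borland, and as far as I remember also Microsoft's 16-bit compilers) provide FP_SEG/FP_OFF macros in <dos.h>. Treat the exact header and macro names as an assumption to verify on your toolchain.

#include <dos.h>   /* FP_SEG / FP_OFF - verify availability on your compiler */

/* inside read_sector(), using the question's disk_pack and currentMBR: */
disk_pack.segment = FP_SEG((void far *)currentMBR);   /* segment of buffer */
disk_pack.offset  = FP_OFF((void far *)currentMBR);   /* offset of buffer  */
/* the inline asm then no longer needs to overwrite disk_pack.segment with DS */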

Decimal concatenation of two integers using bit operations

I want to concatenate two integers using only bit operations, since I need as much efficiency as possible. There are various answers available, but they are not fast enough; what I want is an implementation that uses only bit operations like left shift, OR, and so on.
Please guide me on how to do it.
example
int x=32;
int y=12;
int result=3212;
I have an FPGA implementation of AES. I need this on my system to reduce the time consumption of some task.
The most efficient way to do it is likely something similar to this:
uint32_t uintcat (uint32_t ms, uint32_t ls)
{
    uint32_t mult = 1;
    do
    {
        mult *= 10;
    } while (mult <= ls);
    return ms * mult + ls;
}
Then let the compiler worry about optimization. There's likely not a lot it can improve since this is base 10, which doesn't get along well with the various instructions of the computer, like shifting.
EDIT : BENCHMARK TEST
Intel i7-3770 @ 3.4 GHz
OS: Windows 7/64
Mingw, GCC version 4.6.2
gcc -O3 -std=c99 -pedantic-errors -Wall
10 million random values, from 0 to 3,276,732,767.
Result (approximates):
Algorithm 1: 60287 micro seconds
Algorithm 2: 65185 micro seconds
Benchmark code used:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>  /* malloc/free/srand */
#include <windows.h>
#include <time.h>
uint32_t uintcat (uint32_t ms, uint32_t ls)
{
uint32_t mult=1;
do
{
mult *= 10;
} while(mult <= ls);
return ms * mult + ls;
}
uint32_t myConcat (uint32_t a, uint32_t b) {
switch( (b >= 10000000) ? 7 :
(b >= 1000000) ? 6 :
(b >= 100000) ? 5 :
(b >= 10000) ? 4 :
(b >= 1000) ? 3 :
(b >= 100) ? 2 :
(b >= 10) ? 1 : 0 ) {
case 1: return a*100+b; break;
case 2: return a*1000+b; break;
case 3: return a*10000+b; break;
case 4: return a*100000+b; break;
case 5: return a*1000000+b; break;
case 6: return a*10000000+b; break;
case 7: return a*100000000+b; break;
default: return a*10+b; break;
}
}
static LARGE_INTEGER freq;
static void print_benchmark_results (LARGE_INTEGER* start, LARGE_INTEGER* end)
{
LARGE_INTEGER elapsed;
elapsed.QuadPart = end->QuadPart - start->QuadPart;
elapsed.QuadPart *= 1000000;
elapsed.QuadPart /= freq.QuadPart;
printf("%lu micro seconds", elapsed.QuadPart);
}
int main()
{
const uint32_t TEST_N = 10000000;
uint32_t* data1 = malloc (sizeof(uint32_t) * TEST_N);
uint32_t* data2 = malloc (sizeof(uint32_t) * TEST_N);
volatile uint32_t* result_algo1 = malloc (sizeof(uint32_t) * TEST_N);
volatile uint32_t* result_algo2 = malloc (sizeof(uint32_t) * TEST_N);
srand (time(NULL));
// Mingw rand() apparently gives numbers up to 32767
// worst case should therefore be 3,276,732,767
// fill up random data in arrays
for(uint32_t i=0; i<TEST_N; i++)
{
data1[i] = rand();
data2[i] = rand();
}
QueryPerformanceFrequency(&freq);
LARGE_INTEGER start, end;
// run algorithm 1
QueryPerformanceCounter(&start);
for(uint32_t i=0; i<TEST_N; i++)
{
result_algo1[i] = uintcat(data1[i], data2[i]);
}
QueryPerformanceCounter(&end);
// print results
printf("Algorithm 1: ");
print_benchmark_results(&start, &end);
printf("\n");
// run algorithm 2
QueryPerformanceCounter(&start);
for(uint32_t i=0; i<TEST_N; i++)
{
result_algo2[i] = myConcat(data1[i], data2[i]);
}
QueryPerformanceCounter(&end);
// print results
printf("Algorithm 2: ");
print_benchmark_results(&start, &end);
printf("\n\n");
// sanity check both algorithms against each other
for(uint32_t i=0; i<TEST_N; i++)
{
if(result_algo1[i] != result_algo2[i])
{
printf("Results mismatch for %lu %lu. Expected: %lu%lu, algo1: %lu, algo2: %lu\n",
data1[i],
data2[i],
data1[i],
data2[i],
result_algo1[i],
result_algo2[i]);
}
}
// clean up
free((void*)data1);
free((void*)data2);
free((void*)result_algo1);
free((void*)result_algo2);
}
Bit operations work on the binary representation of the numbers, but what you are trying to achieve is to concatenate the numbers in decimal notation. Please note that concatenating the decimal representations has little to do with concatenating the binary representations: for example, bit-concatenating 32 (100000b) and 12 (1100b) would give 1000001100b = 524, not 3212. Though it is theoretically possible to solve the problem using binary operations, I am sure it would be far from the most efficient way.
We need to calculate a*10^N + b very fast.
Bit operations aren't the best way to optimize it (even tricks like a := (a<<1) + (a<<3) <=> a := a*10 are pointless, as the compiler does that by itself).
The first problem is to calculate 10^N, but there is no need to calculate it: there are just 9 possible values.
The second problem is to calculate N from b (the length of the decimal representation of b). If your data has a uniform distribution you can minimize the operation count in the average case.
Check b <= 10^9, b <= 10^8, ..., b <= 10 with ()?: (it's faster than if() after optimizations, and it has much simpler grammar and functionality), and call the result N. Next, make a switch(N) with lines "return a*10^N + b" (where 10^N is a constant). As far as I know, a switch() with 3-4 cases is faster than the same if() construction after optimizations.
unsigned int myConcat(unsigned int& a, unsigned int& b) {
switch( (b >= 10000000) ? 7 :
(b >= 1000000) ? 6 :
(b >= 100000) ? 5 :
(b >= 10000) ? 4 :
(b >= 1000) ? 3 :
(b >= 100) ? 2 :
(b >= 10) ? 1 : 0 ) {
case 1: return a*100+b; break;
case 2: return a*1000+b; break;
case 3: return a*10000+b; break;
case 4: return a*100000+b; break;
case 5: return a*1000000+b; break;
case 6: return a*10000000+b; break;
case 7: return a*100000000+b; break;
default: return a*10+b; break;
// I don't really know what to do here
//case 8: return a*1000*1000*1000+b; break;
//case 9: return a*10*1000*1000*1000+b; break;
}
}
As you can see, there are 2-3 operations in the average case, and optimization is very effective here. I've benchmarked it in comparison with Lundin's suggestion; here is the result: 0 ms vs 100 ms.
If you care about decimal digit concatenation, you might want to simply do that as you're printing, and convert both numbers to a sequence of digits sequentially. e.g. How do I print an integer in Assembly Level Programming without printf from the c library? shows an efficient C function, as well as asm. Call it twice into the same buffer (a trivial standard-library sketch of the idea follows below).
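A trivial sketch of that approach with the standard library (fine when the result is only needed as text anyway; this is not the hand-rolled digit routine from the linked answer):

#include <stdio.h>
#include <stdint.h>

/* Write the decimal digits of ms immediately followed by the digits of ls. */
static int concat_to_buffer(char *buf, size_t bufsize, uint32_t ms, uint32_t ls)
{
    return snprintf(buf, bufsize, "%u%u", (unsigned)ms, (unsigned)ls);
}

int main(void)
{
    char buf[24];
    concat_to_buffer(buf, sizeof buf, 32, 12);
    puts(buf);   /* prints 3212 */
    return 0;
}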
@Lundin's answer loops over increasing powers of 10 to find
the right decimal shift, i.e. a linear search for the right power of 10. If it's called very frequently, so that a lookup table can stay hot in cache, a speedup may be possible.
If you can use GNU C __builtin_clz (Count Leading Zeros) or some other way of quickly finding the MSB position of the right-hand input (ls, the least-significant part of the resulting concatenation), you can start the search for the right mult from a 32-entry lookup table. (And you only have to check at most one more iteration, so it's not a loop.)
Most of the common modern CPU architectures have a HW instruction that the compiler can use directly or with a bit of processing to implement clz. https://en.wikipedia.org/wiki/Find_first_set#Hardware_support. (And on all but x86, the result is well-defined for an input of 0, but unfortunately GNU C doesn't portably give us access to that.)
If the table stays hot in L1d cache, this can be pretty good. The extra latency of a clz and a table lookup are comparable to a couple iterations of the loop (on a modern x86 like Skylake or Ryzen for example, where bsf or tzcnt is 3 cycle latency, L1d latency is 4 or 5 cycles, imul latency is 3 cycles.)
Of course, on many architectures (including x86), multiplying by 10 is cheaper than by a runtime variable, using shift and add. 2 LEA instructions on x86, or an add+lsl on ARM/AArch64 using a shifted input to do tmp = x + x*4 with the add. So on Intel CPUs, we're only looking at a 2-cycle loop-carried dependency chain, not 3. But AMD CPUs have slower LEA when using a scaled index.
This doesn't sound great for small numbers. But it can reduce branch mispredictions by needing at most one iteration. It even makes a branchless implementation possible. And it means less total work for large lower parts (large powers of 10). But large integers will easily overflow unless you use a wider result type.
Unfortunately, 10 is not a power of 2, so the MSB position alone can't give us the exact power of 10 to multiply by. e.g. all numbers from 64 to 127 all have MSB = 1<<7, but some of them have 2 decimal digits and some have 3. Since we want to avoid division (because it requires a multiplication by a magic constant and shifting the high half), we want to always start with the lower power of 10 and see if that's big enough.
But fortunately, a bitscan does get us within one power of 10 so we no longer need a loop.
I probably wouldn't have written the part with _lzcnt_u32 or ARM __clz if I'd learned of the clz(a|1) trick for avoiding problems with input=0 beforehand. But I did, and played around with the source a bit to try to get nicer asm from gcc and clang. Indexing the table on clz or BSR depending on the target platform makes it a bit of a mess.
#include <stdint.h>
#include <limits.h>
#include <assert.h>
// builtin_clz matches Intel's docs for x86 BSR: garbage result for input=0
// actual x86 HW leaves the destination register unmodified; AMD even documents this.
// but GNU C doesn't let us take advantage with intrinsics.
// unless you use BMI1 _lzcnt_u32
// if available, use an intrinsic that gives us a leading-zero count
// *without* an undefined result for input=0
#ifdef __LZCNT__ // x86 CPU feature
#include <immintrin.h> // Intel's intrinsics
#define HAVE_LZCNT32
#define lzcnt32(a) _lzcnt_u32(a)
#endif
#ifdef __ARM__ // TODO: do older ARMs not have this?
#define HAVE_LZCNT32
#define lzcnt32(a) __clz(a) // builtin, no header needed
#endif
// Some POWER compilers define `__cntlzw`?
// index = msb position, or lzcnt, depending on which the HW can do more efficiently
// defined later; one or the other is unused and optimized out, depending on target platform
// alternative: fill this at run-time startup
// with a loop that does mult*=10 when (x<<1)-1 > mult, or something
//#if INDEX_BY_MSB_POS == 1
__attribute__((unused))
static const uint32_t catpower_msb[] = {
10, // 1 and 0
10, // 2..3
10, // 4..7
10, // 8..15
100, // 16..31 // 2 digits even for the low end of the range
100, // 32..63
100, // 64..127
1000, // 128..255 // 3 digits
1000, // 256..511
1000, // 512..1023
10000, // 1024..2047
10000, // 2048..4095
10000, // 4096..8191
10000, // 8192..16383
100000, // 16384..32767
100000, // 32768..65535 // up to 2^16-1, enough for 16-bit inputs
// ... // fill in the rest yourself
};
//#elif INDEX_BY_MSB_POS == 0
// index on leading zeros
__attribute__((unused))
static const uint32_t catpower_lz32[] = {
// top entries overflow: 10^10 doesn't fit in uint32_t
// intentionally wrong to make it easier to spot bad output.
4000000000, // 2^31 .. 2^32-1 2*10^9 .. 4*10^9
2000000000, // 1,073,741,824 .. 2,147,483,647
// first correct entry
1000000000, // 536,870,912 .. 1,073,741,823
// ... fill in the rest
// for testing, skip until 16 leading zeros
[16] = 100000, // 32768..65535 // up to 2^16-1, enough for 16-bit inputs
100000, // 16384..32767
10000, // 8192..16383
10000, // 4096..8191
10000, // 2048..4095
10000, // 1024..2047
1000, // 512..1023
1000, // 256..511
1000, // 128..255
100, // 64..127
100, // 32..63
100, // 16..31 // low end of the range has 2 digits
10, // 8..15
10, // 4..7
10, // 2..3
10, // 1
// lzcnt32(0) == 32
10, // 0 // treat 0 as having one significant digit.
};
//#else
//#error "INDEX_BY_MSB_POS not set correctly"
//#endif
//#undef HAVE_LZCNT32 // codegen for the other path, for fun
static inline uint32_t msb_power10(uint32_t a)
{
#ifdef HAVE_LZCNT32 // 0-safe lzcnt32 macro available
#define INDEX_BY_MSB_POS 0
// a |= 1 would let us shorten the table, in case 32*4 is a lot nicer than 33*4 bytes
unsigned lzcnt = lzcnt32(a); // 32 for a=0
return catpower_lz32[lzcnt];
#else
// only generic __builtin_clz available
static_assert(sizeof(uint32_t) == sizeof(unsigned) && UINT_MAX == (1ULL<<32)-1, "__builtin_clz isn't 32-bit");
// See also https://foonathan.net/blog/2016/02/11/implementation-challenge-2.html
// for C++ templates for fixed-width wrappers for __builtin_clz
#if defined(__i386__) || defined(__x86_64__)
// x86 where MSB_index = 31-clz = BSR is most efficient
#define INDEX_BY_MSB_POS 1
unsigned msb = 31 - __builtin_clz(a|1); // BSR
return catpower_msb[msb];
//return unlikely(a==0) ? 10 : catpower_msb[msb];
#else
// use clz directly while still avoiding input=0
// I think all non-x86 CPUs with hardware CLZ do define clz(0) = 32 or 64 (the operand width),
// but gcc's builtin is still documented as not valid for input=0
// Most ISAs like PowerPC and ARM that have a bitscan instruction have clz, not MSB-index
// set the LSB to avoid the a==0 special case
unsigned clz = __builtin_clz(a|1);
// table[32] unused, could add yet another #ifdef for that
#define INDEX_BY_MSB_POS 0
//return unlikely(a==0) ? 10 : catpower_lz32[clz];
return catpower_lz32[clz]; // a|1 avoids the special-casing
#endif // optimize for BSR or not
#endif // HAVE_LZCNT32
}
uint32_t uintcat (uint32_t ms, uint32_t ls)
{
// if (ls==0) return ms * 10; // Another way to avoid the special case for clz
uint32_t mult = msb_power10(ls); // catpower[clz(ls)];
uint32_t high = mult * ms;
#if 0
if (mult <= ls)
high *= 10;
return high + ls;
#else
// hopefully compute both and then select
// because some CPUs can shift and add at the same time (x86, ARM)
// so this avoids having an ADD *after* the cmov / csel, if the compiler is smart
uint32_t another10 = high*10 + ls;
uint32_t enough = high + ls;
return (mult<=ls) ? another10 : enough;
#endif
}
From the Godbolt compiler explorer, this compiles efficiently for x86-64 with and without BSR:
# clang7.0 -O3 for x86-64 SysV, -march=skylake -mno-lzcnt
uintcat(unsigned int, unsigned int):
mov eax, esi
or eax, 1
bsr eax, eax # 31-clz(ls|1)
mov ecx, dword ptr [4*rax + catpower_msb]
imul edi, ecx # high = mult * ms
lea eax, [rdi + rdi]
lea eax, [rax + 4*rax] # retval = high * 10
cmp ecx, esi
cmova eax, edi # if(mult>ls) retval = high (drop the *10 result)
add eax, esi # retval += ls
ret
Or with lzcnt, (enabled by -march=haswell or later, or some AMD uarches),
uintcat(unsigned int, unsigned int):
# clang doesn't try to break the false dependency on EAX; gcc uses xor eax,eax
lzcnt eax, esi # C source avoids the |1, saving instructions
mov ecx, dword ptr [4*rax + catpower_lz32]
imul edi, ecx # same as above from here on
lea eax, [rdi + rdi]
lea eax, [rax + 4*rax]
cmp ecx, esi
cmova eax, edi
add eax, esi
ret
Factoring the last add out of both sides of the ternary is a missed optimization, adding 1 cycle of latency after the cmov. We can multiply by 10 and add just as cheaply as multiplying by 10 alone, on Intel CPUs:
... same start # hand-optimized version that clang should use
imul edi, ecx # high = mult * ms
lea eax, [rdi + 4*rdi] # high * 5
lea eax, [rsi + rax*2] # retval = high * 10 + ls
add edi, esi # tmp = high + ls
cmp ecx, esi
cmova eax, edi # if(mult>ls) retval = high+ls
ret
So the high + ls latency would run in parallel with the high*10 + ls latency, both needed as inputs for cmov.
GCC branches instead of using CMOV for the last condition. GCC also makes a mess of 31-clz(a|1), calculating clz with BSR and XOR with 31. But then subtracting that from 31. And it has some extra mov instructions. Strangely, gcc seems to do better with that BSR code when lzcnt is available, even though it chooses not to use it.
clang has no trouble optimizing away the 31-clz double-inversion and just using BSR directly.
For PowerPC64, clang also makes branchless asm. gcc does something similar, but with a branch like on x86-64.
uintcat:
.Lfunc_gep0:
addis 2, 12, .TOC.-.Lfunc_gep0#ha
addi 2, 2, .TOC.-.Lfunc_gep0#l
ori 6, 4, 1 # OR immediate
addis 5, 2, catpower_lz32#toc#ha
cntlzw 6, 6 # CLZ word
addi 5, 5, catpower_lz32#toc#l # static table address
rldic 6, 6, 2, 30 # rotate left and clear immediate (shift and zero-extend the CLZ result)
lwzx 5, 5, 6 # Load Word Zero eXtend, catpower_lz32[clz]
mullw 3, 5, 3 # mul word
cmplw 5, 4 # compare mult, ls
mulli 6, 3, 10 # mul immediate
isel 3, 3, 6, 1 # conditional select high vs. high*10
add 3, 3, 4 # + ls
clrldi 3, 3, 32 # zero extend, clearing upper 32 bits
blr # return
Compressing the table
Using clz(ls|1) >> 1 or that +1 should work, because 4 < 10. The table always takes at least 3 entries to gain another digit. I haven't investigated this. (And have already spent longer than I meant to on this. :P)
Or right-shift a lot more to just get a starting point for the loop. e.g. mult = clz(ls) >= 18 ? 100000 : 10;, or a 3 or 4-way chain of if.
Or loop on mult *= 100, and after exiting that loop sort out whether you want old_mult * 10 or mult (i.e. check if you went too far). This cuts the iteration count in half for even numbers of digits (a sketch follows below).
(Watch out for a possible infinite loop on large ls that would overflow the result. If mult *= 100 wraps to 0, it will always stay <= ls for ls = 1000000000, for example.)
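A sketch of that variant (my own illustration, not benchmarked): step the decimal shift two digits at a time, back off one digit if it overshot, and cap mult so the wrap-around case mentioned above cannot loop forever.

#include <stdint.h>

/* Advance the power of 10 two digits per iteration, then back off one digit
   if we overshot. The cap keeps mult from wrapping to 0 on huge ls values;
   the result can still overflow uint32_t for large inputs, exactly like the
   original uintcat(). */
uint32_t uintcat_by100(uint32_t ms, uint32_t ls)
{
    uint32_t mult = 10;
    while (mult <= ls && mult <= 10000000u)   /* 10^7 * 100 = 10^9 still fits */
        mult *= 100;
    if (mult > 10 && mult / 10 > ls)          /* overshot by one digit? */
        mult /= 10;
    return ms * mult + ls;
}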

Resources