CUDA SIMD instruction for per-byte multiplication with unsigned saturation - C

CUDA has a nice set of SIMD instructions for integers that allow efficient SIMD computations. Among those, there are some that compute addition and subtraction per byte or per half-word (like __vadd2 and __vadd4); however, I couldn't find a similar function that computes per-byte multiplication for a 32-bit register. I would appreciate it if someone could help me find a proper solution.

however, I couldn't find a similar function that computes per-byte multiplication for a 32bit register.
There isn't one that returns the 4 individual products.
The closest is the __dp4a() intrinsic which returns the sum of the 4 products, in a 32-bit integer.
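For reference, a minimal sketch of how __dp4a might be used (it requires sm_61 or newer; the kernel and variable names here are just placeholders):
__global__ void dp4a_demo(const unsigned *a, const unsigned *b, unsigned *out, unsigned n){
    unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < n)
        // each 32-bit word packs four unsigned bytes; __dp4a returns the sum of
        // the four byte-wise products, added to the third operand (0 here)
        out[idx] = __dp4a(a[idx], b[idx], 0u);
}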
You could write an 8-bit packed unsigned multiply with saturation like this:
$ cat t2048.cu
#include <cstdio>
#include <cstdint>
__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
const unsigned sv = 255;
uchar4 result;
unsigned t;
t = a.x*b.x;
if (t > sv) t = sv;
result.x = t;
t = a.y*b.y;
if (t > sv) t = sv;
result.y = t;
t = a.z*b.z;
if (t > sv) t = sv;
result.z = t;
t = a.w*b.w;
if (t > sv) t = sv;
result.w = t;
return result;
}
__global__ void k(uchar4 a, uchar4 b, uchar4 *c){
*c = u8mulsat(a, b);
}
int main(){
uchar4 a,b,c, *d_c;
cudaMalloc(&d_c, sizeof(uchar4));
a.x = 1;
a.y = 2;
a.z = 4;
a.w = 8;
b.x = 64;
b.y = 64;
b.z = 64;
b.w = 1;
k<<<1,1>>>(a, b, d_c);
cudaMemcpy(&c, d_c, sizeof(uchar4), cudaMemcpyDeviceToHost);
printf("c.x = %u\n", (unsigned)c.x);
printf("c.y = %u\n", (unsigned)c.y);
printf("c.z = %u\n", (unsigned)c.z);
printf("c.w = %u\n", (unsigned)c.w);
}
$ nvcc -o t2048 t2048.cu
$ compute-sanitizer ./t2048
========= COMPUTE-SANITIZER
c.x = 64
c.y = 128
c.z = 255
c.w = 8
========= ERROR SUMMARY: 0 errors
$ cuobjdump -sass ./t2048
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Function : _Z1k6uchar4S_PS_
.headerflags #"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400e22007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ LDC.U8 R0, c[0x0][0x140] ; /* 0xef9000001407ff00 */
/*0018*/ LDC.U8 R2, c[0x0][0x144] ; /* 0xef9000001447ff02 */
/* 0x001d4400e6200731 */
/*0028*/ LDC.U8 R3, c[0x0][0x141] ; /* 0xef9000001417ff03 */
/*0030*/ LDC.U8 R4, c[0x0][0x145] ; /* 0xef9000001457ff04 */
/*0038*/ LDC.U8 R5, c[0x0][0x142] ; /* 0xef9000001427ff05 */
/* 0x001dfc00ee200751 */
/*0048*/ LDC.U8 R6, c[0x0][0x146] ; /* 0xef9000001467ff06 */
/*0050*/ LDC.U8 R7, c[0x0][0x143] ; /* 0xef9000001437ff07 */
/*0058*/ LDC.U8 R8, c[0x0][0x147] ; /* 0xef9000001477ff08 */
/* 0x009fd002fe200fe1 */
/*0068*/ XMAD R0, R2, R0, RZ ; /* 0x5b007f8000070200 */
/*0070*/ XMAD R2, R4, R3, RZ ; /* 0x5b007f8000370402 */
/*0078*/ XMAD R3, R6, R5, RZ ; /* 0x5b007f8000570603 */
/* 0x001fc408fe2007f1 */
/*0088*/ IMNMX.U32 R0, R0, 0xff, PT ; /* 0x382003800ff70000 */
/*0090*/ XMAD R4, R8, R7, RZ ; /* 0x5b007f8000770804 */
/*0098*/ IMNMX.U32 R2, R2, 0xff, PT ; /* 0x382003800ff70202 */
/* 0x001fc400fe2007e4 */
/*00a8*/ IMNMX.U32 R3, R3, 0xff, PT ; /* 0x382003800ff70303 */
/*00b0*/ IMNMX.U32 R4, R4, 0xff, PT ; /* 0x382003800ff70404 */
/*00b8*/ BFI R0, R2, 0x808, R0 ; /* 0x36f0000080870200 */
/* 0x001fd400fe2007f5 */
/*00c8*/ MOV R2, c[0x0][0x148] ; /* 0x4c98078005270002 */
/*00d0*/ BFI R5, R3, 0x810, R0 ; /* 0x36f0000081070305 */
/*00d8*/ MOV R3, c[0x0][0x14c] ; /* 0x4c98078005370003 */
/* 0x001ffc00fe2007e2 */
/*00e8*/ BFI R4, R4, 0x818, R5 ; /* 0x36f0028081870404 */
/*00f0*/ STG.E [R2], R4 ; /* 0xeedc200000070204 */
/*00f8*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0108*/ BRA 0x100 ; /* 0xe2400fffff07000f */
/*0110*/ NOP; /* 0x50b0000000070f00 */
/*0118*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*0128*/ NOP; /* 0x50b0000000070f00 */
/*0130*/ NOP; /* 0x50b0000000070f00 */
/*0138*/ NOP; /* 0x50b0000000070f00 */
..........
Fatbin ptx code:
================
arch = sm_52
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$
The SASS code appears to be about as I would expect, roughly the same length as the C++ code, ignoring the LDC and STG instructions.
FWIW, on Tesla V100, CUDA 11.4, njuffa's implementation and mine are pretty close in terms of register usage (njuffa: 16, mine: 17) and performance (njuffa's is about 1% faster):
$ cat t2048.cu
#include <iostream>
#include <cstdint>
__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
unsigned int plo, phi, res;
// compute products
plo = ((a & 0x000000ff) * (b & 0x000000ff) +
(a & 0x0000ff00) * (b & 0x0000ff00));
phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000) +
__umulhi (a & 0xff000000, b & 0xff000000));
// clamp products to 255
plo |= __vcmpne2 (plo & 0xff00ff00, 0x00000000);
phi |= __vcmpne2 (phi & 0xff00ff00, 0x00000000);
// extract least significant eight bits of each product
res = __byte_perm (plo, phi, 0x6420);
return res;
}
__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
const unsigned sv = 255;
uchar4 result;
unsigned t;
t = a.x*b.x;
if (t > sv) t = sv;
result.x = t;
t = a.y*b.y;
if (t > sv) t = sv;
result.y = t;
t = a.z*b.z;
if (t > sv) t = sv;
result.z = t;
t = a.w*b.w;
if (t > sv) t = sv;
result.w = t;
return result;
}
__global__ void k(const uchar4 * __restrict__ a, const uchar4 * __restrict__ b, uchar4 * __restrict__ c, unsigned N){
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < N)
c[idx] = u8mulsat(a[idx], b[idx]);
}
__global__ void k1(const unsigned * __restrict__ a, const unsigned * __restrict__ b, unsigned * __restrict__ c, unsigned N){
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < N)
c[idx] = vmulus4(a[idx], b[idx]);
}
int main(){
unsigned N = 256U*80U*8U*400U;
uchar4 *d_a,*d_b, *d_c;
cudaMalloc(&d_c, sizeof(uchar4)*N);
cudaMalloc(&d_a, sizeof(uchar4)*N);
cudaMalloc(&d_b, sizeof(uchar4)*N);
for (int i = 0; i < 100; i++) {
k<<<N/256,256>>>(d_a, d_b, d_c, N);
k1<<<N/256,256>>>((unsigned *)d_a, (unsigned *)d_b, (unsigned *)d_c, N);}
cudaDeviceSynchronize();
}
$ nvcc -o t2048 t2048.cu -arch=sm_70 -Xptxas -v
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z2k1PKjS0_Pjj' for 'sm_70'
ptxas info : Function properties for _Z2k1PKjS0_Pjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 16 registers, 380 bytes cmem[0]
ptxas info : Compiling entry function '_Z1kPK6uchar4S1_PS_j' for 'sm_70'
ptxas info : Function properties for _Z1kPK6uchar4S1_PS_j
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 380 bytes cmem[0]
$ nvprof ./t2048
==2696== NVPROF is profiling process 2696, command: ./t2048
==2696== Profiling application: ./t2048
==2696== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 50.21% 100.24ms 100 1.0024ms 998.26us 1.0084ms k(uchar4 const *, uchar4 const *, uchar4*, unsigned int)
49.79% 99.412ms 100 994.12us 990.33us 1.0015ms k1(unsigned int const *, unsigned int const *, unsigned int*, unsigned int)
API calls: 57.39% 279.76ms 3 93.254ms 557.75us 278.64ms cudaMalloc
40.69% 198.31ms 1 198.31ms 198.31ms 198.31ms cudaDeviceSynchronize
1.03% 5.0147ms 4 1.2537ms 589.80us 3.2328ms cuDeviceTotalMem
0.51% 2.4799ms 404 6.1380us 333ns 272.34us cuDeviceGetAttribute
0.30% 1.4715ms 200 7.3570us 6.5220us 68.684us cudaLaunchKernel
0.07% 354.69us 4 88.672us 61.927us 166.60us cuDeviceGetName
0.00% 20.956us 4 5.2390us 3.1200us 7.8000us cuDeviceGetPCIBusId
0.00% 10.445us 8 1.3050us 522ns 4.9100us cuDeviceGet
0.00% 3.7970us 4 949ns 780ns 1.2230us cuDeviceGetUuid
0.00% 3.2030us 3 1.0670us 751ns 1.5050us cuDeviceGetCount
$
Later:
Here is a slightly faster routine (by a few percent, on sm_70) compared to my previous one:
__device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
uchar4 result;
const half sv = 255;
const short svi = 255;
__half2 ah2, bh2, rh2;
ah2 = __floats2half2_rn(a.x, a.y);
bh2 = __floats2half2_rn(b.x, b.y);
rh2 = __hmul2(ah2, bh2);
result.x = (rh2.x > sv) ? (svi):((short)rh2.x);
result.y = (rh2.y > sv) ? (svi):((short)rh2.y);
ah2 = __floats2half2_rn(a.z, a.w);
bh2 = __floats2half2_rn(b.z, b.w);
rh2 = __hmul2(ah2, bh2);
result.z = (rh2.x > sv) ? (svi):((short)rh2.x);
result.w = (rh2.y > sv) ? (svi):((short)rh2.y);
return result;
}
It has the disadvantage that it uses CUDA half-precision intrinsics, so it is "less portable" than the previous, and likewise cannot be decorated with __host__.

There is no existing intrinsic __vmulus8() in CUDA. However, it can be emulated using existing intrinsics. Basically, we compute the four 16-bit products of the four 8-bit quantities, using two 32-bit variables to hold them. Then we clamp each product to 255 and extract the least significant byte of each product into the final result with the help of the permute operation. The code generated by CUDA 11 for compute capabilities >= 7.0 looks reasonable. Whether the performance is sufficient will depend on the use case. If this operation occurs in the middle of a processing pipeline computing with packed bytes, that should be the case.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* byte-wise multiply with unsigned saturation */
__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
unsigned int plo, phi, res;
// compute products
plo = ((a & 0x000000ff) * (b & 0x000000ff) +
(a & 0x0000ff00) * (b & 0x0000ff00));
phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000) +
__umulhi (a & 0xff000000, b & 0xff000000));
// clamp products to 255
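// (note: __vcmpne2 yields 0xffff in each 16-bit halfword that is nonzero, so
//  whenever a product's high byte is set (product > 255), OR-ing the mask back
//  in forces that product's low byte to 0xff)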
plo |= __vcmpne2 (plo & 0xff00ff00, 0x00000000);
phi |= __vcmpne2 (phi & 0xff00ff00, 0x00000000);
// extract least significant eight bits of each product
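// (note: __byte_perm views plo as bytes 0-3 and phi as bytes 4-7 of one value;
//  selector 0x6420 picks bytes 0 and 2 of plo and bytes 4 and 6 of the combined
//  value, which are the low bytes of the four 16-bit products)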
res = __byte_perm (plo, phi, 0x6420);
return res;
}
__global__ void kernel (unsigned int a, unsigned int b, unsigned int *res)
{
*res = vmulus4 (a, b);
}
unsigned int vmulus4_ref (unsigned int a, unsigned int b)
{
unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
unsigned int p0, p1, p2, p3;
a0 = (a >> 0) & 0xff;
a1 = (a >> 8) & 0xff;
a2 = (a >> 16) & 0xff;
a3 = (a >> 24) & 0xff;
b0 = (b >> 0) & 0xff;
b1 = (b >> 8) & 0xff;
b2 = (b >> 16) & 0xff;
b3 = (b >> 24) & 0xff;
p0 = (unsigned int)a0 * (unsigned int)b0;
p1 = (unsigned int)a1 * (unsigned int)b1;
p2 = (unsigned int)a2 * (unsigned int)b2;
p3 = (unsigned int)a3 * (unsigned int)b3;
if (p0 > 255) p0 = 255;
if (p1 > 255) p1 = 255;
if (p2 > 255) p2 = 255;
if (p3 > 255) p3 = 255;
return (p0 << 0) + (p1 << 8) + (p2 << 16) + (p3 << 24);
}
// George Marsaglia's KISS PRNG, period 2**123. Newsgroup sci.math, 21 Jan 1999
// Bug fix: Greg Rose, "KISS: A Bit Too Simple" http://eprint.iacr.org/2011/007
static uint32_t kiss_z=362436069, kiss_w=521288629;
static uint32_t kiss_jsr=123456789, kiss_jcong=380116160;
#define znew (kiss_z=36969*(kiss_z&65535)+(kiss_z>>16))
#define wnew (kiss_w=18000*(kiss_w&65535)+(kiss_w>>16))
#define MWC ((znew<<16)+wnew )
#define SHR3 (kiss_jsr^=(kiss_jsr<<13),kiss_jsr^=(kiss_jsr>>17), \
kiss_jsr^=(kiss_jsr<<5))
#define CONG (kiss_jcong=69069*kiss_jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)
int main (void)
{
unsigned int *resD = 0;
unsigned int a, b, res, ref;
cudaMalloc ((void**)&resD, sizeof resD[0]);
for (int i = 0; i < 1000000; i++) {
a = KISS;
b = KISS;
kernel<<<1,1>>>(a, b, resD);
cudaMemcpy (&res, resD, sizeof res, cudaMemcpyDeviceToHost);
ref = vmulus4_ref (a, b);
if (res != ref) {
printf ("error: a=%08x b=%08x res=%08x ref=%08x\n", a, b, res, ref);
return EXIT_FAILURE;
}
}
cudaFree (resD);
return EXIT_SUCCESS;
}

Related

Unsigned 64x64->128 bit integer multiply on 32-bit platforms

In the context of exploratory activity I have started to take a look at integer & fixed-point arithmetic building blocks for 32-bit platforms. My primary target would be ARM32 (specifically armv7), with a side glance to RISC-V32 which I expect to grow in importance in the embedded space. The first sample building block I chose to examine is unsigned 64x64->128 bit integer multiplication. Other questions on this site about this building block do not provide detailed coverage of 32-bit platforms.
Over the past thirty years, I have implemented this and other arithmetic building blocks multiple times, but always in assembly language, for various architectures. However, at this point in time my hope and desire is that these could be programmed in straight ISO-C, without the use of intrinsics. Ideally a single version of the C code would generate good machine code across architectures. I know that the approach of manipulating HLL code to control machine code is generally brittle, but hope that processor architectures and toolchains have matured enough to make this feasible.
Some approaches used in assembly language implementations are not well suited for porting to C. In the exemplary code below I have selected six variants that seemed amenable to an HLL implementation. Besides the generation of partial products, which is common to all variants, the two basic approaches are: (1) Sum the partial products using 64-bit arithmetic, letting the compiler take care of the carry propagation between 32-bit halves. In this case there are multiple choices in which order to sum the partial products. (2) Use 32-bit arithmetic for the summing, simulating the carry flag directly. In this case we have a choice of generating the carry after an addition (a = a + b; carry = a < b;) or before the addition (carry = ~a < b; a = a + b;). Variants 1 through 3 below fall into the former category, variants 5 and 6 fall into the latter.
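As a small, standalone illustration of the two carry idioms just described (this sketch is separate from the six variants in the code below):
#include <stdint.h>
/* sum = a + b, with the unsigned carry-out computed both ways */
static void carry_idioms (uint32_t a, uint32_t b,
                          uint32_t *sum, uint32_t *cy_before, uint32_t *cy_after)
{
    *cy_before = ~a < b;       /* before the add: a + b > UINT32_MAX  <=>  b > UINT32_MAX - a */
    *sum = a + b;
    *cy_after  = *sum < b;     /* after the add: a wrapped sum is smaller than b */
}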
At Compiler Explorer, I focused on the toolchains gcc 12.2 and clang 15.0 for the platforms of interest. I compiled with -O3. The general finding is that on average clang generates more efficient code than gcc, and that the differences between the variants (number of instructions and registers used) are more pronounced with clang. While this may be understandable in the case of RISC-V as the newer architecture, it surprised me in the case of armv7 which has been around for well over a dozen years.
Three cases in particular struck me as noteworthy. While I have worked with compiler engineers before and have a reasonable understanding of basic code transformations, phase ordering issues, etc., the only technique I am aware of that might apply to this code is idiom recognition, and I do not see how this could explain the observations by itself. The first case is variant 3, where clang 15.0 produces extremely tight code comprising just 10 instructions that I don't think can be improved upon:
umul64wide:
push {r4, lr}
umull r12, r4, r2, r0
umull lr, r0, r3, r0
umaal lr, r4, r2, r1
umaal r0, r4, r3, r1
ldr r1, [sp, #8]
strd r0, r4, [r1]
mov r0, r12
mov r1, lr
pop {r4, pc}
By contrast, gcc generates twice the number of instructions and requires twice the number of registers. I hypothesize that it does not recognize how to use the multiply-accumulate instruction umaal here, but is that the full story? The reverse situation, but not quite as dramatic, occurs in variant 6, where gcc 12.2 produces this sequence of 18 instructions, with low register usage:
umul64wide:
mov ip, r0
push {r4, r5, lr}
mov lr, r1
umull r0, r1, r0, r2
ldr r4, [sp, #12]
umull r5, ip, r3, ip
adds r1, r1, r5
umull r2, r5, lr, r2
adc ip, ip, #0
umull lr, r3, lr, r3
adds r1, r1, r2
adc r2, ip, #0
adds r2, r2, r5
adc r3, r3, #0
adds r2, r2, lr
adc r3, r3, #0
strd r2, r3, [r4]
pop {r4, r5, pc}
The generated code nicely turns the simulated carry propagation into real carry propagation. clang 15.0 uses nine instructions and five registers more, and I cannot really make out what it is trying to do without spending much more time on analysis. The third observation is with regard to the differences seen in the machine code produced for variant 5 vs. variant 6, in particular with clang. These use the same basic algorithm, with one variant computing the simulated carry before the additions, the other after it. I did find in the end that one variant, namely variant 4, seems to be efficient across both tool chains and both architectures. However, before I proceed to other building blocks and face a similar struggle, I would like to inquire:
(1) Are there coding idioms or algorithms I have not considered in the code below that might lead to superior results? (2) Are there specific optimization switches, e.g. a hypothetical -ffrobnicate (see here), that are not included in -O3 that would help the compilers generate efficient code more consistently for these kinds of bit-manipulation scenarios? Explanations as to what compiler mechanisms are likely responsible for the cases of significant differences in code generation observed, and how one might influence or work around them, could also be helpful.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define VARIANT (3)
#define USE_X64_ASM_REF (0)
/* Multiply two unsigned 64-bit operands a and b. Returns least significant 64
bits of product as return value, most significant 64 bits of product via h.
*/
uint64_t umul64wide (uint64_t a, uint64_t b, uint64_t *h)
{
uint32_t a_lo = (uint32_t)a;
uint32_t a_hi = a >> 32;
uint32_t b_lo = (uint32_t)b;
uint32_t b_hi = b >> 32;
uint64_t p0 = (uint64_t)a_lo * b_lo;
uint64_t p1 = (uint64_t)a_lo * b_hi;
uint64_t p2 = (uint64_t)a_hi * b_lo;
uint64_t p3 = (uint64_t)a_hi * b_hi;
#if VARIANT == 1
uint32_t c = (uint32_t)(((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32);
*h = p3 + (p1 >> 32) + (p2 >> 32) + c;
return p0 + ((p1 + p2) << 32);
#elif VARIANT == 2
uint64_t s = (p0 >> 32) + p1;
uint64_t t = (uint32_t)s + p2;
*h = (s >> 32) + (t >> 32) + p3;
return (uint32_t)p0 + (t << 32);
#elif VARIANT == 3
*h = (p1 >> 32) + (((p0 >> 32) + (uint32_t)p1 + p2) >> 32) + p3;
return p0 + ((p1 + p2) << 32);
#elif VARIANT == 4
uint64_t t = (p0 >> 32) + p1 + (uint32_t)p2;
*h = (p2 >> 32) + (t >> 32) + p3;
return (uint32_t)p0 + (t << 32);
#elif VARIANT == 5
uint32_t r0, r1, r2, r3, r4, r5, r6;
r0 = (uint32_t)p0;
r1 = p0 >> 32;
r5 = (uint32_t)p1;
r2 = p1 >> 32;
r1 = r1 + r5;
r6 = r1 < r5;
r2 = r2 + r6;
r6 = (uint32_t)p2;
r5 = p2 >> 32;
r1 = r1 + r6;
r6 = r1 < r6;
r2 = r2 + r6;
r4 = (uint32_t)p3;
r3 = p3 >> 32;
r2 = r2 + r5;
r6 = r2 < r5;
r3 = r3 + r6;
r2 = r2 + r4;
r6 = r2 < r4;
r3 = r3 + r6;
*h = ((uint64_t)r3 << 32) | r2;
return ((uint64_t)r1 << 32) | r0;
#elif VARIANT == 6
uint32_t r0, r1, r2, r3, r4, r5, r6;
r0 = (uint32_t)p0;
r1 = p0 >> 32;
r5 = (uint32_t)p1;
r2 = p1 >> 32;
r4 = ~r1;
r4 = r4 < r5;
r1 = r1 + r5;
r2 = r2 + r4;
r6 = (uint32_t)p2;
r5 = p2 >> 32;
r4 = ~r1;
r4 = r4 < r6;
r1 = r1 + r6;
r2 = r2 + r4;
r4 = (uint32_t)p3;
r3 = p3 >> 32;
r6 = ~r2;
r6 = r6 < r5;
r2 = r2 + r5;
r3 = r3 + r6;
r6 = ~r2;
r6 = r6 < r4;
r2 = r2 + r4;
r3 = r3 + r6;
*h = ((uint64_t)r3 << 32) | r2;
return ((uint64_t)r1 << 32) | r0;
#else
#error unsupported VARIANT
#endif
}
#if defined(__SIZEOF_INT128__)
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
unsigned __int128 prod = ((unsigned __int128)a) * b;
*h = (uint64_t)(prod >> 64);
return (uint64_t)prod;
}
#elif defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
*h = __umulh (a, b);
return a * b;
}
#elif USE_X64_ASM_REF
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
uint64_t res_l, res_h;
__asm__ (
"movq %2, %%rax;\n\t" // rax = a
"mulq %3;\n\t" // rdx:rax = a * b
"movq %%rdx, %0;\n\t" // res_h = rdx
"movq %%rax, %1;\n\t" // res_l = rax
: "=rm" (res_h), "=rm"(res_l)
: "rm"(a), "rm"(b)
: "%rax", "%rdx");
*h = res_h;
return res_l;
}
#else // generic (and slow) reference implementation
#define ADDCcc(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t0+t1)
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
uint32_t cy, t0, t1;
uint32_t a_lo = (uint32_t)a;
uint32_t a_hi = a >> 32;
uint32_t b_lo = (uint32_t)b;
uint32_t b_hi = b >> 32;
uint64_t p0 = (uint64_t)a_lo * b_lo;
uint64_t p1 = (uint64_t)a_lo * b_hi;
uint64_t p2 = (uint64_t)a_hi * b_lo;
uint64_t p3 = (uint64_t)a_hi * b_hi;
uint32_t p0_lo = (uint32_t)p0;
uint32_t p0_hi = p0 >> 32;
uint32_t p1_lo = (uint32_t)p1;
uint32_t p1_hi = p1 >> 32;
uint32_t p2_lo = (uint32_t)p2;
uint32_t p2_hi = p2 >> 32;
uint32_t p3_lo = (uint32_t)p3;
uint32_t p3_hi = p3 >> 32;
uint32_t r0 = p0_lo;
uint32_t r1 = ADDcc (p0_hi, p1_lo, cy, t0, t1);
uint32_t r2 = ADDCcc (p1_hi, p2_hi, cy, t0, t1);
uint32_t r3 = ADDC (p3_hi, 0, cy, t0, t1);
r1 = ADDcc (r1, p2_lo, cy, t0, t1);
r2 = ADDCcc (r2, p3_lo, cy, t0, t1);
r3 = ADDC (r3, 0, cy, t0, t1);
*h = ((uint64_t)r3 << 32) + r2;
return ((uint64_t)r1 << 32) + r0;
}
#endif
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...#gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
int main (void)
{
uint64_t a, b, res_hi, res_lo, ref_hi, ref_lo, count = 0;
printf ("Smoke test of umul64wide variant %d\n", VARIANT);
do {
a = KISS64;
b = KISS64;
ref_lo = umul64wide_ref (a, b, &ref_hi);
res_lo = umul64wide (a, b, &res_hi);
if ((res_lo ^ ref_lo) | (res_hi ^ ref_hi)) {
printf ("!!!! error: a=%016llx b=%016llx res=%016llx_%016llx ref=%016llx_%016llx\n",
a, b, res_hi, res_lo, ref_hi, ref_lo);
return EXIT_FAILURE;
}
if (!(count & 0xfffffff)) printf ("\r%llu", count);
count++;
} while (count);
return EXIT_SUCCESS;
}
I avoided the use of the ((x += y) < y) overflow test, since not every ISA handles condition flags efficiently, and using the results of flag register(s) may inhibit re-ordering; x86[-64] is the obvious example, though later BMI(2) instructions may help mitigate this. I also added a 32 x 32 -> 64 bit C implementation for comparison - but I would expect any modern ISA to at least supply a 'high word' multiply like ARM's umulh.
/******************************************************************************/
/* stackoverflow.com/questions/74713642 */
#include <inttypes.h>
#include <stdio.h>
/* umul_32_32 : 32 x 32 => 64 */
/* force inline (non-portable), or implement it as macro, e.g.,
* #define umul_32_32(rh, rl, x, y) do { ... } while (0) */
#if (1)
static inline __attribute__((always_inline))
uint64_t umul_32_32 (uint32_t x, uint32_t y)
{
return (((uint64_t) x) * y);
}
#else
/* if no widening multiply is available, the compiler probably
* generates something at least as efficient as the following -
* or (worst case) it calls a builtin function. */
static inline __attribute__((always_inline))
uint64_t umul_32_32 (uint32_t x, uint32_t y)
{
uint32_t m0, m1, m2, m3; /* (partial products) */
uint32_t x0, x1, y0, y1;
x0 = x & UINT16_MAX, x1 = x >> (16);
y0 = y & UINT16_MAX, y1 = y >> (16);
m0 = x0 * y0, m1 = x1 * y0;
m2 = x0 * y1, m3 = x1 * y1;
m1 += m0 >> (16);
m3 += m2 >> (16);
m1 += m2 & UINT16_MAX;
uint32_t rh = m3 + (m1 >> (16));
uint32_t rl = m1 << (16) | (m0 & UINT16_MAX);
return (((uint64_t) rh) << 32 | rl);
/* 32 x 32 => 64 : no branching or carry overflow tests. */
}
#endif
/* ensure the function is called to inspect code gen / assembly,
* otherwise gcc and clang evaluate this at compile time. */
__attribute__((noinline)) void umul_64_64 (
uint64_t *rh, uint64_t *rl, uint64_t x, uint64_t y)
{
uint64_t m0, m1, m2, m3; /* (partial products) */
uint32_t x0, x1, y0, y1;
x0 = (uint32_t) (x), x1 = (uint32_t) (x >> (32));
y0 = (uint32_t) (y), y1 = (uint32_t) (y >> (32));
m0 = umul_32_32(x0, y0), m1 = umul_32_32(x1, y0);
m2 = umul_32_32(x0, y1), m3 = umul_32_32(x1, y1);
m1 += m0 >> (32);
m3 += m2 >> (32);
m1 += m2 & UINT32_MAX;
*rh = m3 + (m1 >> (32));
*rl = m1 << (32) | (m0 & UINT32_MAX);
/* 64 x 64 => 128 : no branching or carry overflow tests. */
}
#if (0)
int main (void)
{
uint64_t x = UINT64_MAX, y = UINT64_MAX, rh, rl;
umul_64_64(& rh, & rl, x, y);
fprintf(stdout, "0x%016" PRIX64 ":0x%016" PRIX64 "\n", rh, rl);
return (0);
}
#endif
/******************************************************************************/
For ARMv7, I'm getting more or less the same results as your 'variant 3' code, which isn't surprising, since it's the same essential idea. I tried different flags on gcc-12 and gcc-trunk, but couldn't improve it.
I'd hazard a guess that with Apple's investment in AArch64 silicon, there's simply been more aggressive optimization and funding directed toward clang that benefits 32-bit ARMv7 as well. But that's pure speculation. It's a pretty glaring disparity for such a major platform though.

Interpolate 3 x (4 bits) values at once stored in a 2-byte variable

I have 3 numbers stored in a 16-bit variable. These numbers have a maximum of
4 bits each and they are each stored in a 5-bit field in the following format in the 16-bit variable: 0x3DEF (so there is one free (zero) bit after each value and the most significant bit is always zero).
I want to interpolate each separate number from the 16-bit variable with the respective numbers from another 16-bit variable like so: (numberA * interpolation + numberB) >> 8, where the interpolation range is from 0 to 255.
Is there a faster way than extracting each value from the 16-bit variables, doing the interpolation, and then grouping them back together?
I want the results to be stored back in the 16-bit variable using the same format.
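For example, interpolating a 4-bit component value of 0xF toward 0xC with interpolation = 37 would give (15 * 37 + 12) >> 8 = 567 >> 8 = 2.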
I'm pretty sure that there won't be a faster way to do the manipulation than split, calculate (interpolate) and combine. You can't simply multiply the 16-bit (or 32-bit) values, add and shift; you'd need more space around the values. I came up with this code:
#include <stdio.h>
#include <stdint.h>
typedef struct B3x4_Data
{
char data[3];
} B3x4_Data;
typedef uint16_t B3x4_Value;
static inline void b3x4_map_value_to_data(B3x4_Value value, B3x4_Data *data)
{
data->data[0] = (value >> 0) & 0x0F;
data->data[1] = (value >> 5) & 0x0F;
data->data[2] = (value >> 10) & 0x0F;
}
static inline B3x4_Value b3x4_map_data_to_value(const B3x4_Data *data)
{
return ((data->data[0] & 0x0F) << 0) |
((data->data[1] & 0x0F) << 5) |
((data->data[2] & 0x0F) << 10);
}
static inline B3x4_Value b3x4_interpolate(B3x4_Value valueA, uint8_t interpolate, B3x4_Value valueB)
{
B3x4_Data dataA;
B3x4_Data dataB;
B3x4_Data result;
b3x4_map_value_to_data(valueA, &dataA);
b3x4_map_value_to_data(valueB, &dataB);
for (int i = 0; i < 3; i++)
result.data[i] = ((dataA.data[i] * interpolate + dataB.data[i]) >> 8) & 0x0F;
return b3x4_map_data_to_value(&result);
}
static inline B3x4_Value B3x4_Init(uint8_t v1, uint8_t v2, uint8_t v3)
{
return ((((v1) & 0x0F) << 10) |
(((v2) & 0x0F) << 5) |
(((v3) & 0x0F) << 0));
}
static void b3x4_print_value(B3x4_Value v)
{
B3x4_Data d;
b3x4_map_value_to_data(v, &d);
printf("[0x%.4X = (0x%X,0x%X,0x%X)]", v, d.data[2], d.data[1], d.data[0]);
}
static void print_calculation(B3x4_Value v1, uint8_t i1, B3x4_Value v2, B3x4_Value r1)
{
b3x4_print_value(v1);
printf(" * %d + ", i1);
b3x4_print_value(v2);
printf(" = ");
b3x4_print_value(r1);
putchar('\n');
}
int main(void)
{
B3x4_Value v1 = B3x4_Init(0xF, 0xF, 0xF);
B3x4_Value v2 = B3x4_Init(0xC, 0xB, 0xA);
B3x4_Value v3 = B3x4_Init(0x7, 0x9, 0xC);
B3x4_Value v4 = B3x4_Init(0xA, 0x6, 0x2);
uint8_t i1 = 37;
uint8_t i2 = 96;
B3x4_Value r1 = b3x4_interpolate(v1, i1, v2);
B3x4_Value r2 = b3x4_interpolate(v3, i2, v4);
print_calculation(v1, i1, v2, r1);
print_calculation(v3, i2, v4, r2);
return 0;
}
Any self-respecting optimizer will ignore the 'shift by zero' operations, but showing them emphasizes the symmetry in the work.
That produces the output:
[0x3DEF = (0xF,0xF,0xF)] * 37 + [0x316A = (0xC,0xB,0xA)] = [0x0842 = (0x2,0x2,0x2)]
[0x1D2C = (0x7,0x9,0xC)] * 96 + [0x28C2 = (0xA,0x6,0x2)] = [0x0864 = (0x2,0x3,0x4)]

How to multiply 2 uint8 modulo a big number without using integer type in C language [closed]

If A and B are of type uint8_t and I want the result C = A x B % N where N is 2^16, how do I do this in C if I can't use wider integer types (so I can't declare N as an integer, only uint8_t)?
N.B.: A, B and C are stored in uint8_t arrays, so they are "expressed" as uint8_t but their values can be bigger.
In general there is no easy way to do this.
Firstly you need to implement the multiply with carry between A and B for each uint8_t block. See the answer here.
Division by 2^16 really means "disregard" the last 16 bits, i.e. "don't use" the last two uint8_t elements (since you work with arrays of uint8_t). As you have the modulus operator, this means just the opposite: you only need to keep the last two uint8_t elements.
Take the lowest two uint8 of A (say a0 and a1) and B (say b0 and b1):
split each uint8 in high and low part
a0h = a0 >> 4; ## the same as a0h = a0/16;
a0l = a0 % 16; ## the same as a0l = a0 & 0x0f;
a1h = a1 >> 4;
a1l = a1 % 16;
b0h = b0 >> 4;
b0l = b0 % 16;
b1h = b1 >> 4;
b1l = b1 % 16;
Multiply the lower parts first (x is a buffer var)
x = a0l * b0l;
The first part of the result is the last four bits of x, let's call it s0l
s0l = x % 16;
The top four bits of x are the carry.
c = x>>4;
Multiply the higher parts of the first uint8 and add the carry.
x = (a0h * b0h) + c;
The first part of the result is the last four bits of x, let's call it s0h. And we need to get carry again.
s0h = x % 16;
c = x>>4;
We can now combine the s0:
s0 = (s0h << 4) + s0l;
Do exactly the same for the s1 (but don't forget to add the carry!):
x = (a1l * b1l) + c;
s1l = x % 16;
c = x>>4;
x = (a1h * b1h) + c;
s1h = x % 16;
c = x>>4;
s1 = (s1h << 4) + s1l;
Your result at this point is c, s1 and s0 (you need the carry for the next multiplications, e.g. s2, s3, s4). As your formula says % (2^16), you already have your result: s1 and s0. If you have to divide by something else, you should do something similar to the code above, but for division. In that case be careful to catch division by zero.
You can put A, B, C and S in arrays and loop through the indexes to make the code cleaner.
Here's my effort. I took the liberty of using larger integers and pointers for looping through the arrays. The numbers are represented by arrays of uint8_t in big-endian order. All the intermediate results are kept in uint8_t variables. The code could be made more efficient if intermediate results could be stored in wider integer variables!
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
static void add_c(uint8_t *r, size_t len_r, uint8_t x)
{
uint8_t o;
while (len_r--) {
o = r[len_r];
r[len_r] += x;
if (o <= r[len_r])
break;
x = 1;
}
}
void multiply(uint8_t *res, size_t len_res,
const uint8_t *a, size_t len_a, const uint8_t *b, size_t len_b)
{
size_t ia, ib, ir;
for (ir = 0; ir < len_res; ir++)
res[ir] = 0;
for (ia = 0; ia < len_a && ia < len_res; ia++) {
uint8_t ah, al, t;
t = a[len_a - ia - 1];
ah = t >> 4;
al = t & 0xf;
for (ib = 0; ib < len_b && ia + ib < len_res; ib++) {
uint8_t bh, bl, x, o, c0, c1;
t = b[len_b - ib - 1];
bh = t >> 4;
bl = t & 0xf;
c0 = al * bl;
c1 = ah * bh;
o = c0;
t = al * bh;
x = (t & 0xf) << 4;
c0 += x;
x = (t >> 4);
c1 += x;
if (o > c0)
c1++;
o = c0;
t = ah * bl;
x = (t & 0xf) << 4;
c0 += x;
x = (t >> 4);
c1 += x;
if (o > c0)
c1++;
add_c(res, len_res - ia - ib, c0);
add_c(res, len_res - ia - ib - 1, c1);
}
}
}
int main(void)
{
uint8_t a[2] = { 0xee, 0xdd };
uint8_t b[2] = { 0xcc, 0xbb };
uint8_t r[4];
multiply(r, sizeof(r), a, sizeof(a), b, sizeof(b));
printf("0x%02X%02X * 0x%02X%02X = 0x%02X%02X%02X%02X\n",
a[0], a[1], b[0], b[1], r[0], r[1], r[2], r[3]);
return 0;
}
Output:
0xEEDD * 0xCCBB = 0xBF06976F

Fastest way to multiply two 64-bit ints to 128-bit then >> to 64-bit? [duplicate]

This question already has answers here: Computing high 64 bits of a 64x64 int product in C (5 answers). Closed 6 years ago.
I need to multiply two signed 64-bit integers a and b together, then shift the (128-bit) result to a signed 64-bit integer. What's the fastest way to do that?
My 64-bit integers actually represent fixed-point numbers with fmt fractional bits. fmt is chosen so that a * b >> fmt should not overflow, for instance abs(a) < 64<<fmt and abs(b) < 2<<fmt with fmt==56 would never overflow in 64-bits as the final result would be < 128<<fmt and therefore fit in an int64.
The reason I want to do that is to quickly and precisely evaluate quintic polynomials of the form ((((c5*x + c4)*x + c3)*x + c2)*x + c1)*x + c0 in fixed point format, with every number a signed 64-bit fixed-point number with fmt fractional bits. I'm looking for the most efficient way to achieve that.
As a commenter on the question pointed out, this is most easily accomplished efficiently by machine-dependent code, rather than by portable code. The asker states that the main platform is x86_64, and that has a built-in instruction for performing 64 ✕ 64 → 128 bit multiplication. This is easily accessed using a small piece of inline assembly. Note that details of inline assembly may differ somewhat between compilers; the code below was built with the Intel C/C++ compiler.
#include <stdint.h>
/* compute mul_wide (a, b) >> s, for s in [0,63] */
int64_t mulshift (int64_t a, int64_t b, int s)
{
int64_t res;
__asm__ (
"movq %1, %%rax;\n\t" // rax = a
"movl %3, %%ecx;\n\t" // ecx = s
"imulq %2;\n\t" // rdx:rax = a * b
"shrdq %%cl, %%rdx, %%rax;\n\t" // rax = int64_t (rdx:rax >> s)
"movq %%rax, %0;\n\t" // res = rax
: "=rm" (res)
: "rm"(a), "rm"(b), "rm"(s)
: "%rax", "%rdx", "%ecx");
return res;
}
A portable C99 equivalent to the above code is shown below. I have tested this extensively against the inline assembly version and no mismatches were found.
void umul64wide (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
uint64_t a_lo = (uint64_t)(uint32_t)a;
uint64_t a_hi = a >> 32;
uint64_t b_lo = (uint64_t)(uint32_t)b;
uint64_t b_hi = b >> 32;
uint64_t p0 = a_lo * b_lo;
uint64_t p1 = a_lo * b_hi;
uint64_t p2 = a_hi * b_lo;
uint64_t p3 = a_hi * b_hi;
uint32_t cy = (uint32_t)(((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32);
*lo = p0 + (p1 << 32) + (p2 << 32);
*hi = p3 + (p1 >> 32) + (p2 >> 32) + cy;
}
void mul64wide (int64_t a, int64_t b, int64_t *hi, int64_t *lo)
{
umul64wide ((uint64_t)a, (uint64_t)b, (uint64_t *)hi, (uint64_t *)lo);
if (a < 0LL) *hi -= b;
if (b < 0LL) *hi -= a;
}
/* compute mul_wide (a, b) >> s, for s in [0,63] */
int64_t mulshift (int64_t a, int64_t b, int s)
{
int64_t res;
int64_t hi, lo;
mul64wide (a, b, &hi, &lo);
if (s) {
res = ((uint64_t)hi << (64 - s)) | ((uint64_t)lo >> s);
} else {
res = lo;
}
return res;
}
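As a usage sketch following the fixed-point scheme described in the question (the coefficient array and the fmt value are placeholders), the quintic can then be evaluated with Horner's method, one mulshift per step:
/* evaluate ((((c5*x + c4)*x + c3)*x + c2)*x + c1)*x + c0, where x and the
   coefficients c[0..5] are signed fixed-point numbers with fmt fractional bits */
int64_t poly5_fixed (int64_t x, const int64_t c[6], int fmt)
{
    int64_t r = c[5];
    for (int i = 4; i >= 0; i--)
        r = mulshift (r, x, fmt) + c[i];   /* (r * x) >> fmt keeps fmt fractional bits */
    return r;
}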

Is there a more efficient way of expanding a char to an uint64_t?

I want to inflate an unsigned char to an uint64_t by repeating each bit 8 times. E.g.
char -> uint64_t
0x00 -> 0x00
0x01 -> 0xFF
0x02 -> 0xFF00
0x03 -> 0xFFFF
0xAA -> 0xFF00FF00FF00FF00
I currently have the following implementation, using bit shifts to test if a bit is set, to accomplish this:
#include <stdint.h>
#include <inttypes.h>
#define BIT_SET(var, pos) ((var) & (1 << (pos)))
static uint64_t inflate(unsigned char a)
{
uint64_t MASK = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (BIT_SET(a, i))
result |= (MASK << (8 * i));
}
return result;
}
However, I'm fairly new to C, so this fiddling with individual bits makes me a little wary that there might be a better (i.e. more efficient) way of doing this.
EDIT TO ADD
Ok, so after trying out the table lookup solution, here are the results. However, keep in mind that I didn't test the routine directly, but rather as part of a bigger function (a multiplication of binary matrices, to be precise), so this might have affected how the results turned out. So, on my computer, when multiplying a million 8x8 matrices, and compiled with:
gcc -O2 -Wall -std=c99 foo.c
I got
./a.out original
real 0m0.127s
user 0m0.124s
sys 0m0.000s
./a.out table_lookup
real 0m0.012s
user 0m0.012s
sys 0m0.000s
So at least on my machine (a 64-bit Linux Mint virtual machine, I should mention), the table lookup approach seems to provide a roughly 10-times speed-up, so I will accept that as the answer.
If you're looking for efficiency use a lookup table: a static array of 256 entries, each already holding the required result. You can use your code above to generate it.
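A minimal sketch of that approach (assuming the inflate() routine from the question is available to fill the table; the names here are placeholders):
#include <stdint.h>
static uint64_t inflate_table[256];
/* fill the table once, e.g. at program start, using the bit-by-bit routine */
static void init_inflate_table(void)
{
    for (int i = 0; i < 256; i++)
        inflate_table[i] = inflate((unsigned char)i);
}
/* afterwards, inflating a byte is a single indexed load */
static uint64_t inflate_lookup(unsigned char a)
{
    return inflate_table[a];
}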
On selected architectures (SSE, Neon) there are fast vector operations that can speed up this task or are designed to do this. Without special instructions, the suggested lookup table approach is both the fastest and most portable.
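For example, on x86 with BMI2 (a scalar bit-manipulation extension rather than a vector unit, so this is only an aside), a single pdep spreads the bits into the byte positions; a sketch, assuming a BMI2-capable CPU and compilation with -mbmi2:
#include <stdint.h>
#include <immintrin.h>   /* _pdep_u64 (BMI2) */
static uint64_t inflate_pdep(unsigned char a)
{
    /* deposit bit k of a into the least significant bit of byte k ... */
    uint64_t lsb = _pdep_u64(a, 0x0101010101010101ULL);
    /* ... then smear each lsb across its whole byte */
    return lsb * 0xFF;
}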
If the 2k size is an issue, parallel vector arithmetic operations can be simulated:
static uint64_t inflate_parallel(unsigned char a) {
uint64_t vector = a * 0x0101010101010101ULL;
// replicate the word all over qword
// A5 becomes A5 A5 A5 A5 A5 A5 A5 A5
vector &= 0x8040201008040201; // becomes 80 00 20 00 00 04 00 01 <--
vector += 0x00406070787c7e7f; // becomes 80 40 80 70 78 80 7e 80
// MSB is correct
vector = (vector >> 7) & 0x0101010101010101ULL; // LSB is correct
return vector * 255; // all bits correct
}
EDIT: 2^31 iterations (four-fold unroll to mitigate loop overhead):
time ./parallel time ./original time ./lookup
real 0m2.038s real 0m14.161s real 0m1.436s
user 0m2.030s user 0m14.120s user 0m1.430s
sys 0m0.000s sys 0m0.000s sys 0m0.000s
That's about 7x speedup, while the lookup table gives ~10x speedup
You should profile what your code does, before worrying about optimising it.
On my compiler locally, your code gets entirely inlined, unrolled and turned into 8 constant test + or instructions when the value is unknown, and turned into a constant when the value is known at compile time. I could probably marginally improve it by removing a few branches, but the compiler is doing a reasonable job on its own.
Optimising the loop is then a bit pointless. A table lookup might be more efficient, but would probably prevent the compiler from making optimisations itself.
The desired functionality can be achieved by moving each bit of the source into the lsb of the appropriate target byte (0 → 0, 1 → 8, 2 → 16, ...., 7 → 56), then expanding each lsb to cover the whole byte, which is easily done by multiplying with 0xff (255). Instead of moving bits into place individually using shifts, then combining the results, we can use an integer multiply to shift multiple bits in parallel. To prevent self-overlap, we can move only the least-significant seven source bits in this fashion, but need to move the source msb separately with a shift.
This leads to the following ISO-C99 implementation:
#include <stdint.h>
/* expand each bit in input into one byte in output */
uint64_t fast_inflate (uint8_t a)
{
const uint64_t spread7 = (1ULL << 42) | (1ULL << 35) | (1ULL << 28) | (1ULL << 21) |
(1ULL << 14) | (1ULL << 7) | (1UL << 0);
const uint64_t byte_lsb = (1ULL << 56) | (1ULL << 48) | (1ULL << 40) | (1ULL << 32) |
(1ULL << 24) | (1ULL << 16) | (1ULL << 8) | (1ULL << 0);
uint64_t r;
/* spread bits to lsbs of each byte */
r = (((uint64_t)(a & 0x7f) * spread7) + ((uint64_t)a << 49));
/* extract the lsbs of all bytes */
r = r & byte_lsb;
/* fill each byte with its lsb */
r = r * 0xff;
return r;
}
#define BIT_SET(var, pos) ((var) & (1 << (pos)))
static uint64_t inflate(unsigned char a)
{
uint64_t MASK = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (BIT_SET(a, i))
result |= (MASK << (8 * i));
}
return result;
}
#include <stdio.h>
#include <stdlib.h>
int main (void)
{
uint8_t a = 0;
do {
uint64_t res = fast_inflate (a);
uint64_t ref = inflate (a);
if (res != ref) {
printf ("error # %02x: fast_inflate = %016llx inflate = %016llx\n",
a, res, ref);
return EXIT_FAILURE;
}
a++;
} while (a);
printf ("test passed\n");
return EXIT_SUCCESS;
}
Most x64 compilers will compile fast_inflate() in a straightforward manner. For example, my Intel compiler Version 13.1.3.198, when building with /Ox, generates the 11-instruction sequence below. Note that the final multiply with 0xff is actually implemented as a shift and subtract sequence.
fast_inflate PROC
mov rdx, 040810204081H
movzx r9d, cl
and ecx, 127
mov r8, 0101010101010101H
imul rdx, rcx
shl r9, 49
add r9, rdx
and r9, r8
mov rax, r9
shl rax, 8
sub rax, r9
ret
If you're willing to spend 256 * 8 = 2kB of memory on this (i.e. become less efficient in terms of memory, but more efficient in terms of CPU cycles needed), the most efficient way would be to pre-compute a lookup table:
static uint64_t inflate(unsigned char a) {
static const uint64_t charToUInt64[256] = {
0x0000000000000000, 0x00000000000000FF, 0x000000000000FF00, 0x000000000000FFFF,
// ...
};
return charToUInt64[a];
}
Here is one more method using only simple arithmetic:
uint64_t inflate_chqrlie(uint8_t value) {
uint64_t x = value;
x = (x | (x << 28));
x = (x | (x << 14));
x = (x | (x << 7)) & 0x0101010101010101ULL;
x = (x << 8) - x;
return x;
}
Another very efficient and concise one by phuclv using multiplication and mask:
static uint64_t inflate_phuclv(uint8_t b) {
uint64_t MAGIC = 0x8040201008040201ULL;
uint64_t MASK = 0x8080808080808080ULL;
return ((MAGIC * b) & MASK) >> 7;
}
And another with a small lookup table:
static uint32_t const lut_4_32[16] = {
0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF,
};
static uint64_t inflate_lut32(uint8_t b) {
return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
}
I wrote a benchmarking program to determine the relative performance of the different approaches on my system (x86_64-apple-darwin16.7.0, Apple LLVM version 9.0.0 (clang-900.0.39.2), clang -O3).
The results show that my function inflate_chqrlie is faster than the naive approaches but slower than the other elaborate versions, all of which are beaten hands down by inflate_lut64, which uses a 2KB lookup table, in cache-optimal situations.
The function inflate_lut32, using a much smaller lookup table (64 bytes instead of 2KB) is not as fast as inflate_lut64, but seems a good compromise for 32-bit architectures as it is still much faster than all other alternatives.
64-bit benchmark:
inflate: 0, 848.316ms
inflate_Curd: 0, 845.424ms
inflate_chqrlie: 0, 371.502ms
fast_inflate_njuffa: 0, 288.669ms
inflate_parallel1: 0, 242.827ms
inflate_parallel2: 0, 315.105ms
inflate_parallel3: 0, 363.379ms
inflate_parallel4: 0, 304.051ms
inflate_parallel5: 0, 301.205ms
inflate_phuclv: 0, 109.130ms
inflate_lut32: 0, 197.178ms
inflate_lut64: 0, 25.160ms
32-bit benchmark:
inflate: 0, 1451.464ms
inflate_Curd: 0, 955.509ms
inflate_chqrlie: 0, 385.036ms
fast_inflate_njuffa: 0, 463.212ms
inflate_parallel1: 0, 468.070ms
inflate_parallel2: 0, 570.107ms
inflate_parallel3: 0, 511.741ms
inflate_parallel4: 0, 601.892ms
inflate_parallel5: 0, 506.695ms
inflate_phuclv: 0, 192.431ms
inflate_lut32: 0, 140.968ms
inflate_lut64: 0, 28.776ms
Here is the code:
#include <stdio.h>
#include <stdint.h>
#include <time.h>
static uint64_t inflate(unsigned char a) {
#define BIT_SET(var, pos) ((var) & (1 << (pos)))
uint64_t MASK = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (BIT_SET(a, i))
result |= (MASK << (8 * i));
}
return result;
}
static uint64_t inflate_Curd(unsigned char a) {
uint64_t mask = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (a & 1)
result |= mask;
mask <<= 8;
a >>= 1;
}
return result;
}
uint64_t inflate_chqrlie(uint8_t value) {
uint64_t x = value;
x = (x | (x << 28));
x = (x | (x << 14));
x = (x | (x << 7)) & 0x0101010101010101ULL;
x = (x << 8) - x;
return x;
}
uint64_t fast_inflate_njuffa(uint8_t a) {
const uint64_t spread7 = (1ULL << 42) | (1ULL << 35) | (1ULL << 28) | (1ULL << 21) |
(1ULL << 14) | (1ULL << 7) | (1UL << 0);
const uint64_t byte_lsb = (1ULL << 56) | (1ULL << 48) | (1ULL << 40) | (1ULL << 32) |
(1ULL << 24) | (1ULL << 16) | (1ULL << 8) | (1ULL << 0);
uint64_t r;
/* spread bits to lsbs of each byte */
r = (((uint64_t)(a & 0x7f) * spread7) + ((uint64_t)a << 49));
/* extract the lsbs of all bytes */
r = r & byte_lsb;
/* fill each byte with its lsb */
r = r * 0xff;
return r;
}
// Aki Suuihkonen: 1.265
static uint64_t inflate_parallel1(unsigned char a) {
uint64_t vector = a * 0x0101010101010101ULL;
// replicate the word all over qword
// A5 becomes A5 A5 A5 A5 A5 A5 A5 A5
vector &= 0x8040201008040201; // becomes 80 00 20 00 00 04 00 01 <--
vector += 0x00406070787c7e7f; // becomes 80 40 80 70 78 80 7e 80
// MSB is correct
vector = (vector >> 7) & 0x0101010101010101ULL; // LSB is correct
return vector * 255; // all bits correct
}
// By seizet and then combine: 1.583
static uint64_t inflate_parallel2(unsigned char a) {
uint64_t vector1 = a * 0x0002000800200080ULL;
uint64_t vector2 = a * 0x0000040010004001ULL;
uint64_t vector = (vector1 & 0x0100010001000100ULL) | (vector2 & 0x0001000100010001ULL);
return vector * 255;
}
// Stay in 32 bits as much as possible: 1.006
static uint64_t inflate_parallel3(unsigned char a) {
uint32_t vector1 = (( (a & 0x0F) * 0x00204081) & 0x01010101) * 255;
uint32_t vector2 = ((((a & 0xF0) >> 4) * 0x00204081) & 0x01010101) * 255;
return (((uint64_t)vector2) << 32) | vector1;
}
// Do the common computation in 64 bits: 0.915
static uint64_t inflate_parallel4(unsigned char a) {
uint32_t vector1 = (a & 0x0F) * 0x00204081;
uint32_t vector2 = ((a & 0xF0) >> 4) * 0x00204081;
uint64_t vector = (vector1 | (((uint64_t)vector2) << 32)) & 0x0101010101010101ULL;
return vector * 255;
}
// Some computation is done in 64 bits a little sooner: 0.806
static uint64_t inflate_parallel5(unsigned char a) {
uint32_t vector1 = (a & 0x0F) * 0x00204081;
uint64_t vector2 = (a & 0xF0) * 0x002040810000000ULL;
uint64_t vector = (vector1 | vector2) & 0x0101010101010101ULL;
return vector * 255;
}
static uint64_t inflate_phuclv(uint8_t b) {
uint64_t MAGIC = 0x8040201008040201ULL;
uint64_t MASK = 0x8080808080808080ULL;
return ((MAGIC * b) & MASK) >> 7;
}
static uint32_t const lut_4_32[16] = {
0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF,
};
static uint64_t inflate_lut32(uint8_t b) {
return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
}
static uint64_t lut_8_64[256];
static uint64_t inflate_lut64(uint8_t b) {
return lut_8_64[b];
}
#define ITER 1000000
int main() {
clock_t t;
uint64_t x;
for (int b = 0; b < 256; b++)
lut_8_64[b] = inflate((uint8_t)b);
#define TEST(func) do { \
t = clock(); \
x = 0; \
for (int i = 0; i < ITER; i++) { \
for (int b = 0; b < 256; b++) \
x ^= func((uint8_t)b); \
} \
t = clock() - t; \
printf("%20s: %llu, %.3fms\n", \
#func, x, t * 1000.0 / CLOCKS_PER_SEC); \
} while (0)
TEST(inflate);
TEST(inflate_Curd);
TEST(inflate_chqrlie);
TEST(fast_inflate_njuffa);
TEST(inflate_parallel1);
TEST(inflate_parallel2);
TEST(inflate_parallel3);
TEST(inflate_parallel4);
TEST(inflate_parallel5);
TEST(inflate_phuclv);
TEST(inflate_lut32);
TEST(inflate_lut64);
return 0;
}
Variations on the same theme as Aki's answer. Some of them are faster here, but it may depend on your compiler and target machine (they should be more suitable for superscalar processors than Aki's function, even though they do more work, as there are fewer data dependencies).
// Aki Suuihkonen: 1.265
static uint64_t inflate_parallel1(unsigned char a) {
uint64_t vector = a * 0x0101010101010101ULL;
vector &= 0x8040201008040201;
vector += 0x00406070787c7e7f;
vector = (vector >> 7) & 0x0101010101010101ULL;
return vector * 255;
}
// By seizet and then combine: 1.583
static uint64_t inflate_parallel2(unsigned char a) {
uint64_t vector1 = a * 0x0002000800200080ULL;
uint64_t vector2 = a * 0x0000040010004001ULL;
uint64_t vector = (vector1 & 0x0100010001000100ULL) | (vector2 & 0x0001000100010001ULL);
return vector * 255;
}
// Stay in 32 bits as much as possible: 1.006
static uint64_t inflate_parallel3(unsigned char a) {
uint32_t vector1 = (( (a & 0x0F) * 0x00204081) & 0x01010101) * 255;
uint32_t vector2 = ((((a & 0xF0) >> 4) * 0x00204081) & 0x01010101) * 255;
return (((uint64_t)vector2) << 32) | vector1;
}
// Do the common computation in 64 bits: 0.915
static uint64_t inflate_parallel4(unsigned char a) {
uint32_t vector1 = (a & 0x0F) * 0x00204081;
uint32_t vector2 = ((a & 0xF0) >> 4) * 0x00204081;
uint64_t vector = (vector1 | (((uint64_t)vector2) << 32)) & 0x0101010101010101ULL;
return vector * 255;
}
// Some computation is done in 64 bits a little sooner: 0.806
static uint64_t inflate_parallel5(unsigned char a) {
uint32_t vector1 = (a & 0x0F) * 0x00204081;
uint64_t vector2 = (a & 0xF0) * 0x002040810000000ULL;
uint64_t vector = (vector1 | vector2) & 0x0101010101010101ULL;
return vector * 255;
}
Two minor optimizations: one for testing the bits in the input (a will be destroyed, but this doesn't matter), the other for shifting the mask.
static uint64_t inflate(unsigned char a)
{
uint64_t mask = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (a & 1)
result |= mask;
mask <<= 8;
a >>= 1;
}
return result;
}
Maybe you can also replace the 'for (int i = 0; i < 8; i++)' loop with a 'while (a)' loop.
This works, however, only if the right shift a >>= 1 behaves as an unsigned (logical) shift
(as far as I know, the C standard allows the compiler to implement right shifts of signed operands as either arithmetic or logical).
Otherwise you will have an infinite loop in some cases.
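A sketch of that while-loop variant, keeping the same unsigned char parameter as above:
static uint64_t inflate_while(unsigned char a)
{
    uint64_t mask = 0xFF;
    uint64_t result = 0;
    while (a) {               /* stop as soon as no set bits remain in a */
        if (a & 1)
            result |= mask;
        mask <<= 8;
        a >>= 1;
    }
    return result;
}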
EDIT:
To see the result I compiled both variants with gcc -std=c99 -S source.c.
A quick glance at the resulting assembler outputs shows that the optimization shown above yields ca. 1/3 fewer instructions, most of them inside the loop.
