diff -uNr a/ffa/libffa/ffa.gpr b/ffa/libffa/ffa.gpr --- a/ffa/libffa/ffa.gpr 95f8719172dc8f10159e2f6993eb5810d2a3d8db993e3902dbde3c32692a283a32a214e2780b37b3fdee3995fea2054c6df1998cb260755e614842c8f755b8b5 +++ b/ffa/libffa/ffa.gpr bd9b02259a4889e9929f239484fba9bfec4c1736d6f2670818f38d73e9e7c0e6aaca4ee4d4e04637ac60dd6946b012e32b8e87303768f65babd333b1d09e6f38 @@ -24,7 +24,7 @@ type Mode_Type is ("debug", "release"); Mode : Mode_Type := external ("mode", "release"); - for Languages use ("Ada"); + for Languages use ("Ada", "Asm"); for Source_Dirs use ("."); for Library_Dir use "lib"; for Library_Name use "FFA"; diff -uNr a/ffa/libffa/fz_mul.adb b/ffa/libffa/fz_mul.adb --- a/ffa/libffa/fz_mul.adb a819415bc60308fe0b550eee13da2d6d4819064e4d336e9766e65a63768aef24ac7db9f5ac238725587f4660c2992ae17abc85ef5047d62beb04ba2c12d1172a +++ b/ffa/libffa/fz_mul.adb 81a6c2c4ce7f029e8973e0a4ff0632ad921d7f4bdbcd7ba4d48089ddbe69e90ea427434e5efcd5a5588c66bbf9dcb257d56d463049028cf5e093795fa7eaede9 @@ -25,6 +25,24 @@ package body FZ_Mul is + -- Comba's multiplier fastpath: hands X, Y and XY to the assembly routine. (CAUTION: UNBUFFERED) + procedure FZ_Mul_Comba_Fast(X : in FZ; + Y : in FZ; + XY : out FZ) + is + procedure Asm_Comba(X : in FZ; + Y : in FZ; + XY : out FZ; + L : in Word_Index); + pragma Import (C, Asm_Comba, "x86_64_comba_unrolled"); -- defined in x86_64_comba.s + begin + pragma Assert(X'Length = Karatsuba_Thresh and + Y'Length = Karatsuba_Thresh and + XY'Length = 2*Karatsuba_Thresh); -- x86_64_comba_unrolled traps (ud2) on any other L + Asm_Comba(X, Y, XY, X'Length); + end FZ_Mul_Comba_Fast; + + -- Comba's multiplier. 
(CAUTION: UNBUFFERED) procedure FZ_Mul_Comba(X : in FZ; Y : in FZ; @@ -235,11 +253,14 @@ begin - if L <= Karatsuba_Thresh then - - -- Base case: - FZ_Mul_Comba(X, Y, XY); + if L = Karatsuba_Thresh then + -- Optimized case: L is exactly Karatsuba_Thresh, so the assembly Comba is used: + FZ_Mul_Comba_Fast(X, Y, XY); + elsif L < Karatsuba_Thresh then + + -- Base case: L is below Karatsuba_Thresh, so the Ada Comba is used: + FZ_Mul_Comba(X, Y, XY); else -- Recursive case: diff -uNr a/ffa/libffa/fz_mul.ads b/ffa/libffa/fz_mul.ads --- a/ffa/libffa/fz_mul.ads e85e9fc6e391e1332ec7aa9bbf4331bba8e462d5c1996b497696c12bb26097f1a3e4f97a342c868c30534ff648d12c2989875adde9233507a656c2b28742418f +++ b/ffa/libffa/fz_mul.ads 743b846e4ea1054d8afbcf9c28209da3f1e56d273d4b5c6197ba30ea374d788e15a3f9b2abdfcb6d02b6287553b071d3d316c41d765f7343140dabc654c700dc @@ -25,7 +25,9 @@ pragma Pure; -- Karatsuba Threshhold - at or below this many Words, we use Comba mult. - Karatsuba_Thresh : constant Indices := 8; + -- Edit the Karatsuba_Thresh in x86_64_comba.s as well after changing this + -- value; the assembly routine traps when it is called with any other length. + Karatsuba_Thresh : constant Indices := 32; -- Multiply. (CAUTION: UNBUFFERED) procedure FZ_Multiply_Unbuffered(X : in FZ; @@ -33,6 +35,12 @@ XY : out FZ); pragma Inline_Always(FZ_Multiply_Unbuffered); + -- Comba's multiplier in assembly (fastpath); X and Y must be Karatsuba_Thresh long, XY twice that. (CAUTION: UNBUFFERED) + procedure FZ_Mul_Comba_Fast(X : in FZ; + Y : in FZ; + XY : out FZ); + pragma Inline_Always(FZ_Mul_Comba_Fast); + -- Comba's multiplier. 
(CAUTION: UNBUFFERED) procedure FZ_Mul_Comba(X : in FZ; Y : in FZ; diff -uNr a/ffa/libffa/x86_64_comba.s b/ffa/libffa/x86_64_comba.s --- a/ffa/libffa/x86_64_comba.s false +++ b/ffa/libffa/x86_64_comba.s f612c126dc5dc542f01762390c568414a82b05e1e6258daa12abacc3ddb787c6153d4c82bfcfc8a29a13ebda7cecb61204018ed9417a372b1a79d4a19b18c07f @@ -0,0 +1,99 @@ +.intel_mnemonic +.intel_syntax noprefix + +# Fully unrolled Comba multiplier (GNU as, Intel syntax): XY := X * Y for operands of exactly Karatsuba_Thresh 8-byte words + +# Register allocation +# R8 = A0 +# R9 = A1 +# R10 = A2 +# R11 = J,U +# R12 = I in second (higher part) loop of comba +# RBX = N in col = I in comba +# RDI = X +# RSI = Y +# RCX = L at x86_64_comba_unrolled entry, XY later +# RAX, RDX = (temporary); RDX carries XY at entry and is copied to RCX before first use + +.macro gen_col_inner I NIter +.if \NIter - \I +gen_col_inner "(\I + 1)" \NIter +.endif +lea rdx, [rsi + 8*rbx] # one multiply-accumulate step of column N: rdx := Y'Address + N*8 +lea rax, [8*r11] # rax := 8*j +sub rdx, rax # rdx := rdx - j*8 +mov rdx, [rdx] # rdx := Y(N-j) := *(rdx) +mov rax, [rdi + 8*r11] # rax := X(j) := *(X'Address + j*8) +mul rdx # rdx:rax := rax*rdx +add r8, rax # A0, C := A0 + rax +adc r9, rdx # A1, C := A1 + rdx + C +adc r10, 0 # A2, [C=0] := A2 + 0 + C +inc r11 # J := J + 1 +.endm + +.macro col_finish +mov [rcx + 8*rbx], r8 # column done: XY(N) := A0 +mov r8, r9 # A0 := A1 +mov r9, r10 # A1 := A2 +xor r10, r10 # A2 := 0 +inc rbx # N := N + 1 +.endm + +.macro gen_col NIter +gen_col_inner 0 \NIter +col_finish +.endm + +.macro gen_loop_low L +.if \L +gen_loop_low "(\L-1)" +xor r11, r11 # U := 0 (every low column starts at j = 0) +gen_col \L-1 +.endif +.endm + +.macro gen_loop_high_inner I L +.if \L-\I +inc r12 # I := I + 1 +mov r11, r12 # U := I (U in col; every high column starts at j = I) +gen_col "(\L-1-\I)" +gen_loop_high_inner "(\I+1)" \L +.endif +.endm + +.macro gen_loop_high L +gen_loop_high_inner 1 \L +.endm + +.equiv Karatsuba_Thresh, 32 + +# Arguments (the first four System V AMD64 integer-argument registers) +# RDI: X +# RSI: Y +# RDX: XY +# RCX: L +.global x86_64_comba_unrolled +x86_64_comba_unrolled: +push rbx # RBX and R12 are callee-saved in the System V AMD64 ABI +push r12 # restored by the pops before ret + +cmp rcx, Karatsuba_Thresh # the unrolled code handles only L = Karatsuba_Thresh +jne size_fail # any other L traps + +mov rcx, rdx # RCX := XY +xor r12, r12 # I := 0 (index of the high-part loop; incremented before each high column) +xor r8, r8 # A0 := 0 +xor r9, r9 # A1 := 0 
+xor r10, r10 # A2 := 0 +xor rbx, rbx # N := 0; the expansions below emit columns 0..2L-2, then the last col_finish stores XY(2L-1) + +gen_loop_low Karatsuba_Thresh +gen_loop_high Karatsuba_Thresh +col_finish + +pop r12 # undo the two pushes in reverse order +pop rbx +ret + +size_fail: +ud2 # invalid-opcode trap: reached only when L differs from Karatsuba_Thresh