diff -uNr a/ffa/libffa/ffa.gpr b/ffa/libffa/ffa.gpr
--- a/ffa/libffa/ffa.gpr 92d0220a48f6753fb699220db8eba867b4f4354291c3e4ebd6b3048982524e93f82356d40b8bf2371f7c6768545fe2ee0e531ebd70144cbd8213a6d8c84e727e
+++ b/ffa/libffa/ffa.gpr 9ddfc20107335ae2ca3f5b44ed95de89b6127a56c29429f652bd3f3632f8b70433a8110023d66cf000ba8214108428c2ebb3bf5c9eba346cbcc99889cf60124a
@@ -24,7 +24,7 @@
    type Mode_Type is ("debug", "release");
    Mode : Mode_Type := external ("mode", "release");
 
-   for Languages use ("Ada");
+   for Languages use ("Ada", "Asm");
    for Source_Dirs use (".");
    for Library_Dir use "lib";
    for Library_Name use "FFA";
diff -uNr a/ffa/libffa/fz_mul.adb b/ffa/libffa/fz_mul.adb
--- a/ffa/libffa/fz_mul.adb 5cb9ecb938f842b7c34ca7edf99fb92502986d7f660b223659c9b19c6008a24c3be6de8135db37e29e7ff278a451b68c61f6b57d8e404372dcd3fd6d4320ea0a
+++ b/ffa/libffa/fz_mul.adb bfad12cbd645042ea8d0121994c38af1a8945423cfd535af80a2762d506bef3d30ed8ecc7a6988eca6788789a6d36c25e3fe505c49f478afe227ab8d9d51821b
@@ -29,6 +29,32 @@
                           Y : in FZ;
                           XY_Lo : out FZ;
                           XY_Hi : out FZ) is
+
+      -- Words in each multiplicand
+      L : constant Word_Index := X'Length;
+
+      -- Length of Product, i.e. double the length of either multiplicand
+      LP : constant Word_Index := 2 * L;
+
+      -- Register holding Product; indexed from zero
+      XY : FZ(0 .. LP - 1);
+
+      procedure Asm_Comba(X : in FZ;
+                          Y : in FZ;
+                          XY : out FZ;
+                          X_Size : in Word_Count);
+      pragma Import (C, Asm_Comba, "x86_64_comba");
+   begin
+      Asm_Comba(X, Y, XY, L);
+      XY_Lo := XY(0 .. L - 1);
+      XY_Hi := XY(L .. XY'Last);
+   end FZ_Mul_Comba;
+
+   -- Comba's multiplier.
+   procedure FZ_Mul_Comba_C(X : in FZ;
+                            Y : in FZ;
+                            XY_Lo : out FZ;
+                            XY_Hi : out FZ) is
 
       -- Words in each multiplicand
       L : constant Word_Index := X'Length;
@@ -119,7 +145,7 @@
 
       XY_Lo := XY(0 .. L - 1);
       XY_Hi := XY(L .. XY'Last);
-   end FZ_Mul_Comba;
+   end FZ_Mul_Comba_C;
    pragma Inline_Always(FZ_Mul_Comba);
 
 end FZ_Mul;
diff -uNr a/ffa/libffa/x86_64_comba.s b/ffa/libffa/x86_64_comba.s
--- a/ffa/libffa/x86_64_comba.s false
+++ b/ffa/libffa/x86_64_comba.s 20e3b23c757f4b95f87c9ea28197c703d3b29fb84862c8629f9063b9e00704b15838a4b8fa711655b8f09fbfa157a50915c54219de49ab75aee72602f7b12063
@@ -0,0 +1,85 @@
+.intel_mnemonic
+.intel_syntax noprefix
+# Comba multiplier for x86-64; exported as x86_64_comba (imported on the Ada side as Asm_Comba).
+# Register allocation
+# R8 = A0, low word of the three-word column accumulator
+# R9 = A1, middle word of the accumulator
+# R10 = A2, high word of the accumulator
+# R11 = J in col (index into X); the caller loads it with U, the first J of the column
+# R12 = V, the last J of the column (inclusive)
+# RBX = N in col = I in comba: the column number, i.e. the index of the XY word being produced
+# RDI = X (address of the first multiplicand)
+# RSI = Y (address of the second multiplicand)
+# R13 = XY (address of the product; copied out of RDX on entry)
+# RCX = upper bound of the foreach-column loops
+# RAX, RDX = (temporary; MUL operands and result)
+# col: A2:A1:A0 += X(J)*Y(N-J) for J = U..V; then XY(N) := A0 and the accumulator shifts down a word.
+col:
+cmp r11, r12 # exit when J > V (signed compare)
+jg col_output # ...
+lea rdx, [rsi + 8*rbx] # rdx := Y'Address + N*8
+lea rax, [8*r11] # rax := J*8 (byte offset, not J itself)
+sub rdx, rax # rdx := Y'Address + (N - J)*8
+mov rdx, [rdx] # rdx := Y(N - J)
+mov rax, [rdi + 8*r11] # rax := X(J) = *(X'Address + J*8)
+mul rdx # rdx:rax := rax*rdx (unsigned, 128-bit result)
+add r8, rax # A0, C := A0 + rax
+adc r9, rdx # A1, C := A1 + rdx + C
+adc r10, 0 # A2, [C=0] := A2 + 0 + C
+inc r11 # J := J + 1
+jmp col # back to the J > V test
+col_output:
+mov [r13 + 8*rbx], r8 # XY(N) := A0
+mov r8, r9 # A0 := A1
+mov r9, r10 # A1 := A2
+xor r10, r10 # A2 := 0
+ret
+# x86_64_comba: stores the 2*L-word product of X and Y into XY, where L is the count passed in RCX.
+# Arguments according to SysV ABI
+# RDI: X, array of words size X'Size elements
+# RSI: Y, array of words size X'Size elements
+# RDX: XY, array of words size 2*X'Size elements
+# RCX: X'Size, base FZ length (for X and Y), counted in 8-byte words.
+.global x86_64_comba
+x86_64_comba:
+push rbx # RBX, R12, R13 are callee-saved under SysV; popped at end_comba
+push r12
+push r13
+mov r13, rdx # RDX is used by MUL, move XY to a free register
+# NOTE(review): RCX is read as a full 64-bit value below; confirm the caller passes X_Size that wide.
+xor r8, r8 # A0 := 0
+xor r9, r9 # A1 := 0
+xor r10, r10 # A2 := 0
+xor rbx, rbx # I := 0
+# First half: columns I = 0 .. L-1, each summing terms J = 0 .. I.
+loop_1:
+cmp rbx, rcx # exit when I >= L (signed compare)
+jge end_loop_1 # ...
+xor r11, r11 # U := 0
+mov r12, rbx # V := I
+call col # XY(I) := low word of column I; the rest carries over
+inc rbx # I := I + 1
+jmp loop_1
+end_loop_1:
+# Second half: columns I = L .. 2*L-2, each summing terms J = I-L+1 .. L-1.
+# rbx = L after the previous loop
+lea r12, [rcx - 1] # V := L - 1
+mov rcx, r12 # RCX := L - 1
+shl rcx, 1 # RCX := (L - 1)*2
+loop_2:
+cmp rbx, rcx # exit when I > 2*L-2 (signed compare)
+jg end_loop_2 # ...
+mov r11, rbx # U := I
+sub r11, r12 # U := I - V := I - L + 1
+call col # V already set to L - 1
+inc rbx # I := I + 1
+jmp loop_2
+end_loop_2:
+# I = 2*L-1 here (for L >= 1); what is left in A0 becomes the top word of XY.
+mov [r13 + 8*rbx], r8 # XY(I) := A0
+# Restore the callee-saved registers in reverse push order.
+end_comba:
+pop r13
+pop r12
+pop rbx
+ret