-
+ F612C126DC5DC542F01762390C568414A82B05E1E6258DAA12ABACC3DDB787C6153D4C82BFCFC8A29A13EBDA7CECB61204018ED9417A372B1A79D4A19B18C07F
ffa/libffa/x86_64_comba.s
(0 . 0)(1 . 99)
96 .intel_mnemonic # use Intel instruction mnemonics for the rest of the file
97 .intel_syntax noprefix # Intel operand order (dst, src); registers written without a % prefix
98
99 # unrolled comba variant
100
101 # Register allocation
102 # R8 = A0
103 # R9 = A1
104 # R10 = A2
105 # R11 = J,U
106 # R12 = I in second (higher part) loop of comba
107 # RBX = N in col = I in comba
108 # RDI = X
109 # RSI = Y
110 # RCX = L at x86_64_comba_unrolled entry, XY later
111 # RAX, RDX = (temporary)
112
113 .macro gen_col_inner I NIter # emits (NIter - I + 1) copies of the multiply-accumulate step below; uses RAX, RDX, flags
114 .if \NIter - \I # nonzero => further copies remain to be emitted
115 gen_col_inner "(\I + 1)" \NIter # recursive expansion; every copy is identical, J is advanced at run time
116 .endif
117 lea rdx, [rsi + 8*rbx] # rdx := Y'Address + N*8
118 lea rax, [8*r11] # rax := 8*j
119 sub rdx, rax # rdx := Y'Address + (N - j)*8
120 mov rdx, [rdx] # rdx := Y(N - j)
121 mov rax, [rdi + 8*r11] # rax := X(j) := *(X'Address + j*8)
122 mul rdx # rdx:rax := X(j) * Y(N - j), unsigned 64x64 -> 128
123 add r8, rax # A0, C := A0 + rax
124 adc r9, rdx # A1, C := A1 + rdx + C
125 adc r10, 0 # A2, [C=0] := A2 + 0 + C
126 inc r11 # J := J + 1
127 .endm
128
129 .macro col_finish # store the finished column word, shift the A2:A1:A0 accumulator down one word, advance N
130 mov [rcx + 8*rbx], r8 # XY(N) := A0
131 mov r8, r9 # A0 := A1
132 mov r9, r10 # A1 := A2
133 xor r10, r10 # A2 := 0
134 inc rbx # N := N + 1
135 .endm
136
137 .macro gen_col NIter # one complete column: NIter + 1 products, then col_finish; R11 (J) must be set by the caller
138 gen_col_inner 0 \NIter # unrolled products X(J) * Y(N - J), J advancing by one per product
139 col_finish # write XY(N) and move on to the next column
140 .endm
141
142 .macro gen_loop_low L # emits L columns holding 1, 2, ..., L products respectively, each starting at J = 0
143 .if \L # L = 0 ends the recursion and emits nothing
144 gen_loop_low "(\L-1)" # shorter columns are emitted first
145 xor r11, r11 # U := 0
146 gen_col \L-1 # this column accumulates L products (gen_col emits NIter + 1)
147 .endif
148 .endm
149
150 .macro gen_loop_high_inner I L # emits columns for I, I+1, ..., L-1; the column for I holds L - I products
151 .if \L-\I # zero once I reaches L: recursion stops
152 inc r12 # I := I + 1
153 mov r11, r12 # U := I (U in col)
154 gen_col "(\L-1-\I)" # L - I products, J running from R12 upward
155 gen_loop_high_inner "(\I+1)" \L # next, shorter column
156 .endif
157 .endm
158
159 .macro gen_loop_high L # emits the L - 1 columns of the high half; R12 is incremented before each column
160 gen_loop_high_inner 1 \L # start at I = 1, so the first high column holds L - 1 products
161 .endm
162
163 .equiv Karatsuba_Thresh, 32 # the only operand length L accepted by x86_64_comba_unrolled; also the macro unroll size
164
165 # Arguments
166 # RDI: X
167 # RSI: Y
168 # RDX: XY
169 # RCX: L
170 .global x86_64_comba_unrolled # exported entry point
171 x86_64_comba_unrolled: # In: RDI = X, RSI = Y, RDX = XY, RCX = L; writes the 64-bit words XY(0 .. 2*L-1)
172 push rbx # RBX (N) and R12 (I) are overwritten below; saved here and restored before ret
173 push r12
174
175 cmp rcx, Karatsuba_Thresh # the code below is unrolled for exactly L = Karatsuba_Thresh
176 jne size_fail # any other L traps instead of running the fixed-size code
177
178 mov rcx, rdx # RCX := XY
179 xor r12, r12 # I := 0 (R12 is the counter incremented by the high loop)
180 xor r8, r8 # A0 := 0
181 xor r9, r9 # A1 := 0
182 xor r10, r10 # A2 := 0
183 xor rbx, rbx # N := 0
184
185 gen_loop_low Karatsuba_Thresh # columns 0 .. L-1
186 gen_loop_high Karatsuba_Thresh # columns L .. 2*L-2
187 col_finish # XY(2*L-1) := A0, the carry left over from the last column
188
189 pop r12 # restore in reverse order of the pushes
190 pop rbx
191 ret
192
193 size_fail: # reached only when L /= Karatsuba_Thresh
194 ud2 # raises an invalid-opcode exception; the two pushes above are not undone on this path