-
+ 20E3B23C757F4B95F87C9EA28197C703D3B29FB84862C8629F9063B9E00704B15838A4B8FA711655B8F09FBFA157A50915C54219DE49AB75AEE72602F7B12063
ffa/libffa/x86_64_comba.s
(0 . 0)(1 . 85)
67 .intel_mnemonic # GNU as: use Intel instruction mnemonics
68 .intel_syntax noprefix # GNU as: Intel operand order (dst, src), registers written without '%'
69
70 # Register allocation
71 # R8 = A0
72 # R9 = A1
73 # R10 = A2
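74 # R11 = J in col = U in comba (first index of the column, then the running index)
75 # R12 = V (last index of the column, inclusive)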
76 # RBX = N in col = I in comba
77 # RDI = X
78 # RSI = Y
79 # R13 = XY
80 # RCX = upper bound of the foreach-column loops
81 # RAX, RDX = (temporary)
82
# col: accumulate one column of the product and emit one result word.
# Internal helper reached by `call` from x86_64_comba; inputs and outputs are
# passed in fixed registers, not via the SysV argument registers.
# In:   rbx = N (column index), r11 = first J, r12 = V (last J, inclusive),
#       rdi = X, rsi = Y, r13 = XY, r10:r9:r8 = accumulator A2:A1:A0
# Does: A2:A1:A0 += X(J) * Y(N - J) for J = r11 .. V, then XY(N) := A0
# Out:  accumulator shifted down one word (A0 := A1, A1 := A2, A2 := 0)
# Clobbers: rax, rdx, r11, flags
83 col:
84 cmp r11, r12 # exit when J > V (signed comparison)
85 jg col_output # ...
86 lea rdx, [rsi + 8*rbx] # rdx := Y'Address + N*8
87 lea rax, [8*r11] # rax := J*8 (byte offset of J, not J itself)
88 sub rdx, rax # rdx := Y'Address + (N - J)*8
89 mov rdx, [rdx] # rdx := Y(N - J)
90 mov rax, [rdi + 8*r11] # rax := X(j) := *(X'Address + j*8)
91 mul rdx # rdx:rax := rax*rdx (unsigned, 128-bit result)
92 add r8, rax # A0, C := A0 + rax
93 adc r9, rdx # A1, C := A1 + rdx + C
94 adc r10, 0 # A2, [C=0] := A2 + 0 + C
95 inc r11 # J := J + 1
96 jmp col # next term of this column
97 col_output:
98 mov [r13 + 8*rbx], r8 # XY(N) := A0
99 mov r8, r9 # A0 := A1
100 mov r9, r10 # A1 := A2
101 xor r10, r10 # A2 := 0
102 ret
103
# x86_64_comba: column-wise multiplication of X by Y into XY.
# Each result word XY(I) is produced by one call to col, which sums the
# products X(J) * Y(I - J) for that column into the A2:A1:A0 accumulator.
104 # Arguments according to SysV ABI
105 # RDI: X, array of words size X'Size elements
106 # RSI: Y, array of words size X'Size elements
107 # RDX: XY, array of words size 2*X'Size elements
108 # RCX: X'Size, base FZ length (for X and Y).
# The result is delivered only through XY; RAX is not set to a return value.
# RBX, R12 and R13 are saved and restored; RAX, RCX, RDX, R8-R11 and the
# flags are overwritten.
# NOTE(review): with X'Size = 0 both loops are skipped, yet the final store
# still writes XY(0) -- presumably callers never pass 0; confirm.
# NOTE(review): this file carries no .note.GNU-stack marker; check whether
# the build needs one to avoid an executable-stack link warning.
109 .global x86_64_comba
110 x86_64_comba:
111 push rbx # callee-saved registers used below
112 push r12
113 push r13 # 3 pushes + return address = 32 bytes of stack at each `call col`
114 mov r13, rdx # RDX is used by MUL, move XY to a free register
115
116 xor r8, r8 # A0 := 0
117 xor r9, r9 # A1 := 0
118 xor r10, r10 # A2 := 0
119 xor rbx, rbx # I := 0
120
# First half: columns I = 0 .. L-1, each summing J = 0 .. I.
121 loop_1:
122 cmp rbx, rcx # exit when I >= L
123 jge end_loop_1 # ...
124 xor r11, r11 # U := 0
125 mov r12, rbx # V := I
126 call col # XY(I) := column sum; accumulator shifted
127 inc rbx # I := I + 1
128 jmp loop_1
129 end_loop_1:
130
# Second half: columns I = L .. 2*L-2, each summing J = I-L+1 .. L-1.
131 # rbx = L after the previous loop
132 lea r12, [rcx - 1] # V = L - 1
133 mov rcx, r12 # RCX := L - 1
134 shl rcx, 1 # RCX := (L - 1)*2
135 loop_2:
136 cmp rbx, rcx # exit when I > 2*L-2
137 jg end_loop_2 # ...
138 mov r11, rbx # U := I
139 sub r11, r12 # U := I - V := I - L + 1
140 call col # V already set to L - 1
141 inc rbx # I := I + 1
142 jmp loop_2
143 end_loop_2:
144
145 mov [r13 + 8*rbx], r8 # XY(I) := A0 (I = 2*L-1 here when L >= 1: the top word)
146
147 end_comba: # reached by fall-through; no jump in this file targets it
148 pop r13 # restore in reverse order of the pushes
149 pop r12
150 pop rbx
151 ret