raw
ch9_asm_comba           1 .intel_mnemonic
ch9_asm_comba 2 .intel_syntax noprefix
ch9_asm_comba 3
ch9_asm_comba 4 # Register allocation
ch9_asm_comba 5 # R8 = A0
ch9_asm_comba 6 # R9 = A1
ch9_asm_comba 7 # R10 = A2
ch9_asm_comba 8 # R11 = J,U
ch9_asm_comba 9 # R12 = V
ch9_asm_comba 10 # RBX = N in col = I in comba
ch9_asm_comba 11 # RDI = X
ch9_asm_comba 12 # RSI = Y
ch9_asm_comba 13 # R13 = XY
ch9_asm_comba 14 # RCX = upper bound of the foreach-column loops
ch9_asm_comba 15 # RAX, RDX = (temporary)
ch9_asm_comba 16
#-----------------------------------------------------------------------
# col -- sum the partial products of one column of the Comba product
#        and emit that column's result word.
#
# Internal helper called by x86_64_comba. It uses the register
# allocation of this file, not the SysV argument registers.
#
# In:    rdi = X, rsi = Y, r13 = XY   (base addresses, 8-byte words)
#        rbx = N, index of the column being computed
#        r11 = J, first X index of the column (U)
#        r12 = V, last X index of the column
#        r10:r9:r8 = A2:A1:A0, three-word running accumulator
# Out:   XY(N) := A0, then the accumulator is shifted down one word
#        (A0 := A1, A1 := A2, A2 := 0)
#        r11 = V + 1 when at least one term was summed, else unchanged
# Clobb: rax, rdx (when at least one term was summed), flags
#-----------------------------------------------------------------------
col:
        jmp     .Lcol_check             # the loop condition is tested at the bottom

.Lcol_term:
        mov     rax, rbx
        sub     rax, r11                # rax = N - J
        mov     rax, [rsi + 8*rax]      # rax = Y(N - J)
        mul     qword ptr [rdi + 8*r11] # rdx:rax = Y(N - J) * X(J)
        add     r8, rax                 # A2:A1:A0 += rdx:rax, carry rippling upward
        adc     r9, rdx
        adc     r10, 0
        inc     r11                     # J := J + 1

.Lcol_check:
        cmp     r11, r12
        jle     .Lcol_term              # another term while J <= V (signed compare)

        mov     [r13 + 8*rbx], r8       # XY(N) := A0
        mov     r8, r9                  # A0 := A1
        mov     r9, r10                 # A1 := A2
        xor     r10, r10                # A2 := 0
        ret
ch9_asm_comba 37
#-----------------------------------------------------------------------
# x86_64_comba -- Comba (column-wise) multiplication of two equally
#                 sized multi-word integers.
#
# C view:  x86_64_comba(X, Y, XY, L); no return value is set up.
# ABI:     SysV AMD64
# In:      rdi = X,  array of L 8-byte words (only read)
#          rsi = Y,  array of L 8-byte words (only read)
#          rdx = XY, array of 2*L 8-byte words (written)
#          rcx = L,  X'Size: word count of X and of Y
# Out:     XY(0 .. 2*L-1) := X * Y; when L = 0 nothing is written.
# Clobb:   rax, rcx, rdx, r8, r9, r10, r11, flags
#          (rbx, r12, r13 are saved and restored)
#
# Column I of the product is the sum of X(J) * Y(I - J) for J = U .. V.
# The helper `col` adds those terms into the three-word accumulator
# r10:r9:r8, stores the low word to XY(I) and shifts the accumulator
# down one word.
#-----------------------------------------------------------------------
        .global x86_64_comba
x86_64_comba:
        test    rcx, rcx
        jz      .Lcomba_ret             # L = 0: XY has no words, so store nothing

        push    rbx                     # callee-saved registers used below
        push    r12
        push    r13
        mov     r13, rdx                # MUL overwrites RDX, so keep XY in r13

        xor     r8d, r8d                # A0 := 0
        xor     r9d, r9d                # A1 := 0
        xor     r10d, r10d              # A2 := 0
        xor     ebx, ebx                # I  := 0

        # Low half: columns 0 .. L-1, each summing J = 0 .. I.
.Lcomba_low:
        cmp     rbx, rcx
        jae     .Lcomba_low_done        # exit when I >= L (unsigned)
        xor     r11d, r11d              # U := 0
        mov     r12, rbx                # V := I
        call    col
        inc     rbx                     # I := I + 1
        jmp     .Lcomba_low
.Lcomba_low_done:

        # High half: columns L .. 2*L-2, each summing J = I-L+1 .. L-1.
        # rbx = L here; L >= 1, so L - 1 cannot wrap.
        lea     r12, [rcx - 1]          # V   := L - 1 (constant for this loop)
        lea     rcx, [r12 + r12]        # RCX := 2*L - 2, index of the last column
.Lcomba_high:
        cmp     rbx, rcx
        ja      .Lcomba_high_done       # exit when I > 2*L - 2 (unsigned)
        mov     r11, rbx
        sub     r11, r12                # U := I - (L - 1)
        call    col
        inc     rbx                     # I := I + 1
        jmp     .Lcomba_high
.Lcomba_high_done:

        # rbx = 2*L - 1: the top word is what remains in A0.
        mov     [r13 + 8*rbx], r8       # XY(2*L - 1) := A0

        pop     r13
        pop     r12
        pop     rbx
.Lcomba_ret:
        ret