dnl AMD64 mpn_sqr_basecase.
dnl Contributed to the GNU project by Torbjorn Granlund.
dnl Copyright 2008, 2009 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C NOTES
C   * This code only handles operands up to SQR_KARATSUBA_THRESHOLD_MAX.  That
C     means we can safely use 32-bit operations for all sizes, unlike in e.g.,
C     mpn_addmul_1.
C   * The jump table could probably be optimized, at least for non-pic.
C   * The special code for n=1,2,3 was quickly written.  It is probably too
C     large and unnecessarily slow.
C   * Consider combining small cases code so that the n=k-1 code jumps into
C     the middle of the n=k code.
C   * Avoid saving registers for small cases code.
C   * Needed variables:
C       n   r11  input size
C       i   r8   work left, initially n
C       j   r9   inner loop count
C           r15  unused
C       v0  r13
C       v1  r14
C       rp  rdi
C       up  rsi
C       w0  rbx
C       w1  rcx
C       w2  rbp
C       w3  r10
C       tp  r12
C       lo  rax
C       hi  rdx
C           rsp
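C The algorithm rests on the identity
C
C   u^2 = sum_{i} u[i]^2 * B^(2i)  +  2 * sum_{i<j} u[i]*u[j] * B^(i+j)
C
C so the off-diagonal triangle is computed just once, into a scratch area
C on the stack, then doubled and combined with the diagonal squares in a
C final pass.  A rough C reference of that structure (a sketch for
C orientation only; sqr_ref and its use of public mpn calls are
C illustrative, the loops in this file are fused differently):
C
C   #include <gmp.h>
C
C   void sqr_ref (mp_ptr rp, mp_srcptr up, mp_size_t n)
C   {
C     mp_limb_t tp[2 * SQR_KARATSUBA_THRESHOLD_MAX];
C     mp_size_t i;
C     for (i = 0; i < 2 * n; i++)           /* clear scratch */
C       tp[i] = 0;
C     for (i = 0; i < n - 1; i++)           /* triangle u[i]*u[j], i < j */
C       tp[n + i] = mpn_addmul_1 (tp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);
C     mpn_lshift (tp, tp, 2 * n, 1);        /* double it */
C     for (i = 0; i < n; i++)               /* diagonal squares */
C       rp[2 * i + 1] = mpn_mul_1 (rp + 2 * i, up + i, 1, up[i]);
C     mpn_add_n (rp, rp, tp, 2 * n);        /* combine */
C   }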
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param', `%rdx')
C We should really trim this, for better spatial locality. Alternatively,
C we could grab the upper part of the stack area, leaving the lower part
C instead of the upper part unused.
define(`SQR_KARATSUBA_THRESHOLD_MAX', 120)
define(`STACK_ALLOC', eval(8*2*SQR_KARATSUBA_THRESHOLD_MAX))
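C With the default bound of 120 this comes to 8*2*120 = 1920 bytes, i.e.,
C room for 2*SQR_KARATSUBA_THRESHOLD_MAX limbs, of which (per the note
C above) only a part is actually used.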
define(`n', `%r11')
define(`tp', `%r12')
define(`i', `%r8')
define(`j', `%r9')
define(`v0', `%r13')
define(`v1', `%r14')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`SPECIAL_CODE_FOR_4',1)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
add $-48, %rsp
mov %rbx, 40(%rsp)
mov %rbp, 32(%rsp)
mov %r12, 24(%rsp)
mov %r13, 16(%rsp)
mov %r14, 8(%rsp)
mov R32(n_param), R32(n) C free original n register (rdx)
mov R32(n_param), R32(%rcx)
and $3, R32(%rcx)
lea 4(%rcx), %rbx
cmp $4, R32(n_param)
cmovg %rbx, %rcx
lea L(jmptab)(%rip), %rax
jmp *(%rax,%rcx,8)
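C Dispatch: for n <= 4 the table index is n mod 4 (so n = 4 selects
C slot 0, the L(4) code), while for n > 4 it is (n mod 4) + 4, selecting
C one of the L(0m4)..L(3m4) entry points of the general code.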
JUMPTABSECT
ALIGN(8)
L(jmptab):
.quad L(4)
.quad L(1)
.quad L(2)
.quad L(3)
.quad L(0m4)
.quad L(1m4)
.quad L(2m4)
.quad L(3m4)
TEXT
L(1): mov (up), %rax
mul %rax
mov %rax, (rp)
mov %rdx, 8(rp)
add $40, %rsp
pop %rbx
ret
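C n = 2, 3, 4: the diagonal squares u[i]^2 go straight to rp[2i]; the
C off-diagonal products u[i]*u[j] (i < j) are formed in registers and
C counted twice, either by adding them in twice (n = 2) or by doubling
C them with an adc chain before adding them to rp (n = 3, 4).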
L(2): mov (up), %rax
mul %rax
mov %rax, (rp)
mov %rdx, %r9
mov 8(up), %rax
mul %rax
mov %rax, %r10
mov %rdx, %r11
mov 8(up), %rax
mov (up), %rbx
mul %rbx
add %rax, %r9
adc %rdx, %r10
adc $0, %r11
add %rax, %r9
mov %r9, 8(rp)
adc %rdx, %r10
mov %r10, 16(rp)
adc $0, %r11
mov %r11, 24(rp)
add $40, %rsp
pop %rbx
ret
L(3): mov (up), %rax
mul %rax
mov %rax, (rp)
mov %rdx, 8(rp)
mov 8(up), %rax
mul %rax
mov %rax, 16(rp)
mov %rdx, 24(rp)
mov 16(up), %rax
mul %rax
mov %rax, 32(rp)
mov %rdx, 40(rp)
mov (up), %rbx
mov 8(up), %rax
mul %rbx
mov %rax, %r8
mov %rdx, %r9
mov 16(up), %rax
mul %rbx
xor R32(%r10), R32(%r10)
add %rax, %r9
adc %rdx, %r10
mov 8(up), %rbx
mov 16(up), %rax
mul %rbx
xor R32(%r11), R32(%r11)
add %rax, %r10
adc %rdx, %r11
add %r8, %r8
adc %r9, %r9
adc %r10, %r10
adc %r11, %r11
mov $0, R32(%rbx)
adc %rbx, %rbx
add %r8, 8(rp)
adc %r9, 16(rp)
adc %r10, 24(rp)
adc %r11, 32(rp)
adc %rbx, 40(rp)
add $40, %rsp
pop %rbx
ret
ifdef(`SPECIAL_CODE_FOR_4',`
L(4): mov (up), %rax
mul %rax
mov %rax, (rp)
mov %rdx, 8(rp)
mov 8(up), %rax
mul %rax
mov %rax, 16(rp)
mov %rdx, 24(rp)
mov 16(up), %rax
mul %rax
mov %rax, 32(rp)
mov %rdx, 40(rp)
mov 24(up), %rax
mul %rax
mov %rax, 48(rp)
mov %rdx, 56(rp)
mov (up), %rbx
mov 8(up), %rax
mul %rbx
mov %rax, %r8
mov %rdx, %r9
mov 16(up), %rax
mul %rbx
xor R32(%r10), R32(%r10)
add %rax, %r9
adc %rdx, %r10
mov 24(up), %rax
mul %rbx
xor R32(%r11), R32(%r11)
add %rax, %r10
adc %rdx, %r11
mov 8(up), %rbx
mov 16(up), %rax
mul %rbx
xor R32(%r12), R32(%r12)
add %rax, %r10
adc %rdx, %r11
adc $0, %r12
mov 24(up), %rax
mul %rbx
add %rax, %r11
adc %rdx, %r12
mov 16(up), %rbx
mov 24(up), %rax
mul %rbx
xor R32(%rbp), R32(%rbp)
add %rax, %r12
adc %rdx, %rbp
add %r8, %r8
adc %r9, %r9
adc %r10, %r10
adc %r11, %r11
adc %r12, %r12
mov $0, R32(%rbx)
adc %rbp, %rbp
adc %rbx, %rbx
add %r8, 8(rp)
adc %r9, 16(rp)
adc %r10, 24(rp)
adc %r11, 32(rp)
adc %r12, 40(rp)
adc %rbp, 48(rp)
adc %rbx, 56(rp)
add $24, %rsp
pop %r12
pop %rbp
pop %rbx
ret
')
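C General case, n >= 5.  The four entry points below are selected by
C n mod 4.  Each first computes one row (mul_1, even n) or two rows
C (mul_2, odd n) of the off-diagonal triangle into the scratch area tp,
C then falls into the addmul_2 corner loop at L(dowhile), which appends
C two rows per half-pass.  The final phase, L(last) followed by
C mpn_sqr_diag_addlsh1, adds the remaining corner and computes
C rp = diagonal squares + 2 * scratch.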
L(0m4): add $-STACK_ALLOC, %rsp
lea (%rsp,n,8), tp C point tp in middle of result operand
lea (up,n,8), up C point up at end of input operand
lea -1(n), i
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
mov $-1, j
sub i, j
lea -24(tp), tp C offset FIXME
mov (up,j,8), v0
mov 8(up,j,8), %rax
mul v0
xor R32(w2), R32(w2)
mov %rax, w0
mov 16(up,j,8), %rax
mov %rdx, w3
jmp L(L3)
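C 4-way unrolled mul_1 loop; the _m3 suffix encodes i mod 4 = 3, which
C the mid-loop entry point L(L3) accounts for (L(2m4) below has an
C analogous copy entered at L(L1)).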
ALIGN(16)
L(mul_1_m3_top):
add %rax, w2
mov w3, (tp,j,8)
mov (up,j,8), %rax
adc %rdx, w1
xor R32(w0), R32(w0)
mul v0
xor R32(w3), R32(w3)
mov w2, 8(tp,j,8)
add %rax, w1
adc %rdx, w0
mov 8(up,j,8), %rax
mov w1, 16(tp,j,8)
xor R32(w2), R32(w2)
mul v0
add %rax, w0
mov 16(up,j,8), %rax
adc %rdx, w3
L(L3): xor R32(w1), R32(w1)
mul v0
add %rax, w3
mov 24(up,j,8), %rax
adc %rdx, w2
mov w0, 24(tp,j,8)
mul v0
add $4, j
js L(mul_1_m3_top)
add %rax, w2
mov w3, (tp)
adc %rdx, w1
mov w2, 8(tp)
mov w1, 16(tp)
lea eval(24+2*8)(tp), tp C tp += 2, undo offset FIXME
ifdef(`SPECIAL_CODE_FOR_4',`',`
cmp $3, R32(i)
je L(last)
')
jmp L(dowhile)
L(1m4): add $-STACK_ALLOC, %rsp
lea (%rsp,n,8), tp C point tp in middle of result operand
lea (up,n,8), up C point up at end of input operand
lea (n), i
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
mov $3, R32(j)
sub i, j
lea 8(up), up C offset FIXME
mov -32(up,j,8), v0 C u0
mov -24(up,j,8), v1 C u1
mov -24(up,j,8), %rax C u1
mul v0 C u0 * u1
mov %rdx, w1
xor R32(w2), R32(w2)
mov %rax, -24(tp,j,8)
jmp L(m0)
ALIGN(16)
L(mul_2_m0_top):
mul v1
add %rax, w0
adc %rdx, w1
mov -24(up,j,8), %rax
mov $0, R32(w2)
mul v0
add %rax, w0
mov -24(up,j,8), %rax
adc %rdx, w1
adc $0, R32(w2)
mul v1 C v1 * u0
add %rax, w1
mov w0, -24(tp,j,8)
adc %rdx, w2
L(m0): mov -16(up,j,8), %rax C u2, u6 ...
mul v0 C u0 * u2
mov $0, R32(w3)
add %rax, w1
adc %rdx, w2
mov -16(up,j,8), %rax
adc $0, R32(w3)
mov $0, R32(w0)
mov w1, -16(tp,j,8)
mul v1
add %rax, w2
mov -8(up,j,8), %rax
adc %rdx, w3
mov $0, R32(w1)
mul v0
add %rax, w2
mov -8(up,j,8), %rax
adc %rdx, w3
adc $0, R32(w0)
mul v1
add %rax, w3
mov w2, -8(tp,j,8)
adc %rdx, w0
mov (up,j,8), %rax
mul v0
add %rax, w3
adc %rdx, w0
adc $0, R32(w1)
add $4, j
mov -32(up,j,8), %rax
mov w3, -32(tp,j,8)
js L(mul_2_m0_top)
mul v1
add %rax, w0
adc %rdx, w1
mov w0, -8(tp)
mov w1, (tp)
lea -8(up), up C undo offset FIXME
lea eval(3*8)(tp), tp C tp += 3
add $-2, R32(i) C i -= 2
cmp $3, R32(i)
je L(last)
jmp L(dowhile)
L(2m4): add $-STACK_ALLOC, %rsp
lea (%rsp,n,8), tp C point tp in middle of result operand
lea (up,n,8), up C point up at end of input operand
lea -1(n), i
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
mov $1, R32(j)
sub i, j
lea -24(tp), tp C offset FIXME
mov -16(up,j,8), v0
mov -8(up,j,8), %rax
mul v0
mov %rax, w2
mov (up,j,8), %rax
mov %rdx, w1
jmp L(L1)
ALIGN(16)
L(mul_1_m1_top):
add %rax, w2
mov w3, (tp,j,8)
mov (up,j,8), %rax
adc %rdx, w1
L(L1): xor R32(w0), R32(w0)
mul v0
xor R32(w3), R32(w3)
mov w2, 8(tp,j,8)
add %rax, w1
adc %rdx, w0
mov 8(up,j,8), %rax
mov w1, 16(tp,j,8)
xor R32(w2), R32(w2)
mul v0
add %rax, w0
mov 16(up,j,8), %rax
adc %rdx, w3
xor R32(w1), R32(w1)
mul v0
add %rax, w3
mov 24(up,j,8), %rax
adc %rdx, w2
mov w0, 24(tp,j,8)
mul v0
add $4, j
js L(mul_1_m1_top)
add %rax, w2
mov w3, (tp)
adc %rdx, w1
mov w2, 8(tp)
mov w1, 16(tp)
lea eval(24+2*8)(tp), tp C tp += 2, undo offset FIXME
jmp L(dowhile_mid)
L(3m4): add $-STACK_ALLOC, %rsp
lea (%rsp,n,8), tp C point tp in middle of result operand
lea (up,n,8), up C point up at end of input operand
lea (n), i
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
mov $1, R32(j)
sub i, j
lea 8(up), up C offset FIXME
mov -16(up,j,8), v0
mov -8(up,j,8), v1
mov -8(up,j,8), %rax
mul v0 C v0 * u0
mov %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
mov %rax, -8(tp,j,8)
jmp L(m2)
ALIGN(16)
L(mul_2_m2_top):
mul v1
add %rax, w0
adc %rdx, w1
mov -24(up,j,8), %rax
mov $0, R32(w2)
mul v0
add %rax, w0
mov -24(up,j,8), %rax
adc %rdx, w1
adc $0, R32(w2)
mul v1 C v1 * u0
add %rax, w1
mov w0, -24(tp,j,8)
adc %rdx, w2
mov -16(up,j,8), %rax
mul v0
mov $0, R32(w3)
add %rax, w1
adc %rdx, w2
mov -16(up,j,8), %rax
adc $0, R32(w3)
mov $0, R32(w0)
mov w1, -16(tp,j,8)
mul v1
add %rax, w2
mov -8(up,j,8), %rax
adc %rdx, w3
mov $0, R32(w1)
mul v0
add %rax, w2
mov -8(up,j,8), %rax
adc %rdx, w3
adc $0, R32(w0)
mul v1
add %rax, w3
mov w2, -8(tp,j,8)
adc %rdx, w0
L(m2): mov (up,j,8), %rax
mul v0
add %rax, w3
adc %rdx, w0
adc $0, R32(w1)
add $4, j
mov -32(up,j,8), %rax
mov w3, -32(tp,j,8)
js L(mul_2_m2_top)
mul v1
add %rax, w0
adc %rdx, w1
mov w0, -8(tp)
mov w1, (tp)
lea -8(up), up C undo offset FIXME
lea eval(3*8)(tp), tp C tp += 3
add $-2, R32(i) C i -= 2
jmp L(dowhile_mid)
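C Main corner loop.  Each half (L(dowhile): addmul_2s_m2, then
C L(dowhile_mid): addmul_2s_m0) adds two more rows of the off-diagonal
C triangle into tp and decreases i by 2; the loop runs until i = 3,
C when L(last) handles the final corner.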
L(dowhile):
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
mov $-1, j
sub i, j
lea -24(tp), tp C offset FIXME
lea -8(up), up C offset FIXME
mov 16(up,j,8), v0
mov 24(up,j,8), v1
mov 24(up,j,8), %rax
mul v0
xor R32(w3), R32(w3)
add %rax, 24(tp,j,8)
adc %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
jmp L(am2)
ALIGN(16)
L(addmul_2_m2_top):
add w3, (tp,j,8)
adc %rax, w0
mov 8(up,j,8), %rax
adc %rdx, w1
mov $0, R32(w2)
mul v0
add %rax, w0
mov 8(up,j,8), %rax
adc %rdx, w1
adc $0, R32(w2)
mul v1 C v1 * u0
add w0, 8(tp,j,8)
adc %rax, w1
adc %rdx, w2
mov 16(up,j,8), %rax
mov $0, R32(w3)
mul v0 C v0 * u1
add %rax, w1
mov 16(up,j,8), %rax
adc %rdx, w2
adc $0, R32(w3)
mul v1 C v1 * u1
add w1, 16(tp,j,8)
adc %rax, w2
mov 24(up,j,8), %rax
adc %rdx, w3
mul v0
mov $0, R32(w0)
add %rax, w2
adc %rdx, w3
mov $0, R32(w1)
mov 24(up,j,8), %rax
adc $0, R32(w0)
mul v1
add w2, 24(tp,j,8)
adc %rax, w3
adc %rdx, w0
L(am2): mov 32(up,j,8), %rax
mul v0
add %rax, w3
mov 32(up,j,8), %rax
adc %rdx, w0
adc $0, R32(w1)
mul v1
add $4, j
js L(addmul_2_m2_top)
add w3, (tp)
adc %rax, w0
adc %rdx, w1
mov w0, 8(tp)
mov w1, 16(tp)
lea eval(2*8)(tp), tp C tp += 2
add $-2, R32(i) C i -= 2
lea 24(tp), tp C undo offset FIXME
lea 8(up), up C undo offset FIXME
L(dowhile_mid):
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
mov $1, R32(j)
sub i, j
lea -24(tp), tp C offset FIXME
lea -8(up), up C offset FIXME
mov (up,j,8), v0
mov 8(up,j,8), v1
mov 8(up,j,8), %rax
mul v0
xor R32(w1), R32(w1)
add %rax, 8(tp,j,8)
adc %rdx, w1
xor R32(w2), R32(w2)
jmp L(20)
ALIGN(16)
L(addmul_2_m0_top):
add w3, (tp,j,8)
adc %rax, w0
mov 8(up,j,8), %rax
adc %rdx, w1
mov $0, R32(w2)
mul v0
add %rax, w0
mov 8(up,j,8), %rax
adc %rdx, w1
adc $0, R32(w2)
mul v1 C v1 * u0
add w0, 8(tp,j,8)
adc %rax, w1
adc %rdx, w2
L(20): mov 16(up,j,8), %rax
mov $0, R32(w3)
mul v0 C v0 * u1
add %rax, w1
mov 16(up,j,8), %rax
adc %rdx, w2
adc $0, R32(w3)
mul v1 C v1 * u1
add w1, 16(tp,j,8)
adc %rax, w2
mov 24(up,j,8), %rax
adc %rdx, w3
mul v0
mov $0, R32(w0)
add %rax, w2
adc %rdx, w3
mov $0, R32(w1)
mov 24(up,j,8), %rax
adc $0, R32(w0)
mul v1
add w2, 24(tp,j,8)
adc %rax, w3
adc %rdx, w0
mov 32(up,j,8), %rax
mul v0
add %rax, w3
mov 32(up,j,8), %rax
adc %rdx, w0
adc $0, R32(w1)
mul v1
add $4, j
js L(addmul_2_m0_top)
add w3, (tp)
adc %rax, w0
adc %rdx, w1
mov w0, 8(tp)
mov w1, 16(tp)
lea 24(tp), tp C undo offset FIXME
lea 8(up), up C undo offset FIXME
lea eval(2*8)(tp), tp C tp += 2
add $-2, R32(i) C i -= 2
cmp $3, R32(i)
jne L(dowhile)
L(last):
C Function mpn_addmul_2s_2
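C Adds the last 2x2 corner of the triangle, i.e., the three products
C among the top limbs u[n-3], u[n-2], u[n-1], into the scratch area.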
mov -24(up), v0
mov -16(up), v1
mov -16(up), %rax
mul v0
xor R32(w3), R32(w3)
add %rax, -32(tp)
adc %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
mov -8(up), %rax
mul v0
add %rax, w3
mov -8(up), %rax
adc %rdx, w0
mul v1
add w3, -24(tp)
adc %rax, w0
adc %rdx, w1
mov w0, -16(tp)
mov w1, -8(tp)
C Function mpn_sqr_diag_addlsh1
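C Final pass: rp[] = diagonal squares u[i]^2 + 2 * tp[].  The scratch
C limbs are doubled on the fly (adc x,x), the carry flag is saved across
C each mul with sbb into rbx/rbp and restored with add, and the
C L(evn)/L(odd) split aligns the 2-way unrolled loop with the parity
C of n.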
mov R32(n), R32(j)
shl $3, n
sub n, up
mov (%rsp), %r11
bt $0, j
lea -4(j,j),j
jc L(odd)
L(evn): lea (rp,j,8), rp
lea (up,j,4), up
lea 8(%rsp,j,8), tp
neg j
add %r11, %r11
sbb R32(%rbx), R32(%rbx) C save CF
mov (up,j,4), %rax
mul %rax
add %rdx, %r11
mov %rax, (rp,j,8)
jmp L(d0)
L(odd): lea -16(rp,j,8), rp
lea -8(up,j,4), up
lea -8(%rsp,j,8), tp
neg j
add %r11, %r11
sbb R32(%rbp), R32(%rbp) C save CF
mov 8(up,j,4), %rax
mul %rax
add %rdx, %r11
mov %rax, 16(rp,j,8)
jmp L(d1)
ALIGN(16)
L(top): mov (up,j,4), %rax
mul %rax
add R32(%rbp), R32(%rbp) C restore carry
adc %rax, %r10
adc %rdx, %r11
mov %r10, (rp,j,8)
L(d0): mov %r11, 8(rp,j,8)
mov (tp,j,8), %r10
adc %r10, %r10
mov 8(tp,j,8), %r11
adc %r11, %r11
nop
sbb R32(%rbp), R32(%rbp) C save CF
mov 8(up,j,4), %rax
mul %rax
add R32(%rbx), R32(%rbx) C restore carry
adc %rax, %r10
adc %rdx, %r11
mov %r10, 16(rp,j,8)
L(d1): mov %r11, 24(rp,j,8)
mov 16(tp,j,8), %r10
adc %r10, %r10
mov 24(tp,j,8), %r11
adc %r11, %r11
sbb R32(%rbx), R32(%rbx) C save CF
add $4, j
js L(top)
L(end): mov (up,j,4), %rax
mul %rax
add R32(%rbp), R32(%rbp) C restore carry
adc %rax, %r10
adc %rdx, %r11
mov %r10, (rp,j,8)
mov %r11, 8(rp,j,8)
mov (tp,j,8), %r10
adc %r10, %r10
sbb R32(%rbp), R32(%rbp) C save CF
neg R32(%rbp)
mov 8(up,j,4), %rax
mul %rax
add R32(%rbx), R32(%rbx) C restore carry
adc %rax, %r10
adc %rbp, %rdx
mov %r10, 16(rp,j,8)
mov %rdx, 24(rp,j,8)
add $eval(8+STACK_ALLOC), %rsp
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
ret
EPILOGUE()