dnl blob: a25c74ebdcfd072255bc61d4d26b411f82a61b43 [file] [log] [blame]
dnl AMD64 mpn_addmul_1 and mpn_submul_1.
dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C K8,K9: 2.5
C K10: 2.5
C P4: 14.9
C P6-15 (Core2): 5.09
C P6-28 (Atom): 21.3
C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C TODO:
C * The inner loop is great, but the prologue and epilogue code was
C quickly written. Tune it!
C INPUT PARAMETERS
C Symbolic names for the four incoming operands.  The registers chosen are
C the first four integer argument registers of the System V AMD64 calling
C convention (rdi, rsi, rdx, rcx).
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param',`%rdx')
define(`vl', `%rcx')
C Working copy of the limb count.  The count cannot stay in rdx because every
C mul instruction overwrites rdx with the high half of its product.
define(`n', `%r11')
C Pick the accumulate instruction (ADDSUB) and the exported symbol name (func)
C according to which OPERATION_* symbol is defined.  Those symbols are not
C defined in this file; presumably the build system supplies exactly one of
C them per object file (TODO confirm).
ifdef(`OPERATION_addmul_1',`
define(`ADDSUB', `add')
define(`func', `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
define(`ADDSUB', `sub')
define(`func', `mpn_submul_1')
')
C Lists the two function names this source can provide.  The macro is defined
C outside this file; NOTE(review): confirm how the build consumes it.
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
C Start of the emitted assembly: the function goes in the text section and its
C entry point is aligned to a 16-byte boundary.
ASM_START()
TEXT
ALIGN(16)
C func (rp, up, n_param, vl)
C
C Multiply each of the n_param limbs at up by the single limb vl and ADDSUB
C the resulting product into the limbs at rp, in place.  The most significant
C product limb plus the final carry (add) or borrow (sub) is returned in rax.
C
C In:    rp       destination limbs, read and written
C        up       source limbs, only read
C        n_param  number of limbs
C        vl       multiplier limb
C Out:   rax      carry-out limb
C Uses:  rax, rdx  implicit operands and results of mul
C        r8, r9, r10, rbx  rotating accumulators for pending result limbs
C        n (r11)  limb index, negative and counting up towards zero
C        rbx is saved on the stack on entry and restored before returning.
C
C The first source limb is loaded and multiplied before the count is looked
C at, so the code appears to require n_param >= 1.
C NOTE(review): confirm against the documented contract of these functions.
C
C Structure: the main loop is unrolled four ways and handles four limbs per
C iteration.  The value of n_param mod 4 selects one of four feed-in
C sequences (b0, b1, b2, b3).  Each one biases up and rp so that the negated
C count can be used as an index relative to the end of the operands, places
C the first product in the accumulators expected at its entry point, and
C jumps into the loop at L0, L1, L2 or L3 respectively.
C
C Flags: every result limb is produced by one ADDSUB into memory followed by
C two adc instructions.  Only mov instructions, which leave the flags alone,
C are placed inside such a chain, and each chain is finished before the next
C mul, which overwrites the flags.
PROLOGUE(func)
mov (up), %rax C read first u limb early
push %rbx
mov n_param, %rbx C move away n from rdx, mul uses it
mul vl
mov %rbx, %r11
C Dispatch on the count modulo 4.  rdx:rax holds the first product.
and $3, R32(%rbx)
jz L(b0)
cmp $2, R32(%rbx)
jz L(b2)
jg L(b3)
C Count is 1 mod 4.  With exactly one limb the result is finished right here:
C ADDSUB the low product limb and let the code at ret fold the carry into
C the high limb.
L(b1): dec n
jne L(gt1)
ADDSUB %rax, (rp)
jmp L(ret)
C Count is 1 mod 4 and at least 5.  First product goes to r9 (low) and r8
C (high), the second source limb is fetched, and the loop is entered at L1.
L(gt1): lea 8(up,n,8), up
lea -8(rp,n,8), rp
neg n
xor %r10, %r10
xor R32(%rbx), R32(%rbx)
mov %rax, %r9
mov (up,n,8), %rax
mov %rdx, %r8
jmp L(L1)
C Count is 0 mod 4.  First product goes to r8 (low) and rbx (high); enter the
C loop at L0.
L(b0): lea (up,n,8), up
lea -16(rp,n,8), rp
neg n
xor %r10, %r10
mov %rax, %r8
mov %rdx, %rbx
jmp L(L0)
C Count is 3 mod 4.  First product goes to rbx (low) and r10 (high); enter the
C loop at L3, which clears the remaining accumulators itself.
L(b3): lea -8(up,n,8), up
lea -24(rp,n,8), rp
neg n
mov %rax, %rbx
mov %rdx, %r10
jmp L(L3)
C Count is 2 mod 4.  First product goes to r10 (low) and r9 (high), the second
C source limb is fetched, and the loop is entered at L2, just before its mul.
L(b2): lea -16(up,n,8), up
lea -32(rp,n,8), rp
neg n
xor %r8, %r8
xor R32(%rbx), R32(%rbx)
mov %rax, %r10
mov 24(up,n,8), %rax
mov %rdx, %r9
jmp L(L2)
C Main loop.  Each of the four sections stores one result limb with ADDSUB
C and then uses two adc instructions to fold the carry and the newest product
C rdx:rax into the accumulators for the following two limbs.
ALIGN(16)
L(top): ADDSUB %r10, (rp,n,8)
adc %rax, %r9
mov (up,n,8), %rax
adc %rdx, %r8
mov $0, %r10d
L(L1): mul vl
ADDSUB %r9, 8(rp,n,8)
adc %rax, %r8
adc %rdx, %rbx
L(L0): mov 8(up,n,8), %rax
mul vl
ADDSUB %r8, 16(rp,n,8)
adc %rax, %rbx
adc %rdx, %r10
L(L3): mov 16(up,n,8), %rax
mul vl
ADDSUB %rbx, 24(rp,n,8)
C The accumulators are cleared with mov rather than xor in this section
C because the carry from the ADDSUB above is still needed by the adc
C instructions below.
mov $0, %r8d # zero
mov %r8, %rbx # zero
adc %rax, %r10
mov 24(up,n,8), %rax
mov %r8, %r9 # zero
adc %rdx, %r9
L(L2): mul vl
C Advance the index by four limbs; it stays negative while limbs remain.
add $4, n
js L(top)
C Wind-down.  Two result limbs are still pending: the one in r10, and the one
C formed in r9 from the last product.  r8 is zero here on every path, so the
C adc with r8 only adds the carry to the final high limb in rdx.
ADDSUB %r10, (rp,n,8)
adc %rax, %r9
adc %r8, %rdx
ADDSUB %r9, 8(rp,n,8)
C Fold the last carry or borrow into the high limb and return it in rax.
L(ret): adc $0, %rdx
mov %rdx, %rax
pop %rbx
ret
EPILOGUE()