dnl blob: 89e7bed9800431eabc31891ae9ffe2ae4546735c [file] [log] [blame]
dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C K8: 2.167
C P4: 12.0
C P6-15: 4.0
C TODO
C * Perhaps handle various n mod 3 sizes better. The code now is too large.
C INPUT PARAMETERS
C rp: result pointer; the computed limbs are stored through it.
define(`rp', `%rdi')
C ap: first source operand; each limb loaded from it is multiplied by %r8.
define(`ap', `%rsi')
C bp_param: second source operand as passed in.  It is moved to bp on entry,
C since every mul overwrites %rdx with the high half of the product.
define(`bp_param', `%rdx')
C n: limb count; negated on entry and used as an upward-counting index.
define(`n', `%rcx')
C u0, v0: the single-limb multipliers for ap and bp respectively.  The code
C below names these registers directly as %r8 and %r9, not via the macros.
define(`u0', `%r8')
define(`v0', `%r9')
C bp: working copy of bp_param, used for all loads of the second operand.
define(`bp', `%rbp')
C mpn_addaddmul_1msb0 (rp, ap, bp, n, u0, v0)
C
C Stores the n low limbs of {ap,n} * u0 + {bp,n} * v0 at rp and returns the
C remaining most significant limb in %rax.
C
C The file header requires u0,v0 < 2^63.  No carry out of the high word of
C a column is ever tested below, so the code relies on that bound to keep
C carry + ap[i]*u0 + bp[i]*v0 within two limbs.
C
C n = 0 is not handled: the first limbs of ap and bp are loaded before n is
C examined.
C
C Register use:
C   rp, ap, bp   operand pointers, moved past the last limb and then indexed
C                with the negative counter n
C   n            negative limb index, stepped by 3 towards zero
C   %r8, %r9     multipliers u0 and v0
C   %r10-%r12    rotating accumulators for the column sums
C   %rax, %rdx   mul operand/low product and high product
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_addaddmul_1msb0)
C Save %r12 and %rbp (callee-saved in the System V AMD64 ABI); both are
C restored before each ret below.
push %r12
push %rbp
C Point ap, bp and rp just past their last limb and negate n, so that
C (ptr,n,8) walks the operands upwards while n counts towards zero.
lea (ap,n,8), ap
lea (bp_param,n,8), bp
lea (rp,n,8), rp
neg n
C Start the first column: %r10:%r12 = ap[0] * u0, %rax = bp[0].
mov (ap,n,8), %rax
mul %r8
mov %rax, %r12
mov (bp,n,8), %rax
mov %rdx, %r10
C Bias n by 3.  If it is already non-negative there are at most 3 limbs in
C total and the unrolled loop is skipped.
add $3, n
jns L(end)
ALIGN(16)
C Main loop, 3 limbs per iteration.
C On entry: %r10:%r12 = pending ap limb * u0 plus the carry from the previous
C column, %rax = the matching bp limb.  The accumulator pair rotates
C %r10:%r12 -> %r11:%r10 -> %r12:%r11 -> %r10:%r12.
C Loads for the next product are interleaved between each add and its adc;
C mov does not modify the flags, so the carry survives.  For the same reason
C the accumulators are cleared with mov $0 and not with xor.
C
C First limb of the group: add bp*v0 and store the finished low word.
L(top): mul %r9
add %rax, %r12
mov -16(ap,n,8), %rax
adc %rdx, %r10
mov %r12, -24(rp,n,8)
C Second limb: %r11:%r10 = carry + ap*u0 + bp*v0.
mul %r8
add %rax, %r10
mov -16(bp,n,8), %rax
mov $0, %r11d
adc %rdx, %r11
mul %r9
add %rax, %r10
mov -8(ap,n,8), %rax
adc %rdx, %r11
mov %r10, -16(rp,n,8)
C Third limb: %r12:%r11 = carry + ap*u0 + bp*v0.
mul %r8
add %rax, %r11
mov -8(bp,n,8), %rax
mov $0, %r12d
adc %rdx, %r12
mul %r9
add %rax, %r11
adc %rdx, %r12
C Look ahead: begin the first column of the next group (n is still
C negative here, so this limb exists) and re-establish the entry state.
mov (ap,n,8), %rax
mul %r8
add %rax, %r12
mov %r11, -8(rp,n,8)
mov (bp,n,8), %rax
mov $0, %r10d
adc %rdx, %r10
add $3, n
js L(top)
C Wind-down.  n is now 0, 1 or 2, leaving 3 - n limbs to finish.  As at
C L(top), %r10:%r12 holds the pending ap product plus carry and %rax holds
C the matching bp limb.
L(end): cmp $1, R32(n)
C n = 2: one limb left.
ja 2f
C n = 1: two limbs left.
jz 1f
C n = 0: three limbs left.  Same sequence as the loop body, minus the
C look-ahead for a following group.
mul %r9
add %rax, %r12
mov -16(ap), %rax
adc %rdx, %r10
mov %r12, -24(rp)
mul %r8
add %rax, %r10
mov -16(bp), %rax
mov $0, %r11d
adc %rdx, %r11
mul %r9
add %rax, %r10
mov -8(ap), %rax
adc %rdx, %r11
mov %r10, -16(rp)
mul %r8
add %rax, %r11
mov -8(bp), %rax
mov $0, %r12d
adc %rdx, %r12
mul %r9
add %rax, %r11
adc %rdx, %r12
mov %r11, -8(rp)
C Return the final high word.
mov %r12, %rax
pop %rbp
pop %r12
ret
C Two limbs left: finish the pending column, then one more full column.
1: mul %r9
add %rax, %r12
mov -8(ap), %rax
adc %rdx, %r10
mov %r12, -16(rp)
mul %r8
add %rax, %r10
mov -8(bp), %rax
mov $0, %r11d
adc %rdx, %r11
mul %r9
add %rax, %r10
adc %rdx, %r11
mov %r10, -8(rp)
C Return the final high word.
mov %r11, %rax
pop %rbp
pop %r12
ret
C One limb left: add bp*v0 into the pending column, store the low word
C and return the high word.
2: mul %r9
add %rax, %r12
mov %r12, -8(rp)
adc %rdx, %r10
mov %r10, %rax
pop %rbp
pop %r12
ret
EPILOGUE()