dnl blob: dc8576b2209287f798a27923c6e11fc176183ae5 [file] [log] [blame]
dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C K8,K9: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
C K10: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
C P4: 16.5
C P6-15: 4.35
C This was written quickly and not optimized at all, but it runs very well on
C K8. But perhaps one could get under 3 c/l. Ideas:
C 1) Use indexing to save the 3 LEA
C 2) Write reasonable feed-in code
C 3) Be more clever about register usage
C 4) Unroll more; handling CL negation and carry save/restore costs a lot now
C 5) Reschedule
C INPUT PARAMETERS
C m4 names for the registers holding the five arguments.  The code below
C reads the shift count straight from %r8d, so the last name is not referenced
C by the instructions in this file; it records where the count arrives.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n', `%rcx')
define(`cnt', `%r8')
C mpn_lshsub_n (rp, up, vp, n, shift count in r8d)
C
C Walks n limbs starting at the lowest address.  For each limb it computes
C up[i] - vp[i] - borrow, shifts that difference left by the shift count, ORs
C in the bits shifted out of the previous difference, and stores the result at
C rp[i].
C
C Return value (rax): the bits shifted out of the last difference limb, plus
C the final borrow (0 or 1) shifted left by the shift count.
C
C Register roles after setup:
C   rax      limb counter driving the 4-way loop
C   ebx      saved borrow between passes: 0 = none, all ones = borrow pending
C   cl       shift count, briefly negated to get a right shift by 64 - count
C   r15      high bits of the previous difference, ORed into the next limb
C   r11d     counter for the n mod 4 feed-in limbs, later a data register
C   r8-r11   differences, shifted left
C   r12-r14  unshifted copies of the differences, shifted right
C
C NOTE(review): the count is presumably expected to be 1..63 -- confirm
C against callers; a count of 0 would make the right shifts a no-op.
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshsub_n)
push %r12 C save the callee-saved registers used below
push %r13
push %r14
push %r15
push %rbx
mov n, %rax C copy n out of rcx before ecx is loaded with the shift count
xor %ebx, %ebx C clear carry save register (no borrow pending)
mov %r8d, %ecx C shift count, used through cl
xor %r15d, %r15d C limb carry: nothing shifted in below the first limb
mov %eax, %r11d
and $3, %r11d C r11d = n mod 4, limbs to handle one at a time
je L(4) C none, go straight to the 4-way loop
sub $1, %r11d C bias by one so the loop below can exit on borrow
C Single-limb feed-in loop, runs n mod 4 times.
L(oopette):
add %ebx, %ebx C restore carry flag (all ones + all ones carries out)
mov 0(up), %r8
lea 8(up), up C lea and mov leave the carry flag intact for the sbb
sbb 0(vp), %r8 C r8 = up[i] - vp[i] - borrow
mov %r8, %r12 C keep an unshifted copy for the high bits
sbb %ebx, %ebx C save carry flag as 0 or all ones
shl %cl, %r8
or %r15, %r8 C merge high bits of the previous difference
mov %r12, %r15
lea 8(vp), vp
neg %cl C 64-bit shifts use the count mod 64, so -count acts as 64 - count
shr %cl, %r15 C high bits for the next limb
neg %cl C back to the left shift count
mov %r8, 0(rp)
lea 8(rp), rp
sub $1, %r11d
jnc L(oopette) C loop until the biased counter borrows
C rax still holds n here.  The feed-in loop has already handled n mod 4 limbs,
C so a borrow on the first subtraction means n < 4 and nothing is left.
L(4):
sub $4, %rax
jc L(end)
ALIGN(16)
C Main loop, four limbs per pass.
L(oop):
add %ebx, %ebx C restore carry flag
mov 0(up), %r8
mov 8(up), %r9
mov 16(up), %r10
mov 24(up), %r11
lea 32(up), up
sbb 0(vp), %r8 C borrow chain across the four limbs
mov %r8, %r12 C unshifted copies; mov does not disturb the borrow chain
sbb 8(vp), %r9
mov %r9, %r13
sbb 16(vp), %r10
mov %r10, %r14
sbb 24(vp), %r11
sbb %ebx, %ebx C save carry flag
shl %cl, %r8
shl %cl, %r9
shl %cl, %r10
or %r15, %r8 C high bits carried in from the previous limb
mov %r11, %r15 C unshifted top difference, source of the next carry-in
shl %cl, %r11
lea 32(vp), vp
neg %cl C switch to a right shift by 64 - count
shr %cl, %r12
shr %cl, %r13
shr %cl, %r14
shr %cl, %r15 C used next loop
or %r12, %r9 C each limb takes the high bits of the limb below it
or %r13, %r10
or %r14, %r11
neg %cl C restore the left shift count
mov %r8, 0(rp)
mov %r9, 8(rp)
mov %r10, 16(rp)
mov %r11, 24(rp)
lea 32(rp), rp
sub $4, %rax
jnc L(oop) C continue while at least 4 limbs remained before this pass
C Build the return value from the pending borrow and the last shifted-out bits.
L(end):
neg %ebx C ebx = final borrow as 0 or 1; the 32-bit write clears upper rbx
shl %cl, %rbx C rbx = borrow << count
C For counts 1..63 the shl above shifts only zero bits out of a 0/1 value, so
C CF is clear and the adc acts as a plain add.
adc %r15, %rbx C add the bits shifted out of the last difference limb
mov %rbx, %rax C return value
pop %rbx C restore callee-saved registers in reverse push order
pop %r15
pop %r14
pop %r13
pop %r12
ret
EPILOGUE()