| dnl AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1), |
| dnl optimized for Pentium 4. |
| |
| dnl Copyright 2008 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C K8,K9: 3.8 |
| C K10: 4.8 |
| C P4: 5.8 |
| C P6-15: ? |
| |
| |
| C INPUT PARAMETERS |
| C Symbolic names for the four argument registers used by the routine below. |
| define(`rp',`%rdi') C result limbs are stored through this pointer |
| define(`up',`%rsi') C first source operand, read one 8-byte limb at a time |
| define(`vp',`%rdx') C second source operand, the one that is shifted left by 1 |
| define(`n', `%rcx') C limb count, consumed four at a time by the main loop |
| |
| C Select the operation. When OPERATION_addlsh1_n is defined ADDSUB becomes add |
| C and the entry point is mpn_addlsh1_n. When OPERATION_sublsh1_n is defined |
| C ADDSUB becomes sub and the entry point is mpn_sublsh1_n. |
| C NOTE(review): presumably the build defines exactly one of the two -- confirm. |
| ifdef(`OPERATION_addlsh1_n', ` |
| define(ADDSUB, add) |
| define(func, mpn_addlsh1_n)') |
| ifdef(`OPERATION_sublsh1_n', ` |
| define(ADDSUB, sub) |
| define(func, mpn_sublsh1_n)') |
| |
| C Names both entry points this file can produce. MULFUNC_PROLOGUE itself is |
| C defined outside this file (presumably via config.m4 -- confirm). |
| MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) |
| |
| ASM_START() |
| TEXT |
| ALIGN(16) |
| C func(rp, up, vp, n) -- rp[] = up[] ADDSUB (vp[] << 1) on n 64-bit limbs. |
| C ADDSUB and func are selected by the OPERATION_* ifdefs earlier in this file. |
| C |
| C Result: %eax = last carry (or borrow) + the bit shifted out of the top vp limb. |
| C |
| C Register roles: |
| C %r9 shifted vp limb: 2*vp[i] + top bit of vp[i-1] |
| C %ebp upper 32 bits of a vp limb, reduced to its top bit by shr $31 |
| C %r8 %r10 %r11 %r12 up limbs, then result limbs, rotating through the loop |
| C %rax %rbx carry words (0 or 1), alternating from limb to limb |
| C %rbx %r12 %rbp are callee-saved and are therefore pushed and popped here. |
| C |
| C NOTE(review): vp[0] is loaded before n is examined, and n = 0 would take the |
| C n = 0, 4, 8, ... path, so n >= 1 appears to be required -- confirm with callers. |
| PROLOGUE(func) |
| push %rbx C callee-saved, used as a carry word |
| push %r12 C callee-saved, used for a limb in flight |
| push %rbp C callee-saved, used to carry the vp top bit |
| |
| mov (vp), %r9 |
| shl %r9 C r9 = 2*vp[0]; the bit shifted out is recovered through ebp |
| mov 4(vp), R32(%rbp) C upper 32 bits of vp[0] |
| |
| xor R32(%rbx), R32(%rbx) C rbx = 0 so a later setc on its low byte leaves 0 or 1 |
| |
| mov R32(n), R32(%rax) |
| and $3, R32(%rax) C eax = n mod 4 picks the entry point into the 4-way loop |
| jne L(n00) C n = 0, 4, 8, ... |
| |
| mov (up), %r8 |
| mov 8(up), %r10 |
| shr $31, R32(%rbp) C rbp = top bit of vp[0] |
| ADDSUB %r9, %r8 C limb 0 |
| mov 8(vp), %r9 |
| lea (%rbp,%r9,2), %r9 C r9 = 2*vp[1] + top bit of vp[0] |
| setc R8(%rax) C mov and lea leave the flags alone, so this is the ADDSUB carry |
| mov 12(vp), R32(%rbp) C upper 32 bits of vp[1] |
| lea -16(rp), rp C bias rp to match the store offsets used from L(L00) on |
| jmp L(L00) |
| |
| L(n00): cmp $2, R32(%rax) |
| jnc L(n01) C n = 1, 5, 9, ... |
| mov (up), %r11 |
| lea -8(rp), rp C bias rp: limb 0 is stored at 8(rp) |
| shr $31, R32(%rbp) C rbp = top bit of vp[0] |
| ADDSUB %r9, %r11 C limb 0 |
| setc R8(%rbx) |
| dec n |
| jz L(1) C jump for n = 1 |
| mov 8(up), %r8 |
| mov 8(vp), %r9 |
| lea (%rbp,%r9,2), %r9 C r9 = 2*vp[1] + top bit of vp[0] |
| mov 12(vp), R32(%rbp) C upper 32 bits of vp[1] |
| lea 8(up), up C step past limb 0 so the loop offsets line up |
| lea 8(vp), vp |
| jmp L(L01) |
| |
| C The flags tested here are still those of the cmp at L(n00). |
| L(n01): jne L(n10) C n = 2, 6, 10, ... |
| mov (up), %r12 |
| mov 8(up), %r11 |
| shr $31, R32(%rbp) C rbp = top bit of vp[0] |
| ADDSUB %r9, %r12 C limb 0 |
| mov 8(vp), %r9 |
| lea (%rbp,%r9,2), %r9 C r9 = 2*vp[1] + top bit of vp[0] |
| setc R8(%rax) C eax held 2, so only the low byte changes and rax becomes 0 or 1 |
| mov 12(vp), R32(%rbp) C upper 32 bits of vp[1] |
| lea 16(up), up C two limbs are already loaded |
| lea 16(vp), vp |
| jmp L(L10) |
| |
| C Remaining case: n = 3, 7, 11, ... |
| L(n10): mov (up), %r10 |
| mov 8(up), %r12 |
| shr $31, R32(%rbp) C rbp = top bit of vp[0] |
| ADDSUB %r9, %r10 C limb 0 |
| mov 8(vp), %r9 |
| lea (%rbp,%r9,2), %r9 C r9 = 2*vp[1] + top bit of vp[0] |
| setc R8(%rbx) |
| mov 12(vp), R32(%rbp) C upper 32 bits of vp[1] |
| lea -24(rp), rp C bias rp: limb 0 is stored at 24(rp) |
| lea -8(up), up C bias up and vp: L(L11) reads at offset 24 |
| lea -8(vp), vp |
| jmp L(L11) |
| |
| C Out-of-line ripple fixups. Applying the previous carry word itself carried, |
| C so force the carry word saved for the current limb to 1 and resume. |
| L(c0): mov $1, R8(%rbx) |
| jmp L(rc0) |
| L(c1): mov $1, R8(%rax) |
| jmp L(rc1) |
| L(c2): mov $1, R8(%rbx) |
| jmp L(rc2) |
| |
| C Main loop, four limbs per pass. Each limb gets two ADDSUBs: first the shifted |
| C vp limb (carry saved with setc), then the carry word of the previous limb. |
| C The result limb of the previous step is stored in between. |
| ALIGN(16) |
| L(top): mov (up), %r8 C not on critical path |
| shr $31, R32(%rbp) |
| ADDSUB %r9, %r11 C not on critical path |
| mov (vp), %r9 |
| lea (%rbp,%r9,2), %r9 |
| setc R8(%rbx) C save carry out |
| mov 4(vp), R32(%rbp) |
| mov %r12, (rp) |
| ADDSUB %rax, %r11 C apply previous carry out |
| jc L(c0) C jump if ripple |
| L(rc0): |
| L(L01): mov 8(up), %r10 |
| shr $31, R32(%rbp) |
| ADDSUB %r9, %r8 |
| mov 8(vp), %r9 |
| lea (%rbp,%r9,2), %r9 |
| setc R8(%rax) C save carry out |
| mov 12(vp), R32(%rbp) |
| mov %r11, 8(rp) |
| ADDSUB %rbx, %r8 C apply previous carry out |
| jc L(c1) C jump if ripple |
| L(rc1): |
| L(L00): mov 16(up), %r12 |
| shr $31, R32(%rbp) |
| ADDSUB %r9, %r10 |
| mov 16(vp), %r9 |
| lea (%rbp,%r9,2), %r9 |
| setc R8(%rbx) C save carry out |
| mov 20(vp), R32(%rbp) |
| mov %r8, 16(rp) |
| ADDSUB %rax, %r10 C apply previous carry out |
| jc L(c2) C jump if ripple |
| L(rc2): |
| L(L11): mov 24(up), %r11 |
| shr $31, R32(%rbp) |
| ADDSUB %r9, %r12 |
| mov 24(vp), %r9 |
| lea (%rbp,%r9,2), %r9 |
| lea 32(up), up C lea advances the pointers without touching the carry flag |
| lea 32(vp), vp |
| setc R8(%rax) C save carry out |
| mov -4(vp), R32(%rbp) C vp already advanced: upper 32 bits of the limb just read |
| mov %r10, 24(rp) |
| ADDSUB %rbx, %r12 C apply previous carry out |
| jc L(c3) C jump if ripple |
| L(rc3): lea 32(rp), rp |
| L(L10): sub $4, n |
| ja L(top) C loop only while n was above 4 (unsigned compare) |
| |
| C Wind down: finish the limb held in r11 and store the last two result limbs. |
| L(end): |
| shr $31, R32(%rbp) C rbp = bit shifted out of the top vp limb |
| ADDSUB %r9, %r11 |
| setc R8(%rbx) |
| mov %r12, (rp) |
| ADDSUB %rax, %r11 |
| jnc L(1) |
| mov $1, R8(%rbx) C ripple from the carry word |
| L(1): mov %r11, 8(rp) |
| lea (%rbx,%rbp), R32(%rax) C return value = final carry + shifted-out bit |
| pop %rbp |
| pop %r12 |
| pop %rbx |
| C No emms here: this routine only uses general-purpose integer registers. |
| ret |
| L(c3): mov $1, R8(%rax) |
| jmp L(rc3) |
| EPILOGUE() |
| ASM_END() |