| dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division. |
| |
| dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc. |
| dnl |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or |
| dnl modify it under the terms of the GNU Lesser General Public License as |
| dnl published by the Free Software Foundation; either version 3 of the |
| dnl License, or (at your option) any later version. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, |
| dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
| dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| dnl Lesser General Public License for more details. |
| dnl |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C divisor |
| C odd even |
| C K6: 10.0 12.0 cycles/limb |
| C K6-2: 10.0 11.5 |
| |
| |
| C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, |
| C mp_limb_t divisor); |
| C |
| C A simple divl is used for size==1. This is about 10 cycles faster for an |
| C odd divisor or 20 cycles for an even divisor. |
| C |
| C The loops are quite sensitive to code alignment, speeds should be |
| C rechecked (odd and even divisor, pic and non-pic) if contemplating |
| C changing anything. |
| |
| defframe(PARAM_DIVISOR,16) |
| defframe(PARAM_SIZE, 12) |
| defframe(PARAM_SRC, 8) |
| defframe(PARAM_DST, 4) |
| |
| dnl re-use parameter space |
| define(VAR_INVERSE,`PARAM_DST') |
| |
| TEXT |
| |
| ALIGN(32) |
| PROLOGUE(mpn_divexact_1) |
| deflit(`FRAME',0) |
| |
| movl PARAM_SIZE, %ecx |
| |
| movl PARAM_SRC, %eax |
| xorl %edx, %edx |
| |
| cmpl $1, %ecx |
| jnz L(two_or_more) |
| |
| movl (%eax), %eax |
| |
| divl PARAM_DIVISOR |
| |
| movl PARAM_DST, %ecx |
| movl %eax, (%ecx) |
| |
| ret |
| |
| |
| L(two_or_more): |
| movl PARAM_DIVISOR, %eax |
| pushl %ebx FRAME_pushl() |
| |
| movl PARAM_SRC, %ebx |
| pushl %ebp FRAME_pushl() |
| |
| L(strip_twos): |
| shrl %eax |
| incl %edx C will get shift+1 |
| |
| jnc L(strip_twos) |
| pushl %esi FRAME_pushl() |
| |
| leal 1(%eax,%eax), %esi C d without twos |
| andl $127, %eax C d/2, 7 bits |
| |
| ifdef(`PIC',` |
| LEA( binvert_limb_table, %ebp) |
| Zdisp( movzbl, 0,(%eax,%ebp), %eax) |
| ',` |
| movzbl binvert_limb_table(%eax), %eax C inv 8 bits |
| ') |
| pushl %edi FRAME_pushl() |
| |
| leal (%eax,%eax), %ebp C 2*inv |
| |
| imull %eax, %eax C inv*inv |
| |
| movl PARAM_DST, %edi |
| |
| imull %esi, %eax C inv*inv*d |
| |
| subl %eax, %ebp C inv = 2*inv - inv*inv*d |
| leal (%ebp,%ebp), %eax C 2*inv |
| |
| imull %ebp, %ebp C inv*inv |
| |
| movl %esi, PARAM_DIVISOR C d without twos |
| leal (%ebx,%ecx,4), %ebx C src end |
| |
| imull %esi, %ebp C inv*inv*d |
| |
| leal (%edi,%ecx,4), %edi C dst end |
| negl %ecx C -size |
| |
| subl %ebp, %eax C inv = 2*inv - inv*inv*d |
| subl $1, %edx C shift amount, and clear carry |
| |
| ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB |
| pushl %eax FRAME_pushl() |
| imull PARAM_DIVISOR, %eax |
| cmpl $1, %eax |
| popl %eax FRAME_popl()') |
| |
| movl %eax, VAR_INVERSE |
| jnz L(even) |
| |
| movl (%ebx,%ecx,4), %esi C src low limb |
| jmp L(odd_entry) |
| |
| |
| ALIGN(16) |
| nop C code alignment |
| L(odd_top): |
| C eax scratch |
| C ebx src end |
| C ecx counter, limbs, negative |
| C edx inverse |
| C esi next limb, adjusted for carry |
| C edi dst end |
| C ebp carry bit, 0 or -1 |
| |
| imull %edx, %esi |
| |
| movl PARAM_DIVISOR, %eax |
| movl %esi, -4(%edi,%ecx,4) |
| |
| mull %esi C carry limb in edx |
| |
| subl %ebp, %edx C apply carry bit |
| movl (%ebx,%ecx,4), %esi |
| |
| L(odd_entry): |
| subl %edx, %esi C apply carry limb |
| movl VAR_INVERSE, %edx |
| |
| sbbl %ebp, %ebp C 0 or -1 |
| |
| incl %ecx |
| jnz L(odd_top) |
| |
| |
| imull %edx, %esi |
| |
| movl %esi, -4(%edi,%ecx,4) |
| |
| popl %edi |
| popl %esi |
| |
| popl %ebp |
| popl %ebx |
| |
| ret |
| |
| |
| L(even): |
| C eax |
| C ebx src end |
| C ecx -size |
| C edx twos |
| C esi |
| C edi dst end |
| C ebp |
| |
| xorl %ebp, %ebp |
| Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1] |
| |
| movd %edx, %mm7 |
| movl VAR_INVERSE, %edx |
| |
| addl $2, %ecx |
| psrlq %mm7, %mm0 |
| |
| movd %mm0, %esi |
| jz L(even_two) C if only two limbs |
| |
| |
| C Out-of-order execution is good enough to hide the load/rshift/movd |
| C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12, |
| C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has |
| C been found. Maybe the fact every second movq is unaligned costs the extra |
| C 0.5. |
| |
| L(even_top): |
| C eax scratch |
| C ebx src end |
| C ecx counter, limbs, negative |
| C edx inverse |
| C esi next limb, adjusted for carry |
| C edi dst end |
| C ebp carry bit, 0 or -1 |
| C |
| C mm0 scratch, source limbs |
| C mm7 twos |
| |
| imull %edx, %esi |
| |
| movl %esi, -8(%edi,%ecx,4) |
| movl PARAM_DIVISOR, %eax |
| |
| mull %esi C carry limb in edx |
| |
| movq -4(%ebx,%ecx,4), %mm0 |
| psrlq %mm7, %mm0 |
| |
| movd %mm0, %esi |
| subl %ebp, %edx C apply carry bit |
| |
| subl %edx, %esi C apply carry limb |
| movl VAR_INVERSE, %edx |
| |
| sbbl %ebp, %ebp C 0 or -1 |
| |
| incl %ecx |
| jnz L(even_top) |
| |
| |
| L(even_two): |
| movd -4(%ebx), %mm0 C src high limb |
| psrlq %mm7, %mm0 |
| |
| imull %edx, %esi |
| |
| movl %esi, -8(%edi) |
| movl PARAM_DIVISOR, %eax |
| |
| mull %esi C carry limb in edx |
| |
| movd %mm0, %esi |
| subl %ebp, %edx C apply carry bit |
| |
| movl VAR_INVERSE, %eax |
| subl %edx, %esi C apply carry limb |
| |
| imull %eax, %esi |
| |
| movl %esi, -4(%edi) |
| |
| popl %edi |
| popl %esi |
| |
| popl %ebp |
| popl %ebx |
| |
| emms_or_femms |
| |
| ret |
| |
| EPILOGUE() |