| dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd. |
| |
| dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc. |
| dnl |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or |
| dnl modify it under the terms of the GNU Lesser General Public License as |
| dnl published by the Free Software Foundation; either version 3 of the |
| dnl License, or (at your option) any later version. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, |
| dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
| dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| dnl Lesser General Public License for more details. |
| dnl |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C K6: 9.5 cycles/bit (approx) 1x1 gcd |
| C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) |
| |
| |
| C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y); |
| C |
| C This code is nothing very special, but offers a speedup over what gcc 2.95 |
| C can do with mpn/generic/gcd_1.c. |
| C |
| C Future: |
| C |
| C Using a lookup table to count trailing zeros seems a touch quicker, but |
| C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used |
| C it too. |
| |
| |
| dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits |
| dnl bigger than y, then a division x%y is done to reduce it. |
| dnl |
| dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so |
| dnl there should be an advantage in the divl at about 4 or 5 bits, which is |
| dnl what's found. |
| |
| deflit(DIV_THRESHOLD, 5) |
| |
| |
| defframe(PARAM_LIMB, 12) |
| defframe(PARAM_SIZE, 8) |
| defframe(PARAM_SRC, 4) |
| |
| TEXT |
| ALIGN(16) |
| |
| PROLOGUE(mpn_gcd_1) |
| deflit(`FRAME',0) |
| |
| ASSERT(ne, `cmpl $0, PARAM_LIMB') |
| ASSERT(ae, `cmpl $1, PARAM_SIZE') |
| |
| |
| movl PARAM_SRC, %eax |
| pushl %ebx FRAME_pushl() |
| |
| movl PARAM_LIMB, %edx |
| movl $-1, %ecx |
| |
| movl (%eax), %ebx C src low limb |
| |
| movl %ebx, %eax C src low limb |
| orl %edx, %ebx |
| |
| L(common_twos): |
| shrl %ebx |
| incl %ecx |
| |
| jnc L(common_twos) C 1/4 chance on random data |
| shrl %cl, %edx C y |
| |
| cmpl $1, PARAM_SIZE |
| ja L(size_two_or_more) |
| |
| |
| ASSERT(nz, `orl %eax, %eax') C should have src limb != 0 |
| |
| shrl %cl, %eax C x |
| |
| |
| C Swap if necessary to make x>=y. Measures a touch quicker as a |
| C jump than a branch free calculation. |
| C |
| C eax x |
| C ebx |
| C ecx common twos |
| C edx y |
| |
| movl %eax, %ebx |
| cmpl %eax, %edx |
| |
| jb L(noswap) |
| movl %edx, %eax |
| |
| movl %ebx, %edx |
| movl %eax, %ebx |
| L(noswap): |
| |
| |
| C See if it's worth reducing x with a divl. |
| C |
| C eax x |
| C ebx x |
| C ecx common twos |
| C edx y |
| |
| shrl $DIV_THRESHOLD, %ebx |
| |
| cmpl %ebx, %edx |
| ja L(nodiv) |
| |
| |
| C Reduce x to x%y. |
| C |
| C eax x |
| C ebx |
| C ecx common twos |
| C edx y |
| |
| movl %edx, %ebx |
| xorl %edx, %edx |
| |
| divl %ebx |
| |
| orl %edx, %edx C y |
| nop C code alignment |
| |
| movl %ebx, %eax C x |
| jz L(done_shll) |
| L(nodiv): |
| |
| |
| C eax x |
| C ebx |
| C ecx common twos |
| C edx y |
| C esi |
| C edi |
| C ebp |
| |
| L(strip_y): |
| shrl %edx |
| jnc L(strip_y) |
| |
| leal 1(%edx,%edx), %edx |
| movl %ecx, %ebx C common twos |
| |
| leal 1(%eax), %ecx |
| jmp L(strip_x_and) |
| |
| |
| C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch |
| C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is |
| C only 1/4. |
| C |
| C A second computed %cl shift was tried, but that measured a touch slower |
| C than branching back. |
| C |
| C A branch-free abs(x-y) and min(x,y) calculation was tried, but that |
| C measured about 1 cycle/bit slower. |
| |
| C eax x |
| C ebx common twos |
| C ecx scratch |
| C edx y |
| |
| ALIGN(4) |
| L(swap): |
| addl %eax, %edx C x-y+y = x |
| negl %eax C -(x-y) = y-x |
| |
| L(strip_x): |
| shrl %eax C odd-odd = even, so always one to strip |
| ASSERT(nz) |
| |
| L(strip_x_leal): |
| leal 1(%eax), %ecx |
| |
| L(strip_x_and): |
| andl $1, %ecx C (x^1)&1 |
| |
| shrl %cl, %eax C shift if x even |
| |
| testb $1, %al |
| jz L(strip_x) |
| |
| ASSERT(nz,`testl $1, %eax') C x, y odd |
| ASSERT(nz,`testl $1, %edx') |
| |
| subl %edx, %eax |
| jb L(swap) |
| ja L(strip_x) |
| |
| |
| movl %edx, %eax |
| movl %ebx, %ecx |
| |
| L(done_shll): |
| shll %cl, %eax |
| popl %ebx |
| |
| ret |
| |
| |
| C ----------------------------------------------------------------------------- |
| C Two or more limbs. |
| C |
| C x={src,size} is reduced modulo y using either a plain mod_1 style |
| C remainder, or a modexact_1 style exact division. |
| |
| deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4)) |
| |
| ALIGN(8) |
| L(size_two_or_more): |
| C eax |
| C ebx |
| C ecx common twos |
| C edx y, without common twos |
| C esi |
| C edi |
| C ebp |
| |
| deflit(FRAME_TWO_OR_MORE, FRAME) |
| |
| pushl %edi defframe_pushl(SAVE_EDI) |
| movl PARAM_SRC, %ebx |
| |
| L(y_twos): |
| shrl %edx |
| jnc L(y_twos) |
| |
| movl %ecx, %edi C common twos |
| movl PARAM_SIZE, %ecx |
| |
| pushl %esi defframe_pushl(SAVE_ESI) |
| leal 1(%edx,%edx), %esi C y (odd) |
| |
| movl -4(%ebx,%ecx,4), %eax C src high limb |
| |
| cmpl %edx, %eax C carry if high<divisor |
| |
| sbbl %edx, %edx C -1 if high<divisor |
| |
| addl %edx, %ecx C skip one limb if high<divisor |
| andl %eax, %edx |
| |
| cmpl $MODEXACT_THRESHOLD, %ecx |
| jae L(modexact) |
| |
| |
| L(divide_top): |
| C eax scratch (quotient) |
| C ebx src |
| C ecx counter, size-1 to 1 |
| C edx carry (remainder) |
| C esi divisor (odd) |
| C edi |
| C ebp |
| |
| movl -4(%ebx,%ecx,4), %eax |
| divl %esi |
| loop L(divide_top) |
| |
| |
| movl %edx, %eax C x |
| movl %esi, %edx C y (odd) |
| |
| movl %edi, %ebx C common twos |
| popl %esi |
| |
| popl %edi |
| leal 1(%eax), %ecx |
| |
| orl %eax, %eax |
| jnz L(strip_x_and) |
| |
| |
| movl %ebx, %ecx |
| movl %edx, %eax |
| |
| shll %cl, %eax |
| popl %ebx |
| |
| ret |
| |
| |
| ALIGN(8) |
| L(modexact): |
| C eax |
| C ebx src ptr |
| C ecx size or size-1 |
| C edx |
| C esi y odd |
| C edi common twos |
| C ebp |
| |
| movl PARAM_SIZE, %eax |
| pushl %esi FRAME_pushl() |
| |
| pushl %eax FRAME_pushl() |
| |
| pushl %ebx FRAME_pushl() |
| |
| ifdef(`PIC',` |
| nop C code alignment |
| call L(movl_eip_ebx) |
| L(here): |
| addl $_GLOBAL_OFFSET_TABLE_, %ebx |
| call GSYM_PREFIX`'mpn_modexact_1_odd@PLT |
| ',` |
| call GSYM_PREFIX`'mpn_modexact_1_odd |
| ') |
| |
| movl %esi, %edx C y odd |
| movl SAVE_ESI, %esi |
| |
| movl %edi, %ebx C common twos |
| movl SAVE_EDI, %edi |
| |
| addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp |
| orl %eax, %eax |
| |
| leal 1(%eax), %ecx |
| jnz L(strip_x_and) |
| |
| |
| movl %ebx, %ecx |
| movl %edx, %eax |
| |
| shll %cl, %eax |
| popl %ebx |
| |
| ret |
| |
| |
| ifdef(`PIC',` |
| L(movl_eip_ebx): |
| movl (%esp), %ebx |
| ret_internal |
| ') |
| |
| EPILOGUE() |