| dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result |
| dnl in a third limb vector. |
| |
| dnl Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software |
| dnl Foundation, Inc. |
| dnl |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or |
| dnl modify it under the terms of the GNU Lesser General Public License as |
| dnl published by the Free Software Foundation; either version 3 of the |
| dnl License, or (at your option) any later version. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, |
| dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
| dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| dnl Lesser General Public License for more details. |
| dnl |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C cycles/crossproduct |
| C P5: 15 |
| C P6: 7.5 |
| C K6: 12.5 |
| C K7: 5.5 |
| C P4: 24 |
| |
| |
| C void mpn_mul_basecase (mp_ptr wp, |
| C mp_srcptr xp, mp_size_t xsize, |
| C mp_srcptr yp, mp_size_t ysize); |
| C |
| C This was written in haste, since the Pentium-optimized code that was used |
| C for all x86 machines was slow on the Pentium II. This code would benefit |
| C from some cleanup. |
| C |
| C To shave off some percentage of the run-time, one should make 4 variants |
| C of the L(outer) loop, for the four different outcomes of xsize mod 4. That |
| C would avoid L(oop0) altogether. Code expansion would be > 4-fold for that |
| C part of the function, but since it is not very large, that would be |
| C acceptable. |
| C |
| C The mul loop (at L(oopM)) might need some tweaking. Its current speed is |
| C unknown. |
| |
| C Incoming arguments. The offsets run in 4-byte steps from 4 to 20, in the |
| C same order as the C prototype: wp, xp, xsize, yp, ysize. |
| C NOTE(review): defframe itself comes from the included m4 configuration; |
| C presumably it yields an %esp-relative operand adjusted by FRAME -- confirm. |
| defframe(PARAM_YSIZE,20) |
| defframe(PARAM_YP, 16) |
| defframe(PARAM_XSIZE,12) |
| defframe(PARAM_XP, 8) |
| defframe(PARAM_WP, 4) |
| |
| C Local slots at negative offsets: |
| C VAR_MULTIPLIER holds the y limb used as multiplier for one outer pass. |
| C VAR_COUNTER holds the number of outer passes still to run. |
| C VAR_STACK_SPACE is the byte count the prologue subtracts from %esp to |
| C reserve these two slots. |
| defframe(VAR_MULTIPLIER, -4) |
| defframe(VAR_COUNTER, -8) |
| deflit(VAR_STACK_SPACE, 8) |
| |
| TEXT |
| ALIGN(8) |
| |
| C mpn_mul_basecase (wp, xp, xsize, yp, ysize) |
| C |
| C Schoolbook multiplication: writes the xsize+ysize limb product of |
| C {xp,xsize} and {yp,ysize} to wp. |
| C |
| C Structure: |
| C 1. First pass, L(oopM): wp[0..xsize] = xp[] * yp[0]. Results are stored, |
| C not accumulated, so wp does not have to be initialised by the caller. |
| C 2. L(outer), run ysize-1 times: add xp[] * yp[i] into wp[i..i+xsize-1] and |
| C store the carry-out limb at wp[i+xsize]. L(oop0) handles the first |
| C xsize mod 4 limbs one at a time, L(oopX) the rest four per iteration. |
| C |
| C The xsize == 1 early exit assumes ysize == 1 as well and never reads ysize. |
| C NOTE(review): this presumably relies on callers passing xsize >= ysize >= 1; |
| C nothing here checks it. |
| C |
| C Register roles once past the early exit: |
| C %esi walks xp |
| C %edi walks wp |
| C %ebp yp in the first pass, second carry limb inside L(oopX) |
| C %ebx carry limb (pushed before first use, popped at L(skip)) |
| C %ecx inner loop counter |
| C %eax, %edx mull operand and result |
| |
| PROLOGUE(mpn_mul_basecase) |
| deflit(`FRAME',0) |
| |
| subl $VAR_STACK_SPACE,%esp C reserve VAR_MULTIPLIER and VAR_COUNTER |
| pushl %esi |
| pushl %ebp |
| pushl %edi |
| deflit(`FRAME',eval(VAR_STACK_SPACE+12)) |
| |
| movl PARAM_XP,%esi |
| movl PARAM_WP,%edi |
| movl PARAM_YP,%ebp |
| |
| movl (%esi),%eax C load xp[0] |
| mull (%ebp) C multiply by yp[0] |
| movl %eax,(%edi) C store to wp[0] |
| movl PARAM_XSIZE,%ecx C xsize |
| decl %ecx C If xsize = 1, ysize = 1 too |
| jz L(done) |
| |
| C %ebx is only needed on the multi-limb path, so it is saved here and not |
| C in the prologue; L(done) therefore must not pop it. |
| pushl %ebx |
| FRAME_pushl() |
| movl %edx,%ebx C high limb of xp[0]*yp[0] is the first carry |
| |
| leal 4(%esi),%esi |
| leal 4(%edi),%edi |
| |
| C First pass: xsize-1 remaining limbs of xp[] * yp[0], stored to wp. |
| L(oopM): |
| movl (%esi),%eax C load next limb at xp[j] |
| leal 4(%esi),%esi |
| mull (%ebp) |
| addl %ebx,%eax C low limb + incoming carry |
| movl %edx,%ebx |
| adcl $0,%ebx C new carry = high limb + carry flag |
| movl %eax,(%edi) |
| leal 4(%edi),%edi |
| decl %ecx |
| jnz L(oopM) |
| |
| movl %ebx,(%edi) C most significant limb of product |
| addl $4,%edi C increment wp |
| C Rewind by xsize limbs: %esi is back at xp, %edi ends up at wp+4. |
| movl PARAM_XSIZE,%eax |
| shll $2,%eax |
| subl %eax,%edi |
| subl %eax,%esi |
| |
| movl PARAM_YSIZE,%eax C ysize |
| decl %eax |
| jz L(skip) |
| movl %eax,VAR_COUNTER C outer passes remaining = ysize-1 |
| |
| L(outer): |
| movl PARAM_YP,%ebp C yp |
| addl $4,%ebp C make ebp point to next v limb |
| movl %ebp,PARAM_YP C the argument slot doubles as the running yp |
| movl (%ebp),%eax C copy y limb ... |
| movl %eax,VAR_MULTIPLIER C ... to stack slot |
| movl PARAM_XSIZE,%ecx |
| |
| xorl %ebx,%ebx C carry limb starts at zero for each pass |
| andl $3,%ecx C xsize mod 4 limbs go through L(oop0) |
| jz L(end0) |
| |
| L(oop0): |
| movl (%esi),%eax |
| mull VAR_MULTIPLIER |
| leal 4(%esi),%esi |
| addl %ebx,%eax C low limb + incoming carry |
| movl $0,%ebx C movl, not xorl: the carry flag must survive |
| adcl %ebx,%edx C high limb + carry flag |
| addl %eax,(%edi) C accumulate into wp |
| adcl %edx,%ebx C propagate carry into cylimb |
| |
| leal 4(%edi),%edi |
| decl %ecx |
| jnz L(oop0) |
| |
| L(end0): |
| movl PARAM_XSIZE,%ecx |
| shrl $2,%ecx C number of 4-limb groups left |
| jz L(endX) |
| |
| C Four limbs per iteration. %ebx and %ebp alternate as the limb being |
| C built, so each add into wp overlaps with the following mull. The |
| C movl $0 instructions sit between an addl/adcl pair and must leave the |
| C carry flag untouched. |
| ALIGN(8) |
| L(oopX): |
| movl (%esi),%eax |
| mull VAR_MULTIPLIER |
| addl %eax,%ebx C low limb + incoming carry |
| movl $0,%ebp |
| adcl %edx,%ebp C high limb + carry flag |
| |
| movl 4(%esi),%eax |
| mull VAR_MULTIPLIER |
| addl %ebx,(%edi) |
| adcl %eax,%ebp C new lo + cylimb |
| movl $0,%ebx |
| adcl %edx,%ebx |
| |
| movl 8(%esi),%eax |
| mull VAR_MULTIPLIER |
| addl %ebp,4(%edi) |
| adcl %eax,%ebx C new lo + cylimb |
| movl $0,%ebp |
| adcl %edx,%ebp |
| |
| movl 12(%esi),%eax |
| mull VAR_MULTIPLIER |
| addl %ebx,8(%edi) |
| adcl %eax,%ebp C new lo + cylimb |
| movl $0,%ebx |
| adcl %edx,%ebx |
| |
| addl %ebp,12(%edi) |
| adcl $0,%ebx C propagate carry into cylimb |
| |
| leal 16(%esi),%esi |
| leal 16(%edi),%edi |
| decl %ecx |
| jnz L(oopX) |
| |
| L(endX): |
| movl %ebx,(%edi) C carry-out limb of this pass |
| addl $4,%edi |
| |
| C we incremented wp and xp in the loop above; compensate |
| C (net effect per pass: %esi back at xp, %edi advanced by one limb) |
| movl PARAM_XSIZE,%eax |
| shll $2,%eax |
| subl %eax,%edi |
| subl %eax,%esi |
| |
| movl VAR_COUNTER,%eax |
| decl %eax |
| movl %eax,VAR_COUNTER |
| jnz L(outer) |
| |
| C Exit for the multi-limb path: four registers were pushed. |
| L(skip): |
| popl %ebx |
| popl %edi |
| popl %ebp |
| popl %esi |
| addl $VAR_STACK_SPACE,%esp C release the locals reserved in the prologue |
| ret |
| |
| C Exit for xsize == 1: %ebx was never pushed on this path. |
| L(done): |
| movl %edx,4(%edi) C store to wp[1] |
| popl %edi |
| popl %ebp |
| popl %esi |
| addl $VAR_STACK_SPACE,%esp C release the locals reserved in the prologue |
| ret |
| |
| EPILOGUE() |