| dnl x86 generic mpn_sqr_basecase -- square an mpn number. |
| |
| dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. |
| dnl |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| dnl |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| |
| include(`../config.m4') |
| |
| |
| C cycles/crossproduct cycles/triangleproduct |
| C P5: |
| C P6: |
| C K6: |
| C K7: |
| C P4: |
| |
| |
| C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); |
| C |
| C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a |
| C lot of function call overheads are avoided, especially when the size is |
| C small. |
| C |
| C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the |
| C code size to do so here. |
| C |
| C Enhancements: |
| C |
| C The addmul loop here is also not unrolled like aorsmul_1.asm and |
| C mul_basecase.asm are. Perhaps it should be done. It'd add to the |
| C complexity, but if it's worth doing in the other places then it should be |
| C worthwhile here. |
| C |
| C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6) |
| C might be worth considering. That'd add quite a bit to the code size, but |
| C only as much as is used would be dragged into L1 cache. |
| |
| defframe(PARAM_SIZE,12) |
| defframe(PARAM_SRC, 8) |
| defframe(PARAM_DST, 4) |
| |
| TEXT |
| ALIGN(8) |
| PROLOGUE(mpn_sqr_basecase) |
| deflit(`FRAME',0) |
| |
| movl PARAM_SIZE, %edx |
| |
| movl PARAM_SRC, %eax |
| |
| cmpl $2, %edx |
| movl PARAM_DST, %ecx |
| |
| je L(two_limbs) |
| ja L(three_or_more) |
| |
| |
| C ----------------------------------------------------------------------------- |
| C one limb only |
| C eax src |
| C ebx |
| C ecx dst |
| C edx |
| |
| movl (%eax), %eax |
| mull %eax |
| movl %eax, (%ecx) |
| movl %edx, 4(%ecx) |
| ret |
| |
| |
| C ----------------------------------------------------------------------------- |
| ALIGN(8) |
| L(two_limbs): |
| C eax src |
| C ebx |
| C ecx dst |
| C edx |
| |
| pushl %ebx |
| pushl %ebp |
| |
| movl %eax, %ebx |
| movl (%eax), %eax |
| |
| mull %eax C src[0]^2 |
| |
| pushl %esi |
| pushl %edi |
| |
| movl %edx, %esi C dst[1] |
| movl %eax, (%ecx) C dst[0] |
| |
| movl 4(%ebx), %eax |
| mull %eax C src[1]^2 |
| |
| movl %eax, %edi C dst[2] |
| movl %edx, %ebp C dst[3] |
| |
| movl (%ebx), %eax |
| mull 4(%ebx) C src[0]*src[1] |
| |
| addl %eax, %esi |
| |
| adcl %edx, %edi |
| |
| adcl $0, %ebp |
| addl %esi, %eax |
| |
| adcl %edi, %edx |
| movl %eax, 4(%ecx) |
| |
| adcl $0, %ebp |
| |
| movl %edx, 8(%ecx) |
| movl %ebp, 12(%ecx) |
| |
| popl %edi |
| popl %esi |
| |
| popl %ebp |
| popl %ebx |
| |
| ret |
| |
| |
| C ----------------------------------------------------------------------------- |
| ALIGN(8) |
| L(three_or_more): |
| deflit(`FRAME',0) |
| C eax src |
| C ebx |
| C ecx dst |
| C edx size |
| |
| pushl %ebx FRAME_pushl() |
| pushl %edi FRAME_pushl() |
| |
| pushl %esi FRAME_pushl() |
| pushl %ebp FRAME_pushl() |
| |
| leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1 |
| leal (%eax,%edx,4), %esi C &src[size] |
| |
| C First multiply src[0]*src[1..size-1] and store at dst[1..size]. |
| |
| movl (%eax), %ebp C src[0], multiplier |
| movl %edx, %ecx |
| |
| negl %ecx C -size |
| xorl %ebx, %ebx C clear carry limb |
| |
| incl %ecx C -(size-1) |
| |
| L(mul1): |
| C eax scratch |
| C ebx carry |
| C ecx counter, limbs, negative |
| C edx scratch |
| C esi &src[size] |
| C edi &dst[size] |
| C ebp multiplier |
| |
| movl (%esi,%ecx,4), %eax |
| mull %ebp |
| addl %eax, %ebx |
| adcl $0, %edx |
| movl %ebx, (%edi,%ecx,4) |
| movl %edx, %ebx |
| incl %ecx |
| jnz L(mul1) |
| |
| movl %ebx, (%edi) |
| |
| |
| C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for |
| C n=1..size-2. |
| C |
| C The last products src[size-2]*src[size-1], which is the end corner |
| C of the product triangle, is handled separately at the end to save |
| C looping overhead. If size is 3 then it's only this that needs to |
| C be done. |
| C |
| C In the outer loop %esi is a constant, and %edi just advances by 1 |
| C limb each time. The size of the operation decreases by 1 limb |
| C each time. |
| |
| C eax |
| C ebx carry (needing carry flag added) |
| C ecx |
| C edx |
| C esi &src[size] |
| C edi &dst[size] |
| C ebp |
| |
| movl PARAM_SIZE, %ecx |
| subl $3, %ecx |
| jz L(corner) |
| |
| negl %ecx |
| |
| dnl re-use parameter space |
| define(VAR_OUTER,`PARAM_DST') |
| |
| L(outer): |
| C eax |
| C ebx |
| C ecx |
| C edx outer loop counter, -(size-3) to -1 |
| C esi &src[size] |
| C edi dst, pointing at stored carry limb of previous loop |
| C ebp |
| |
| movl %ecx, VAR_OUTER |
| addl $4, %edi C advance dst end |
| |
| movl -8(%esi,%ecx,4), %ebp C next multiplier |
| subl $1, %ecx |
| |
| xorl %ebx, %ebx C initial carry limb |
| |
| L(inner): |
| C eax scratch |
| C ebx carry (needing carry flag added) |
| C ecx counter, -n-1 to -1 |
| C edx scratch |
| C esi &src[size] |
| C edi dst end of this addmul |
| C ebp multiplier |
| |
| movl (%esi,%ecx,4), %eax |
| mull %ebp |
| addl %ebx, %eax |
| adcl $0, %edx |
| addl %eax, (%edi,%ecx,4) |
| adcl $0, %edx |
| movl %edx, %ebx |
| addl $1, %ecx |
| jl L(inner) |
| |
| |
| movl %ebx, (%edi) |
| movl VAR_OUTER, %ecx |
| incl %ecx |
| jnz L(outer) |
| |
| |
| L(corner): |
| C esi &src[size] |
| C edi &dst[2*size-3] |
| |
| movl -4(%esi), %eax |
| mull -8(%esi) C src[size-1]*src[size-2] |
| addl %eax, 0(%edi) |
| adcl $0, %edx |
| movl %edx, 4(%edi) C dst high limb |
| |
| |
| C ----------------------------------------------------------------------------- |
| C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. |
| |
| movl PARAM_SIZE, %eax |
| negl %eax |
| addl $1, %eax C -(size-1) and clear carry |
| |
| L(lshift): |
| C eax counter, negative |
| C ebx next limb |
| C ecx |
| C edx |
| C esi |
| C edi &dst[2*size-4] |
| C ebp |
| |
| rcll 8(%edi,%eax,8) |
| rcll 12(%edi,%eax,8) |
| incl %eax |
| jnz L(lshift) |
| |
| |
| adcl %eax, %eax C high bit out |
| movl %eax, 8(%edi) C dst most significant limb |
| |
| |
| C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., |
| C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the |
| C low limb of src[0]^2. |
| |
| movl PARAM_SRC, %esi |
| movl (%esi), %eax C src[0] |
| mull %eax C src[0]^2 |
| |
| movl PARAM_SIZE, %ecx |
| leal (%esi,%ecx,4), %esi C src end |
| |
| negl %ecx C -size |
| movl %edx, %ebx C initial carry |
| |
| movl %eax, 12(%edi,%ecx,8) C dst[0] |
| incl %ecx C -(size-1) |
| |
| L(diag): |
| C eax scratch (low product) |
| C ebx carry limb |
| C ecx counter, -(size-1) to -1 |
| C edx scratch (high product) |
| C esi &src[size] |
| C edi &dst[2*size-3] |
| C ebp scratch (fetched dst limbs) |
| |
| movl (%esi,%ecx,4), %eax |
| mull %eax |
| |
| addl %ebx, 8(%edi,%ecx,8) |
| movl %edx, %ebx |
| |
| adcl %eax, 12(%edi,%ecx,8) |
| adcl $0, %ebx |
| |
| incl %ecx |
| jnz L(diag) |
| |
| |
| addl %ebx, 8(%edi) C dst most significant limb |
| |
| popl %ebp |
| popl %esi |
| |
| popl %edi |
| popl %ebx |
| |
| ret |
| |
| EPILOGUE() |