| dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. |
| |
| dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. |
| dnl |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or |
| dnl modify it under the terms of the GNU Lesser General Public License as |
| dnl published by the Free Software Foundation; either version 3 of the |
| dnl License, or (at your option) any later version. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, |
| dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
| dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| dnl Lesser General Public License for more details. |
| dnl |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C K7: 1.64 cycles/limb (at 16 limbs/loop). |
| |
| |
| |
| dnl K7: UNROLL_COUNT cycles/limb |
| dnl 8 1.9 |
| dnl 16 1.64 |
| dnl 32 1.7 |
| dnl 64 2.0 |
| dnl Maximum possible with the current code is 64. |
| |
| deflit(UNROLL_COUNT, 16) |
| |
| |
| ifdef(`OPERATION_add_n', ` |
| define(M4_inst, adcl) |
| define(M4_function_n, mpn_add_n) |
| define(M4_function_nc, mpn_add_nc) |
| define(M4_description, add) |
| ',`ifdef(`OPERATION_sub_n', ` |
| define(M4_inst, sbbl) |
| define(M4_function_n, mpn_sub_n) |
| define(M4_function_nc, mpn_sub_nc) |
| define(M4_description, subtract) |
| ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n |
| ')')') |
| |
| MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) |
| |
| |
| C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, |
| C mp_size_t size); |
| C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, |
| C mp_size_t size, mp_limb_t carry); |
| C |
| C Calculate src1,size M4_description src2,size, and store the result in |
| C dst,size. The return value is the carry bit from the top of the result (1 |
| C or 0). |
| C |
| C The _nc version accepts 1 or 0 for an initial carry into the low limb of |
| C the calculation. Note values other than 1 or 0 here will lead to garbage |
| C results. |
| C |
| C This code runs at 1.64 cycles/limb, which might be the best possible with |
| C plain integer operations. Each limb is 2 loads and 1 store, any 2 of |
| C which can be done each cycle, leading to 1.5 c/l. |
| |
| dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. |
| ifdef(`PIC',` |
| deflit(UNROLL_THRESHOLD, 8) |
| ',` |
| deflit(UNROLL_THRESHOLD, 8) |
| ') |
| |
| defframe(PARAM_CARRY,20) |
| defframe(PARAM_SIZE, 16) |
| defframe(PARAM_SRC2, 12) |
| defframe(PARAM_SRC1, 8) |
| defframe(PARAM_DST, 4) |
| |
| defframe(SAVE_EBP, -4) |
| defframe(SAVE_ESI, -8) |
| defframe(SAVE_EBX, -12) |
| defframe(SAVE_EDI, -16) |
| deflit(STACK_SPACE, 16) |
| |
| TEXT |
| ALIGN(32) |
| deflit(`FRAME',0) |
| |
| PROLOGUE(M4_function_nc) |
| movl PARAM_CARRY, %eax |
| jmp L(start) |
| EPILOGUE() |
| |
| PROLOGUE(M4_function_n) |
| |
| xorl %eax, %eax C carry |
| L(start): |
| movl PARAM_SIZE, %ecx |
| subl $STACK_SPACE, %esp |
| deflit(`FRAME',STACK_SPACE) |
| |
| movl %edi, SAVE_EDI |
| movl %ebx, SAVE_EBX |
| cmpl $UNROLL_THRESHOLD, %ecx |
| |
| movl PARAM_SRC2, %edx |
| movl PARAM_SRC1, %ebx |
| jae L(unroll) |
| |
| movl PARAM_DST, %edi |
| leal (%ebx,%ecx,4), %ebx |
| leal (%edx,%ecx,4), %edx |
| |
| leal (%edi,%ecx,4), %edi |
| negl %ecx |
| shrl %eax |
| |
| C This loop in in a single 16 byte code block already, so no |
| C alignment necessary. |
| L(simple): |
| C eax scratch |
| C ebx src1 |
| C ecx counter |
| C edx src2 |
| C esi |
| C edi dst |
| C ebp |
| |
| movl (%ebx,%ecx,4), %eax |
| M4_inst (%edx,%ecx,4), %eax |
| movl %eax, (%edi,%ecx,4) |
| incl %ecx |
| jnz L(simple) |
| |
| movl $0, %eax |
| movl SAVE_EDI, %edi |
| |
| movl SAVE_EBX, %ebx |
| setc %al |
| addl $STACK_SPACE, %esp |
| |
| ret |
| |
| |
| C ----------------------------------------------------------------------------- |
| C This is at 0x55, close enough to aligned. |
| L(unroll): |
| deflit(`FRAME',STACK_SPACE) |
| movl %ebp, SAVE_EBP |
| andl $-2, %ecx C size low bit masked out |
| andl $1, PARAM_SIZE C size low bit kept |
| |
| movl %ecx, %edi |
| decl %ecx |
| movl PARAM_DST, %ebp |
| |
| shrl $UNROLL_LOG2, %ecx |
| negl %edi |
| movl %esi, SAVE_ESI |
| |
| andl $UNROLL_MASK, %edi |
| |
| ifdef(`PIC',` |
| call L(pic_calc) |
| L(here): |
| ',` |
| leal L(entry) (%edi,%edi,8), %esi C 9 bytes per |
| ') |
| negl %edi |
| shrl %eax |
| |
| leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx |
| leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx |
| leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi |
| |
| jmp *%esi |
| |
| |
| ifdef(`PIC',` |
| L(pic_calc): |
| C See mpn/x86/README about old gas bugs |
| leal (%edi,%edi,8), %esi |
| addl $L(entry)-L(here), %esi |
| addl (%esp), %esi |
| ret_internal |
| ') |
| |
| |
| C ----------------------------------------------------------------------------- |
| ALIGN(32) |
| L(top): |
| C eax zero |
| C ebx src1 |
| C ecx counter |
| C edx src2 |
| C esi scratch (was computed jump) |
| C edi dst |
| C ebp scratch |
| |
| leal UNROLL_BYTES(%edx), %edx |
| |
| L(entry): |
| deflit(CHUNK_COUNT, 2) |
| forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` |
| deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) |
| deflit(`disp1', eval(disp0 + 4)) |
| |
| Zdisp( movl, disp0,(%ebx), %esi) |
| movl disp1(%ebx), %ebp |
| Zdisp( M4_inst,disp0,(%edx), %esi) |
| Zdisp( movl, %esi, disp0,(%edi)) |
| M4_inst disp1(%edx), %ebp |
| movl %ebp, disp1(%edi) |
| ') |
| |
| decl %ecx |
| leal UNROLL_BYTES(%ebx), %ebx |
| leal UNROLL_BYTES(%edi), %edi |
| jns L(top) |
| |
| |
| mov PARAM_SIZE, %esi |
| movl SAVE_EBP, %ebp |
| movl $0, %eax |
| |
| decl %esi |
| js L(even) |
| |
| movl (%ebx), %ecx |
| M4_inst UNROLL_BYTES(%edx), %ecx |
| movl %ecx, (%edi) |
| L(even): |
| |
| movl SAVE_EDI, %edi |
| movl SAVE_EBX, %ebx |
| setc %al |
| |
| movl SAVE_ESI, %esi |
| addl $STACK_SPACE, %esp |
| |
| ret |
| |
| EPILOGUE() |