| dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1. |
| |
| dnl Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C EV4: 42 |
| C EV5: 18 |
| C EV6: 3.5 |
| |
| C INPUT PARAMETERS |
| dnl rp: limb vector that is loaded, combined with the product and stored back |
| dnl up: limb vector that is only loaded and multiplied by v0 |
| dnl n: limb count; note the code below refers to it directly as r18 |
| dnl v0: the single limb multiplier used by every mulq/umulh |
| define(`rp', `r16') |
| define(`up', `r17') |
| define(`n', `r18') |
| define(`v0', `r19') |
| |
| dnl This code was written in cooperation with ev6 pipeline expert Steve Root. |
| |
| dnl The stores can issue a cycle late so we have paired no-op's to 'catch' |
| dnl them, so that further disturbance to the schedule is damped. |
| |
| dnl We couldn't pair the loads, because the entangled schedule of the carries |
| dnl has to happen on one side {0} of the machine. |
| |
| dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. |
| dnl The lockup on U0 means that any stall can't be recovered from. Consider a |
| dnl ldq in L1, say that load gets stalled because it collides with a fill from |
| dnl the b_cache. On the next cycle, this load gets priority. It first looks |
| dnl at L0, and goes there. The instruction we intended for L0 gets to look at |
| dnl L1, which is NOT where we want it. It either stalls 1, because it can't |
| dnl go in L0, or goes there, and causes a further instruction to stall. |
| |
| dnl So for b_cache, we're likely going to want to put one or more cycles back |
| dnl into the code! And, of course, put in lds prefetch for the rp[] operand. |
| dnl At a place where we have an mt followed by a bookkeeping, put the |
| dnl bookkeeping in upper, and the prefetch into lower. |
| |
| dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd |
| dnl like not to have an ldq or an stq to precede a conditional branch in a |
| dnl quadpack. The conditional branch moves the retire pointer one cycle |
| dnl later. |
| |
| dnl Select the operation specific pieces according to which OPERATION_xxx |
| dnl symbol is defined. |
| dnl ADDSUB is the instruction combining a limb with a product limb (addq for |
| dnl addmul_1, subq for submul_1). |
| dnl CMPCY(a,b) expands to a cmpult whose operand order depends on the |
| dnl operation; the destination register is written after the macro call. |
| dnl With a = first input of the preceding ADDSUB and b = its result: |
| dnl addmul_1: cmpult b,a gives 1 when the sum wrapped (carry out) |
| dnl submul_1: cmpult a,b gives 1 when the difference wrapped (borrow out) |
| dnl func is the entry point name handed to PROLOGUE. |
| ifdef(`OPERATION_addmul_1',` |
| define(`ADDSUB', `addq') |
| define(`CMPCY', `cmpult $2,$1') |
| define(`func', `mpn_addmul_1') |
| ') |
| ifdef(`OPERATION_submul_1',` |
| define(`ADDSUB', `subq') |
| define(`CMPCY', `cmpult $1,$2') |
| define(`func', `mpn_submul_1') |
| ') |
| |
| dnl NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it |
| dnl announces the functions this multi-function source can provide - confirm. |
| MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) |
| |
| ASM_START() |
| dnl func: for each of the n limbs, multiply up[i] by v0 and ADDSUB the |
| dnl product into rp[i], propagating the carry (or borrow) limb upwards. |
| dnl The final carry limb is left in r0 when the function returns. |
| dnl |
| dnl Structure: |
| dnl - Dispatch on n mod 8. Odd residues first handle one limb on their own |
| dnl and then fall into the code for the next lower even residue. |
| dnl - The even residue feed-in blocks prime the software pipeline (first |
| dnl products and rp[] loads), bias up/rp, and enter the 8 limb unrolled |
| dnl loop at $Loop, $ent2, $ent0 or $ent6. |
| dnl - $Lend followed by L(x) drains the last four limbs; $n23 followed by |
| dnl L(x) is the short path taken when only two limbs remain at feed-in. |
| dnl |
| dnl Register use inside the loop, as read from the code below: |
| dnl r0, r1 limbs loaded from up[] |
| dnl r4, r5 limbs loaded from rp[] |
| dnl r2, r7, r25, r28 low product limbs (mulq), then lo + acc sums |
| dnl r6, r8, r24, r3 high product limbs (umulh) plus incoming carries |
| dnl r20, r21 carry bits produced by CMPCY |
| dnl r22, r23 finished result limbs waiting to be stored |
| dnl r18 loop counter, starts at n - 9, stepped by -8 at $ent2 |
| PROLOGUE(func) |
| ldq r3, 0(up) C r3 = up[0] |
| and r18, 7, r20 C r20 = n mod 8 |
| lda r18, -9(r18) C r18 = n - 9 |
| cmpeq r20, 1, r21 C |
| beq r21, $L1 C not 1 mod 8: zero carries and dispatch |
| |
| dnl n = 1 mod 8: do the first limb here. Return straight away when that |
| dnl was the only limb, else continue at $ent1 with the carry limb in r0. |
| $1mod8: ldq r5, 0(rp) C |
| mulq v0, r3, r7 C |
| umulh v0, r3, r8 C |
| ADDSUB r5, r7, r23 C |
| CMPCY( r5, r23), r20 C |
| addq r8, r20, r0 C r0 = high limb + carry |
| stq r23, 0(rp) C |
| bge r18, $ent1 C n >= 9: more limbs to do |
| ret r31, (r26), 1 C |
| |
| dnl Clear both carry-in registers, then branch on the remaining residues. |
| dnl Anything that is not 2..7 here is 0 mod 8. |
| $L1: lda r8, 0(r31) C zero carry reg |
| lda r24, 0(r31) C zero carry reg |
| cmpeq r20, 2, r21 C |
| bne r21, $2mod8 C |
| cmpeq r20, 3, r21 C |
| bne r21, $3mod8 C |
| cmpeq r20, 4, r21 C |
| bne r21, $4mod8 C |
| cmpeq r20, 5, r21 C |
| bne r21, $5mod8 C |
| cmpeq r20, 6, r21 C |
| bne r21, $6mod8 C |
| cmpeq r20, 7, r21 C |
| beq r21, $0mod8 C |
| |
| dnl n = 7 mod 8: one limb by itself, carry limb left in r24, pointers |
| dnl advanced by one limb, then fall into the 6 mod 8 feed-in. |
| $7mod8: ldq r5, 0(rp) C |
| lda up, 8(up) C |
| mulq v0, r3, r7 C |
| umulh v0, r3, r24 C |
| ADDSUB r5, r7, r23 C |
| CMPCY( r5, r23), r20 C |
| addq r24, r20, r24 C |
| stq r23, 0(rp) C |
| lda rp, 8(rp) C |
| ldq r3, 0(up) C |
| dnl n = 6 mod 8 feed-in: prime the pipeline and bias up by +48 and rp by |
| dnl -32 so that the offsets used from $ent6 onwards line up. |
| $6mod8: ldq r1, 8(up) C |
| mulq v0, r3, r25 C |
| umulh v0, r3, r3 C |
| mulq v0, r1, r28 C |
| ldq r0, 16(up) C |
| ldq r4, 0(rp) C |
| umulh v0, r1, r8 C |
| ldq r1, 24(up) C |
| lda up, 48(up) C L1 bookkeeping |
| mulq v0, r0, r2 C |
| ldq r5, 8(rp) C |
| lda rp, -32(rp) C L1 bookkeeping |
| umulh v0, r0, r6 C |
| ADDSUB r4, r25, r25 C lo + acc |
| mulq v0, r1, r7 C |
| br r31, $ent6 C |
| |
| dnl Continuation of the 1 mod 8 case: step past the limb already done and |
| dnl move its carry limb from r0 into carry register r8. |
| $ent1: lda up, 8(up) C |
| lda rp, 8(rp) C |
| lda r8, 0(r0) C r8 = carry from first limb |
| ldq r3, 0(up) C |
| dnl n = 0 mod 8 feed-in: prime the pipeline, bias rp by -16 (up is left |
| dnl alone) and enter the loop at $ent0. |
| $0mod8: ldq r1, 8(up) C |
| mulq v0, r3, r2 C |
| umulh v0, r3, r6 C |
| mulq v0, r1, r7 C |
| ldq r0, 16(up) C |
| ldq r4, 0(rp) C |
| umulh v0, r1, r24 C |
| ldq r1, 24(up) C |
| mulq v0, r0, r25 C |
| ldq r5, 8(rp) C |
| umulh v0, r0, r3 C |
| ADDSUB r4, r2, r2 C lo + acc |
| mulq v0, r1, r28 C |
| lda rp, -16(rp) C |
| br r31, $ent0 C |
| |
| dnl n = 3 mod 8: one limb by itself, carry limb left in r24, then fall |
| dnl into the 2 mod 8 feed-in. |
| $3mod8: ldq r5, 0(rp) C |
| lda up, 8(up) C |
| mulq v0, r3, r7 C |
| umulh v0, r3, r8 C |
| ADDSUB r5, r7, r23 C |
| CMPCY( r5, r23), r20 C |
| addq r8, r20, r24 C |
| stq r23, 0(rp) C |
| lda rp, 8(rp) C |
| ldq r3, 0(up) C |
| dnl n = 2 mod 8 feed-in. When r18 <= 0 only two limbs remain, so take the |
| dnl short $n23 path instead of loading further limbs from up[]. |
| $2mod8: ldq r1, 8(up) C |
| mulq v0, r3, r25 C |
| umulh v0, r3, r3 C |
| mulq v0, r1, r28 C |
| ble r18, $n23 C |
| ldq r0, 16(up) C |
| ldq r4, 0(rp) C |
| umulh v0, r1, r8 C |
| ldq r1, 24(up) C |
| lda up, 16(up) C L1 bookkeeping |
| mulq v0, r0, r2 C |
| ldq r5, 8(rp) C |
| lda rp, 0(rp) C L1 bookkeeping |
| umulh v0, r0, r6 C |
| ADDSUB r4, r25, r25 C lo + acc |
| mulq v0, r1, r7 C |
| br r31, $ent2 C |
| |
| dnl n = 5 mod 8: one limb by itself, carry limb left in r8, then fall |
| dnl into the 4 mod 8 feed-in. |
| $5mod8: ldq r5, 0(rp) C |
| lda up, 8(up) C |
| mulq v0, r3, r7 C |
| umulh v0, r3, r24 C |
| ADDSUB r5, r7, r23 C |
| CMPCY( r5, r23), r20 C |
| addq r24, r20, r8 C |
| stq r23, 0(rp) C |
| lda rp, 8(rp) C |
| ldq r3, 0(up) C |
| dnl n = 4 mod 8 feed-in: prime the pipeline, bias up by +32 and rp by +16, |
| dnl then either fall into the top of the loop or, when r18 <= 0, go |
| dnl straight to the wind-down code. |
| $4mod8: ldq r1, 8(up) C |
| mulq v0, r3, r2 C |
| umulh v0, r3, r6 C |
| mulq v0, r1, r7 C |
| ldq r0, 16(up) C |
| ldq r4, 0(rp) C |
| umulh v0, r1, r24 C |
| ldq r1, 24(up) C |
| lda up, 32(up) C L1 bookkeeping |
| mulq v0, r0, r25 C |
| ldq r5, 8(rp) C |
| lda rp, 16(rp) C L1 bookkeeping |
| umulh v0, r0, r3 C |
| ADDSUB r4, r2, r2 C lo + acc |
| mulq v0, r1, r28 C |
| CMPCY( r4, r2), r20 C L0 lo add => carry |
| ADDSUB r2, r8, r22 C U0 hi add => answer |
| ble r18, $Lend C |
| ALIGN(16) |
| dnl Main loop: eight limbs per iteration, written as four two-limb stages |
| dnl (loop top, $ent2, $ent0, $ent6). Each stage stores two result limbs. |
| dnl The bis r31,r31,r31 instructions are no-ops kept for issue slotting. |
| dnl r18 drops by 8 in the $ent2 stage, up advances by 64 in the $ent0 stage |
| dnl and rp advances by 64 in the $ent6 stage. |
| $Loop: |
| bis r31, r31, r31 C U1 mt |
| CMPCY( r2, r22), r21 C L0 hi add => carry |
| addq r6, r20, r6 C U0 hi mul + carry |
| ldq r0, 0(up) C |
| |
| bis r31, r31, r31 C U1 mt |
| ADDSUB r5, r7, r7 C L0 lo + acc |
| addq r6, r21, r6 C U0 hi mul + carry |
| ldq r4, 0(rp) C L1 |
| |
| umulh v0, r1, r8 C U1 |
| CMPCY( r5, r7), r20 C L0 lo add => carry |
| ADDSUB r7, r6, r23 C U0 hi add => answer |
| ldq r1, 8(up) C L1 |
| |
| mulq v0, r0, r2 C U1 |
| CMPCY( r7, r23), r21 C L0 hi add => carry |
| addq r24, r20, r24 C U0 hi mul + carry |
| ldq r5, 8(rp) C L1 |
| |
| umulh v0, r0, r6 C U1 |
| ADDSUB r4, r25, r25 C U0 lo + acc |
| stq r22, -16(rp) C L0 |
| stq r23, -8(rp) C L1 |
| |
| bis r31, r31, r31 C L0 st slosh |
| mulq v0, r1, r7 C U1 |
| bis r31, r31, r31 C L1 st slosh |
| addq r24, r21, r24 C U0 hi mul + carry |
| dnl Entry point for n = 2 mod 8 (and n = 3 mod 8 via $3mod8). |
| $ent2: |
| CMPCY( r4, r25), r20 C L0 lo add => carry |
| bis r31, r31, r31 C U1 mt |
| lda r18, -8(r18) C L1 bookkeeping |
| ADDSUB r25, r24, r22 C U0 hi add => answer |
| |
| bis r31, r31, r31 C U1 mt |
| CMPCY( r25, r22), r21 C L0 hi add => carry |
| addq r3, r20, r3 C U0 hi mul + carry |
| ldq r0, 16(up) C L1 |
| |
| bis r31, r31, r31 C U1 mt |
| ADDSUB r5, r28, r28 C L0 lo + acc |
| addq r3, r21, r3 C U0 hi mul + carry |
| ldq r4, 16(rp) C L1 |
| |
| umulh v0, r1, r24 C U1 |
| CMPCY( r5, r28), r20 C L0 lo add => carry |
| ADDSUB r28, r3, r23 C U0 hi add => answer |
| ldq r1, 24(up) C L1 |
| |
| mulq v0, r0, r25 C U1 |
| CMPCY( r28, r23), r21 C L0 hi add => carry |
| addq r8, r20, r8 C U0 hi mul + carry |
| ldq r5, 24(rp) C L1 |
| |
| umulh v0, r0, r3 C U1 |
| ADDSUB r4, r2, r2 C U0 lo + acc |
| stq r22, 0(rp) C L0 |
| stq r23, 8(rp) C L1 |
| |
| bis r31, r31, r31 C L0 st slosh |
| mulq v0, r1, r28 C U1 |
| bis r31, r31, r31 C L1 st slosh |
| addq r8, r21, r8 C U0 hi mul + carry |
| dnl Entry point for n = 0 mod 8 (and n = 1 mod 8 via $ent1). |
| $ent0: |
| CMPCY( r4, r2), r20 C L0 lo add => carry |
| bis r31, r31, r31 C U1 mt |
| lda up, 64(up) C L1 bookkeeping |
| ADDSUB r2, r8, r22 C U0 hi add => answer |
| |
| bis r31, r31, r31 C U1 mt |
| CMPCY( r2, r22), r21 C L0 hi add => carry |
| addq r6, r20, r6 C U0 hi mul + carry |
| ldq r0, -32(up) C L1 |
| |
| bis r31, r31, r31 C U1 mt |
| ADDSUB r5, r7, r7 C L0 lo + acc |
| addq r6, r21, r6 C U0 hi mul + carry |
| ldq r4, 32(rp) C L1 |
| |
| umulh v0, r1, r8 C U1 |
| CMPCY( r5, r7), r20 C L0 lo add => carry |
| ADDSUB r7, r6, r23 C U0 hi add => answer |
| ldq r1, -24(up) C L1 |
| |
| mulq v0, r0, r2 C U1 |
| CMPCY( r7, r23), r21 C L0 hi add => carry |
| addq r24, r20, r24 C U0 hi mul + carry |
| ldq r5, 40(rp) C L1 |
| |
| umulh v0, r0, r6 C U1 |
| ADDSUB r4, r25, r25 C U0 lo + acc |
| stq r22, 16(rp) C L0 |
| stq r23, 24(rp) C L1 |
| |
| bis r31, r31, r31 C L0 st slosh |
| mulq v0, r1, r7 C U1 |
| bis r31, r31, r31 C L1 st slosh |
| addq r24, r21, r24 C U0 hi mul + carry |
| dnl Entry point for n = 6 mod 8 (and n = 7 mod 8 via $7mod8). |
| $ent6: |
| CMPCY( r4, r25), r20 C L0 lo add => carry |
| bis r31, r31, r31 C U1 mt |
| lda rp, 64(rp) C L1 bookkeeping |
| ADDSUB r25, r24, r22 C U0 hi add => answer |
| |
| bis r31, r31, r31 C U1 mt |
| CMPCY( r25, r22), r21 C L0 hi add => carry |
| addq r3, r20, r3 C U0 hi mul + carry |
| ldq r0, -16(up) C L1 |
| |
| bis r31, r31, r31 C U1 mt |
| ADDSUB r5, r28, r28 C L0 lo + acc |
| addq r3, r21, r3 C U0 hi mul + carry |
| ldq r4, -16(rp) C L1 |
| |
| umulh v0, r1, r24 C U1 |
| CMPCY( r5, r28), r20 C L0 lo add => carry |
| ADDSUB r28, r3, r23 C U0 hi add => answer |
| ldq r1, -8(up) C L1 |
| |
| mulq v0, r0, r25 C U1 |
| CMPCY( r28, r23), r21 C L0 hi add => carry |
| addq r8, r20, r8 C U0 hi mul + carry |
| ldq r5, -8(rp) C L1 |
| |
| umulh v0, r0, r3 C U1 |
| ADDSUB r4, r2, r2 C U0 lo + acc |
| stq r22, -32(rp) C L0 |
| stq r23, -24(rp) C L1 |
| |
| bis r31, r31, r31 C L0 st slosh |
| mulq v0, r1, r28 C U1 |
| bis r31, r31, r31 C L1 st slosh |
| addq r8, r21, r8 C U0 hi mul + carry |
| |
| CMPCY( r4, r2), r20 C L0 lo add => carry |
| ADDSUB r2, r8, r22 C U0 hi add => answer |
| ldl r31, 256(up) C prefetch up[] |
| bgt r18, $Loop C U1 bookkeeping |
| |
| dnl Wind-down: same arithmetic as the first loop stage but with no further |
| dnl loads from up[]. Stores two limbs here, then joins L(x) for the last |
| dnl two limbs and the return. |
| $Lend: CMPCY( r2, r22), r21 C |
| addq r6, r20, r6 C |
| ADDSUB r5, r7, r7 C |
| addq r6, r21, r6 C |
| ldq r4, 0(rp) C |
| umulh v0, r1, r8 C |
| CMPCY( r5, r7), r20 C |
| ADDSUB r7, r6, r23 C |
| CMPCY(r7, r23), r21 C |
| addq r24, r20, r24 C |
| ldq r5, 8(rp) C |
| ADDSUB r4, r25, r25 C |
| stq r22, -16(rp) C |
| stq r23, -8(rp) C |
| addq r24, r21, r24 C |
| br L(x) |
| |
| ALIGN(16) |
| dnl Short path from $2mod8 when only two limbs remain (n is 2 or 3): load |
| dnl the two rp[] limbs, finish the second high product, and share the |
| dnl common tail at L(x). |
| $n23: ldq r4, 0(rp) C |
| ldq r5, 8(rp) C |
| umulh v0, r1, r8 C |
| ADDSUB r4, r25, r25 C |
| dnl Common tail: finish the last two limbs, store them at 0(rp) and 8(rp), |
| dnl and leave the final carry limb in r0. |
| L(x): CMPCY( r4, r25), r20 C |
| ADDSUB r25, r24, r22 C |
| CMPCY( r25, r22), r21 C |
| addq r3, r20, r3 C |
| ADDSUB r5, r28, r28 C |
| addq r3, r21, r3 C |
| CMPCY( r5, r28), r20 C |
| ADDSUB r28, r3, r23 C |
| CMPCY( r28, r23), r21 C |
| addq r8, r20, r8 C |
| stq r22, 0(rp) C |
| stq r23, 8(rp) C |
| addq r8, r21, r0 C r0 = final carry limb |
| ret r31, (r26), 1 C |
| EPILOGUE() |
| ASM_END() |