| dnl SPARC v9 32-bit mpn_sqr_diagonal. |
| |
| dnl Copyright 2001, 2003 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| |
| include(`../config.m4') |
| |
| C INPUT PARAMETERS |
| C rp i0 |
| C up i1 |
| C n i2 |
| |
| C This code uses a very deep software pipeline, due to the need for moving data |
| C forth and back between the integer registers and floating-point registers. |
| C |
| C A VIS variant of this code would make the pipeline less deep, since the |
| C masking now done in the integer unit could take place in the floating-point |
| C unit using the FAND instruction. It would be possible to save several cycles |
| C too. |
| C |
| C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and |
| C not much slower from the Ecache. It would perhaps be possible to shave off |
| C one cycle, but not easily. We cannot do better than 10 cycles/limb with the |
| C used instructions, since we have 10 memory operations per limb. But a VIS |
| C variant could run three cycles faster than the corresponding non-VIS code. |
| |
| C This is non-pipelined code showing the algorithm: |
| C |
| C .Loop: |
| C lduw [up+0],%g4 C 00000000hhhhllll |
| C sllx %g4,16,%g3 C 0000hhhhllll0000 |
| C or %g3,%g4,%g2 C 0000hhhhXXXXllll |
| C andn %g2,%g5,%g2 C 0000hhhh0000llll |
| C stx %g2,[%fp+80] |
| C ldd [%fp+80],%f0 |
| C fitod %f0,%f4 C hi16 |
| C fitod %f1,%f6 C lo16 |
| C ld [up+0],%f9 |
| C fxtod %f8,%f2 |
| C fmuld %f2,%f4,%f4 |
| C fmuld %f2,%f6,%f6 |
| C fdtox %f4,%f4 |
| C fdtox %f6,%f6 |
| C std %f4,[%fp-24] |
| C std %f6,[%fp-16] |
| C ldx [%fp-24],%g2 |
| C ldx [%fp-16],%g1 |
| C sllx %g2,16,%g2 |
| C add %g2,%g1,%g1 |
| C stw %g1,[rp+0] |
| C srlx %g1,32,%l0 |
| C stw %l0,[rp+4] |
| C add up,4,up |
| C subcc n,1,n |
| C bne,pt %icc,.Loop |
| C add rp,8,rp |
| |
define(`fanop',`fitod %f12,%f10') dnl  A quasi nop running in the FA pipe (keeps issue groups full without touching live registers)
| |
ASM_START()

C A 32-bit zero constant.  The prologue loads it into %f8, the high half of
C the %f8/%f9 double register, so that after each `ld [up],%f9' the pair
C holds the limb zero-extended to 64 bits, ready for fxtod.
	TEXT
	ALIGN(4)
.Lnoll:
	.word	0
| |
PROLOGUE(mpn_sqr_diagonal)
C Store up[i]^2 as a two-limb value at rp[2i],rp[2i+1] for i = 0..n-1.
C
C Method: each 32-bit limb u is replicated into both halves of a 64-bit
C word and masked to 0000hhhh0000llll; one 64-bit store plus a ldd then
C delivers the 16-bit halves h and l to the FPU, where two fitod yield
C them as doubles.  The limb u itself reaches the FPU zero-extended via
C %f8/%f9 (%f8 preloaded with the zero word at .Lnoll) and fxtod.  Then
C u^2 = (u*h)<<16 + u*l; both products fit in 48 bits, hence are exact in
C double precision, and the <<16-and-add recombination is done back in the
C integer unit.
C
C Data is staged through the stack frame: limb operands alternate between
C the 8-byte slots [%fp+80] and [%fp+72]; product pairs alternate between
C [%fp-24]/[%fp-16] and [%fp-40]/[%fp-32].  The software pipeline is
C entered through one of five peeled prologues selected on n, and drained
C through the tail chain .Ltail -> .L4 -> .L3 -> .L2 -> .L1; registers
C %l3..%l6 record which staging buffers the tail must consume (they depend
C on whether the loop was left from its first or second half).
	save	%sp,-256,%sp

C Load the zero word at .Lnoll into %f8 (high half of the %f8/%f9 pair).
ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lpc-.Lpc+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5	C mask for clearing the XXXX field
	add	%i1,-8,%i1		C bias up so that [%i1+8] is the next limb

C Integer stage for limb 0.
	lduw	[%i1+8],%g4		C 00000000hhhhllll
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll (delay slot)

C n = 1: compute the single product non-pipelined, finish at .L1.
	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9		C u -> low half of %f8/%f9
	ldd	[%fp+80],%f0		C %f0 = h, %f1 = l (as 32-bit ints)
	fxtod	%f8,%f2			C u as double
	fitod	%f0,%f4			C hi16
	fitod	%f1,%f6			C lo16
	fmuld	%f2,%f4,%f4		C p16 = u * h
	fmuld	%f2,%f6,%f6		C p0  = u * l
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]

C Point the tail at the buffers just written.
	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6		C (delay slot)

.L_grt_1:
C n >= 2: start the integer stage of limb 1 while limb 0's masked word goes out.
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4		C 00000000hhhhllll
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll (delay slot)

C n = 2: finish both products with a short two-deep pipeline, then .L2/.L1.
	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4

	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6		C (delay slot)

.L_grt_2:
C n >= 3: third peeled stage; FP conversion of limb 0 overlaps limb 2's fetch.
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4		C 00000000hhhhllll
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll (delay slot)

C n = 3: drain through .L3/.L2/.L1.
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6		C (delay slot)

.L_grt_3:
C n >= 4: fourth peeled stage; first product pair is stored this round.
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4		C 00000000hhhhllll
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll (delay slot)

C n = 4: drain through .L4/.L3/.L2/.L1.
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6		C (delay slot)

.L_grt_4:
C n >= 5: the pipeline is now full; fall into the main loop (or straight to
C .L5 when exactly one more integer stage remains).
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4		C 00000000hhhhllll
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll (delay slot)

	b,a	.Loop

C Main loop: two limbs per full iteration, one per half.  Instructions are
C hand-grouped for 4-way issue; each `C ---' marks a dispatch-group boundary
C and the nop/fanop padding keeps the groups aligned.  The two halves are
C identical except that they use opposite staging buffers ([%fp+80]/[%fp-24]
C vs [%fp+72]/[%fp-40]), implementing the double buffering.
	.align	16
C --- LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2	C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1	C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	add	%i1,4,%i1	C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3	C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2	C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C --- LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2	C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1	C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	add	%i1,4,%i1	C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3	C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2	C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C --- LOOP END

C Pipeline drain.  .L5 is reached when the count ran out before entering the
C loop, .Lend when leaving from the loop's first half; they select opposite
C staging buffers, then share the drain code below.
.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6	C (delay slot)

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6
C Four in-flight limbs remain; each block below retires one product pair
C while pushing the remaining limbs one stage further.
.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2	C p16
	ldx	[%l4+8],%g1	C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	add	%i1,4,%i1	C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2	C p16
	ldx	[%l6+8],%g1	C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2	C p16
	ldx	[%l4+8],%g1	C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]

C Only integer work remains for the last two limbs: recombine and store.
	ldx	[%l6],%g2	C p16
	ldx	[%l6+8],%g1	C p0
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2	C p16
	ldx	[%l4+8],%g1	C p0
	sllx	%g2,16,%g2	C align p16
	add	%i0,8,%i0	C res_ptr++
	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0	C (delay slot) mpn_sqr_diagonal returns no value

EPILOGUE(mpn_sqr_diagonal)