dnl blob: e6fe9ee62c533d7b9fe42d4db75fa3ba000611cc [file] [log] [blame]
dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl store difference in a third limb vector.
dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C UltraSPARC 1&2: 4
C UltraSPARC 3: 4.5
C Compute carry-out from the most significant bits of u,v, and r, where
C r=u-v-carry_in, using logic operations.
C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4 insn
C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...
C INPUT PARAMETERS
C The first four names are the incoming argument registers as seen after the
C register-window save in the prologue.
define(`rp',`%i0') dnl result pointer; difference limbs are stored through it
define(`up',`%i1') dnl pointer to the limbs that are subtracted from
define(`vp',`%i2') dnl pointer to the limbs that are subtracted
define(`n',`%i3') dnl limb count
C Working registers: uK and vK hold limb K of the current group of four
C limbs loaded through up and vp respectively.
define(`u0',`%l0')
define(`u1',`%l2')
define(`u2',`%l4')
define(`u3',`%l6')
define(`v0',`%l1')
define(`v1',`%l3')
define(`v2',`%l5')
define(`v3',`%l7')
define(`cy',`%i4') dnl borrow passed from one limb to the next; cleared on entry
define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
ASM_START()
C %g2 and %g3 are used as temporaries by the borrow computation in mpn_sub_n.
C NOTE(review): REGISTER presumably emits the assembler directive declaring
C them as scratch registers -- confirm against its definition in config.m4.
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
PROLOGUE(mpn_sub_n)
C mpn_sub_n -- subtract the n limbs at vp from the n limbs at up, store the n
C difference limbs at rp, and return the borrow out of the top limb (0 or 1).
C
C In:	rp, up, vp, n (see INPUT PARAMETERS above); n must be > 0.
C Out:	%i0 = final borrow, handed to the caller by ret/restore.
C Uses:	%l0-%l7, %i4, %g1-%g4, %f0, %f2, %f4, integer condition codes.
C
C Per limb, with r = u - v - cy, the next borrow is the top bit of
C	(r | ~(u | ~v)) & ~(u & ~v)
C which is computed with orn/orn/andn/andn/srlx, so no carry flag is needed.
C
C n is a 64-bit count, so every branch that tests it uses %xcc.  Testing %icc
C would look only at the low 32 bits: counts with bit 31 set would miss the
C unrolled loop, and counts whose low 32 bits reach zero early would stop the
C one-limb loop before all limbs are processed.
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%xcc,.Loop0	C fewer than 4 limbs: one-limb loop only
	mov	0,cy		C (delay slot, always executed) no incoming borrow

C Load the first group of four limbs and start on limb 0.
	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n		C n = limbs left after this group and one more
	sub	u0,v0,%g1	C main sub
	sub	%g1,cy,%g4	C carry sub
	orn	u0,v0,%g2
	bl,pn	%xcc,.Lend4567	C no second full group: just finish this one
	fanop
	b,a	.Loop		C annulled: no delay slot instruction is executed

	.align	16
C START MAIN LOOP
C On entry to each iteration: %g4 = difference limb 0 of the current group
C (not yet stored), %g2 = u0 | ~v0, and u0-u3/v0-v3 hold the current group.
C Each iteration finishes the current group while loading the next one; a
C register is reloaded only after its last use for the current group.
C Each group of instructions between the separators holds four instructions.
.Loop:	orn	%g4,%g2,%g2
	andn	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy	C borrow out of limb 0
	sub	u1,v1,%g1
	stx	%g4,[rp+0]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u1,v1,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy	C borrow out of limb 1
	sub	u2,v2,%g1
	stx	%g4,[rp+8]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u2,v2,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp-16],v2
	add	rp,32,rp
	fanop
C --
	srlx	%g2,63,cy	C borrow out of limb 2
	sub	u3,v3,%g1
	stx	%g4,[rp-16]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u3,v3,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	andn	%g2,%g3,%g2
	subcc	n,4,n
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy	C borrow out of limb 3
	sub	u0,v0,%g1	C start limb 0 of the group just loaded
	stx	%g4,[rp-8]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u0,v0,%g2
	bge,pt	%xcc,.Loop	C another full group remains to be loaded
	fanop
C END MAIN LOOP

C Wind down: finish the group already held in registers, loading nothing more.
.Lend4567:
	orn	%g4,%g2,%g2
	andn	u0,v0,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g4,[rp+0]
	sub	%g1,cy,%g4
	orn	u1,v1,%g2
	orn	%g4,%g2,%g2
	andn	u1,v1,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g4,[rp+8]
	sub	%g1,cy,%g4
	orn	u2,v2,%g2
	orn	%g4,%g2,%g2
	andn	u2,v2,%g3
	andn	%g2,%g3,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g4,[rp-16]
	sub	%g1,cy,%g4
	orn	u3,v3,%g2
	orn	%g4,%g2,%g2
	andn	u3,v3,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	stx	%g4,[rp-8]

	addcc	n,4,n		C undo the look-ahead bias: n = limbs still to do
	bz,pn	%xcc,.Lret
	fanop

C One limb per iteration; handles n < 4 and the 1-3 limbs left over above.
.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n
	sub	u0,v0,%g1
	orn	u0,v0,%g2
	sub	%g1,cy,%g4
	andn	u0,v0,%g3
	orn	%g4,%g2,%g2
	stx	%g4,[rp-8]
	andn	%g2,%g3,%g2
	bnz,pt	%xcc,.Loop0	C flags are still those of the subcc above
	srlx	%g2,63,cy	C (delay slot, always executed) borrow out

.Lret:	mov	cy,%i0		C return value: final borrow
	ret
	restore
EPILOGUE(mpn_sub_n)