dnl  blob: e345eef01feba3fbd62e802ad173dafcb51aa04a
dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C                    16-byte coaligned    unaligned
C                       cycles/limb       cycles/limb
C 7400,7410 (G4):          0.5               0.64
C 744x,745x (G4+):         0.75              0.82
C 970 (G5):                0.78              1.02    (64-bit limbs)
C STATUS
C * Works for all sizes and alignments.
C TODO
C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling
C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C c/l for 970.
C * Consider using VMX instructions also for head and tail, by using some
C read-modify-write tricks.
C * The VMX code is used from the smallest sizes it handles, but measurements
C show a large speed bump at the cutoff points. Small copying (perhaps
C using some read-modify-write technique) should be optimized.
C * Make a mpn_com_n based on this code.
C Limb size in bytes, and the number of limbs that fit in 16 bytes (one
C vector register) and in 32 bytes (two vector registers).
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
C LIMB32 emits its argument only when GMP_LIMB_BITS is 32, and LIMB64 only
C when it is anything else; the other macro of the pair expands to nothing.
C This lets the code below list the 32-bit and 64-bit form of an instruction
C on adjacent lines and have exactly one of them survive.
ifelse(GMP_LIMB_BITS,32,`
define(`LIMB32',` $1')
define(`LIMB64',`')
',`
define(`LIMB32',`')
define(`LIMB64',` $1')
')
C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n', `r5')
C Not an input: vector register holding the permute control vector that is
C set up by lvsl and consumed by vperm in the unaligned-source path.
define(`us', `v4')
ASM_START()
C mpn_copyd (rp, up, n)
C
C Copy n limbs from up to rp, working from the highest address downwards.
C
C In:      rp (r3) = destination, up (r4) = source, n (r5) = limb count.
C Scratch: r0, r7, r10, r12, ctr, cr0, cr7; the VMX path also uses v0-v4.
C          rp, up and n are themselves modified.
C VRSAVE (SPR 256) is changed in the VMX path and restored before returning.
C
C Operands below the size threshold are copied one limb at a time.  Larger
C operands are copied in three phases: single limbs until rp is 16-byte
C aligned, then 16-byte vectors (two per loop iteration), then the remaining
C single limbs.  When up is not 16-byte aligned after the first phase, each
C destination vector is assembled with vperm from two adjacent source vectors.
PROLOGUE(mpn_copyd)
C r0 = n in bytes.  The record form sets cr0, which the beqlr below uses to
C return early when the byte count is zero.
LIMB32(`slwi. r0, n, 2 ')
LIMB64(`sldi. r0, n, 3 ')
C Point rp and up just past the highest limb of each operand.
add rp, rp, r0
add up, up, r0
C Use the VMX path for n >= 11 (32-bit limbs) or n >= 5 (64-bit limbs).  With
C these thresholds at least 32 bytes remain after the alignment phase below
C has consumed its maximum of 3 limbs (32-bit) or 1 limb (64-bit), so the
C vector loop count is never zero.
LIMB32(`cmpi cr7, n, 11 ')
LIMB64(`cmpdi cr7, n, 5 ')
bge cr7, L(big)
C Nothing to copy if the byte count computed above was zero.
beqlr cr0
C Handle small cases with plain operations
C ctr = n; each iteration copies the limb just below up to just below rp and
C steps both pointers down by one limb.
mtctr n
L(topS):
LIMB32(`lwz r0, -4(up) ')
LIMB64(`ld r0, -8(up) ')
addi up, up, -GMP_LIMB_BYTES
LIMB32(`stw r0, -4(rp) ')
LIMB64(`std r0, -8(rp) ')
addi rp, rp, -GMP_LIMB_BYTES
bdnz L(topS)
blr
C Handle large cases with VMX operations
L(big):
C Back both pointers up so they address the highest 16 bytes of each operand.
addi rp, rp, -16
addi up, up, -16
C Save the caller's VRSAVE in r12 (restored at L(ret)) and flag the vector
C registers used below, v0-v4, as live.
mfspr r12, 256
oris r0, r12, 0xf800 C Set VRSAVE bit 0-4
mtspr 256, r0
C r7 = number of limbs by which rp lies above a 16-byte boundary.
LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4
LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2
beq L(rp_aligned)
C Copy r7 limbs singly so that rp becomes 16-byte aligned, and take them off
C n.  The highest limb not yet copied sits at offset 12 (32-bit) or 8 (64-bit)
C from the pointers.  With 64-bit limbs r7 can only be 1, so that variant has
C no counter update and no branch back.
subf n, r7, n
L(top0):
LIMB32(`lwz r0, 12(up) ')
LIMB64(`ld r0, 8(up) ')
addi up, up, -GMP_LIMB_BYTES
LIMB32(`addic. r7, r7, -1 ')
LIMB32(`stw r0, 12(rp) ')
LIMB64(`std r0, 8(rp) ')
addi rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne L(top0) ')
L(rp_aligned):
C Test the alignment of up.  cr0 set here is consumed by the beq to
C L(up_aligned) below; the instructions in between do not alter cr0.
LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4
LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2
C r7 = n / LIMBS_PER_2VR, since each vector loop iteration moves 32 bytes.
LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
mtctr r7 C copy loop count to count register
C r10 = index of the second (lower) vector of each 32-byte pair.
li r10, -16
beq L(up_aligned)
C Unaligned source.  lvsl derives the permute control vector from the low
C address bits of up.  lvx ignores the low four address bits, so the loads
C below fetch the aligned 16-byte blocks surrounding the wanted data.
lvsl us, 0, up
C Step up to the next higher block so that the upper neighbour of the first
C destination vector can be loaded.
addi up, up, 16
C Is the number of whole vectors still to copy odd?
LIMB32(`andi. r0, n, 0x4 ')
LIMB64(`andi. r0, n, 0x2 ')
beq L(1)
C Odd: produce one vector here from blocks v0 (upper) and v2 (lower).  v2 is
C left in place to serve as the upper block for the first loop iteration.
lvx v0, 0, up
lvx v2, r10, up
vperm v3, v2, v0, us
stvx v3, 0, rp
addi up, up, -32
addi rp, rp, -16
b L(lpu)
C Even: only preload the upper block into v2.
L(1): lvx v2, 0, up
addi up, up, -16
b L(lpu)
C Loop entry is aligned to 32 bytes.
C NOTE(review): the unconditional branches to L(lpu) above presumably exist
C to jump over the alignment padding -- confirm.
ALIGN(32)
C Each iteration stores two vectors.  v2 enters holding the source block
C above the current position; v0 and v2 alternate as lower and upper block,
C so every source block is loaded once.
L(lpu): lvx v0, 0, up
vperm v3, v0, v2, us
stvx v3, 0, rp
lvx v2, r10, up
addi up, up, -32
vperm v3, v2, v0, us
stvx v3, r10, rp
addi rp, rp, -32
bdnz L(lpu)
b L(tail)
L(up_aligned):
C Source and destination are both 16-byte aligned.  If the number of whole
C vectors is odd, copy one vector before entering the two-vector loop.
LIMB32(`andi. r0, n, 0x4 ')
LIMB64(`andi. r0, n, 0x2 ')
beq L(lpa)
lvx v0, 0, up
stvx v0, 0, rp
addi up, up, -16
addi rp, rp, -16
b L(lpa)
ALIGN(32)
C Copy 32 bytes per iteration: both loads are issued before both stores.
L(lpa): lvx v0, 0, up
lvx v1, r10, up
addi up, up, -32
C NOTE(review): the nop is presumably there for instruction scheduling or
C dispatch grouping; the source does not say -- confirm before removing.
nop
stvx v0, 0, rp
stvx v1, r10, rp
addi rp, rp, -32
bdnz L(lpa)
L(tail):
C Copy the limbs left over after the whole vectors: n mod LIMBS_PER_VR.
LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
beq L(ret)
C 32-bit limbs: r10 is reused as a byte index starting at the top limb of the
C 16-byte window and stepping down; up and rp themselves stay fixed.  64-bit
C limbs: at most one limb remains, so a single load and store suffices.
LIMB32(`li r10, 12 ')
L(top2):
LIMB32(`lwzx r0, r10, up ')
LIMB64(`ld r0, 8(up) ')
LIMB32(`addic. r7, r7, -1 ')
LIMB32(`stwx r0, r10, rp ')
LIMB64(`std r0, 8(rp) ')
LIMB32(`addi r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne L(top2) ')
C Restore the VRSAVE value saved in r12 at L(big) and return.
L(ret): mtspr 256, r12
blr
EPILOGUE()