dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                    16-byte coaligned      unaligned
C                       cycles/limb        cycles/limb
C 7400,7410 (G4):           0.5               0.64
C 744x,745x (G4+):          0.75              0.82
C 970 (G5):                 0.78              1.02        (64-bit limbs)

C  STATUS
C   * Works for all sizes and alignments.

C  TODO
C   * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C     indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C     c/l for 970.
C   * Consider using VMX instructions also for head and tail, by using some
C     read-modify-write tricks.
C   * The VMX code is used from the smallest sizes it handles, but measurements
C     show a large speed bump at the cutoff points.  Small copying (perhaps
C     using some read-modify-write technique) should be optimized.
C   * Make an mpn_com_n based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',   eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR',  eval(32/GMP_LIMB_BYTES))
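C With 32-bit limbs a limb is 4 bytes, so one 16-byte vector register holds
C 4 limbs and a 2-vector loop iteration moves 8; with 64-bit limbs the
C corresponding counts are 8 bytes, 2 limbs, and 4 limbs.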


ifelse(GMP_LIMB_BITS,32,`
        define(`LIMB32',`       $1')
        define(`LIMB64',`')
',`
        define(`LIMB32',`')
        define(`LIMB64',`       $1')
')
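
C In a 32-bit limb build the LIMB32 wrapper emits its argument and LIMB64
C emits nothing; in a 64-bit build the roles are reversed.  This lets the
C 32-bit and 64-bit instruction sequences share one source file.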

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`us', `v4')
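
C This implements the C-level operation
C   void mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C copying n limbs from up to rp in decreasing address order, which makes it
C safe for overlapping operands when rp >= up.  Per the PowerPC ABIs, the
C first three integer arguments arrive in r3, r4, and r5.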


ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi.   r0, n, 2        ')      C r0 = n in bytes; cr0 tests n == 0
LIMB64(`sldi.   r0, n, 3        ')      C r0 = n in bytes; cr0 tests n == 0
        add     rp, rp, r0              C point rp past the last limb
        add     up, up, r0              C point up past the last limb

LIMB32(`cmpi    cr7, n, 11      ')
LIMB64(`cmpdi   cr7, n, 5       ')
        bge     cr7, L(big)             C big enough for the VMX path?

        beqlr   cr0                     C n == 0, nothing to copy
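
C Control flow: small n takes the plain limb loop at L(topS).  Larger n
C first copies limbs at L(top0) until rp is 16-byte aligned, then runs a
C 32-byte-per-iteration vector loop, L(lpa) when up is coaligned with rp
C and L(lpu) (lvsl/vperm) otherwise, and finally copies the few leftover
C limbs at L(top2).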

C Handle small cases with plain operations
        mtctr   n                       C loop n times
L(topS):
LIMB32(`lwz     r0, -4(up)      ')
LIMB64(`ld      r0, -8(up)      ')
        addi    up, up, -GMP_LIMB_BYTES
LIMB32(`stw     r0, -4(rp)      ')
LIMB64(`std     r0, -8(rp)      ')
        addi    rp, rp, -GMP_LIMB_BYTES
        bdnz    L(topS)
        blr

C Handle large cases with VMX operations
L(big):
        addi    rp, rp, -16             C step rp back one vector (16 bytes)
        addi    up, up, -16             C step up back one vector (16 bytes)
        mfspr   r12, 256                C save VRSAVE
        oris    r0, r12, 0xf800         C set VRSAVE bits 0-4 (v0-v4 in use)
        mtspr   256, r0

LIMB32(`rlwinm. r7, rp, 30,30,31')      C r7 = (rp >> 2) mod 4
LIMB64(`rlwinm. r7, rp, 29,31,31')      C r7 = (rp >> 3) mod 2
        beq     L(rp_aligned)
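
C The rlwinm above computes how many limbs sit between rp and the next lower
C 16-byte boundary.  Worked example with 32-bit limbs: if, after the
C adjustments above, rp mod 16 = 8, then r7 = 2 and L(top0) copies the two
C highest remaining limbs, after which rp is 16-byte aligned.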

        subf    n, r7, n                C n -= limbs peeled off here
C Copy the topmost limbs one at a time until rp is 16-byte aligned.  With
C 64-bit limbs at most one limb is needed, so the branch is 32-bit only.
L(top0):
LIMB32(`lwz     r0, 12(up)      ')
LIMB64(`ld      r0, 8(up)       ')
        addi    up, up, -GMP_LIMB_BYTES
LIMB32(`addic.  r7, r7, -1      ')
LIMB32(`stw     r0, 12(rp)      ')
LIMB64(`std     r0, 8(rp)       ')
        addi    rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne     L(top0) ')

L(rp_aligned):

LIMB32(`rlwinm. r0, up, 30,30,31')      C (up >> 2) mod 4
LIMB64(`rlwinm. r0, up, 29,31,31')      C (up >> 3) mod 2

LIMB64(`srdi    r7, n, 2        ')      C loop count: n / LIMBS_PER_2VR
LIMB32(`srwi    r7, n, 3        ')      C loop count: n / LIMBS_PER_2VR
        mtctr   r7                      C move loop count into CTR

        li      r10, -16                C offset for the second vector of a pair

        beq     L(up_aligned)           C cr0 still holds the up alignment test

        lvsl    us, 0, up               C permute mask for up's misalignment
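
C lvsl yields the AltiVec permute control {s, s+1, ..., s+15}, where
C s = up mod 16.  Given the two aligned vectors that straddle an unaligned
C address a, vLo from a & ~15 and vHi from (a & ~15) + 16, the instruction
C   vperm vD, vLo, vHi, us
C selects 16 consecutive bytes of the concatenation vLo||vHi starting at
C offset s, i.e. it reconstructs the unaligned 16 bytes at a.  The loop
C below keeps the previously loaded vector around, so each vperm needs
C only one fresh load.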

        addi    up, up, 16              C point up just past the remaining limbs
LIMB32(`andi.   r0, n, 0x4      ')      C odd number of 16-byte blocks left?
LIMB64(`andi.   r0, n, 0x2      ')      C odd number of 16-byte blocks left?
        beq     L(1)
        lvx     v0, 0, up               C load the two vectors straddling...
        lvx     v2, r10, up             C ...the topmost unaligned block
        vperm   v3, v2, v0, us
        stvx    v3, 0, rp               C copy the odd block
        addi    up, up, -32
        addi    rp, rp, -16
        b       L(lpu)
L(1):   lvx     v2, 0, up               C prime v2 for the main loop
        addi    up, up, -16
        b       L(lpu)
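
C The 2-way unrolled loop below is software pipelined: v2 always carries the
C aligned vector just above the current read position, loaded either by the
C priming code above or by the previous iteration, so each vperm pairs one
C fresh load with a vector already in registers.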

        ALIGN(32)
L(lpu): lvx     v0, 0, up
        vperm   v3, v0, v2, us          C extract the 16 bytes at up
        stvx    v3, 0, rp
        lvx     v2, r10, up
        addi    up, up, -32
        vperm   v3, v2, v0, us          C ...and the 16 bytes just below
        stvx    v3, r10, rp
        addi    rp, rp, -32
        bdnz    L(lpu)

        b       L(tail)

L(up_aligned):

LIMB32(`andi.   r0, n, 0x4      ')      C odd number of 16-byte blocks left?
LIMB64(`andi.   r0, n, 0x2      ')      C odd number of 16-byte blocks left?
        beq     L(lpa)
        lvx     v0, 0, up               C copy the odd block...
        stvx    v0, 0, rp               C ...before entering the main loop
        addi    up, up, -16
        addi    rp, rp, -16
        b       L(lpa)

        ALIGN(32)
L(lpa): lvx     v0, 0, up
        lvx     v1, r10, up
        addi    up, up, -32
        nop
        stvx    v0, 0, rp
        stvx    v1, r10, rp
        addi    rp, rp, -32
        bdnz    L(lpa)

L(tail):
LIMB32(`rlwinm. r7, n, 0,30,31  ')      C r7 = n mod 4 (leftover limbs)
LIMB64(`rlwinm. r7, n, 0,31,31  ')      C r7 = n mod 2 (leftover limbs)
        beq     L(ret)
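
C At most three 32-bit limbs (or a single 64-bit limb) remain here, so the
C 32-bit code walks the r10 byte offset down from 12, while the 64-bit code
C needs just one load/store and no loop.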
LIMB32(`li      r10, 12         ')      C offset of the topmost leftover limb
L(top2):
LIMB32(`lwzx    r0, r10, up     ')
LIMB64(`ld      r0, 8(up)       ')
LIMB32(`addic.  r7, r7, -1      ')
LIMB32(`stwx    r0, r10, rp     ')
LIMB64(`std     r0, 8(rp)       ')
LIMB32(`addi    r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne     L(top2) ')

L(ret): mtspr   256, r12                C restore VRSAVE
        blr
EPILOGUE()