 dnl  gcc/gmp/mpn/x86_64/pentium4/rshift.asm - native_client/nacl-toolchain - Git at Google

 dnl  x86-64 mpn_rshift optimized for Pentium 4.

 dnl  Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or
 dnl  modify it under the terms of the GNU Lesser General Public License as
 dnl  published by the Free Software Foundation; either version 3 of the
 dnl  License, or (at your option) any later version.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful,
 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 dnl  Lesser General Public License for more details.
 dnl
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')


 C	     cycles/limb
 C K8,K9:	 2.5
 C K10:		 ?
 C P4:		 3.29
 C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
 C P6-28 (Atom):	14.3

 C INPUT PARAMETERS
 C   rp   pointer that mpn_rshift stores result limbs through
 C   up   pointer that mpn_rshift loads source limbs from
 C   n    number of 64-bit limbs
 C   cnt  shift count register.  The code visible here never uses this
 C        name; it refers to %ecx and %cl directly.
 define(`rp',`%rdi')
 define(`up',`%rsi')
 define(`n',`%rdx')
 define(`cnt',`%cl')

 C mpn_rshift -- shift an n-limb operand right by cnt bits using MMX.
 C
 C Register use, as set up by the define()s and the code below:
 C   rp  (%rdi)  pointer the result limbs are stored through
 C   up  (%rsi)  pointer the source limbs are loaded from
 C   n   (%rdx)  number of 64-bit limbs; negated below and used as an index
 C   %ecx        right shift count on entry
 C   %rax        return value: up[0] shifted left by (-count) & 63
 C   %mm4        right shift count
 C   %mm5        left shift count, (-count) & 63
 C   %r8d        (n + 1) & 3, selects the entry path into the unrolled loop
 C
 C Result limb i is (up[i] >> count) | (up[i+1] << lsh), except for the
 C last one (index n-1), stored at L(ast), which is just up[n-1] >> count.
 C Limbs are handled from index 0 upwards.
 C
 C NOTE(review): no path handles n = 0 (it would take the n = 4, 8, 12
 C path and touch memory); presumably callers guarantee n >= 1 -- confirm.
 C NOTE(review): the shift count is presumably in 1..63 -- confirm.
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_rshift)
 	C Fetch up[0] before up is adjusted; it becomes the return value.
 	mov	(up), %rax
 	C mm4 = right shift count.
 	movd	%ecx, %mm4
 	neg	%ecx			C put lsh count in cl
 	and	$63, %ecx
 	C mm5 = left shift count.
 	movd	%ecx, %mm5

 	C Point up and rp at their last limb (index n-1), keep n + 1 in r8d
 	C for the dispatch below, then negate n so that 8(up,n,8) addresses
 	C limb 0 and n counts up towards zero.
 	lea	-8(up,n,8), up
 	lea	-8(rp,n,8), rp
 	lea	1(n), %r8d
 	neg	n

 	shl	%cl, %rax		C function return value

 	C Dispatch on (n + 1) mod 4 so that the number of limbs still to be
 	C stored is 3 mod 4 whenever L(rol) is reached.
 	and	$3, %r8d
 	je	L(rol)			C jump for n = 3, 7, 11, ...

 	dec	%r8d
 	jne	L(1)
 C	n = 4, 8, 12, ...
 	C Produce one result limb, leaving 3 mod 4 limbs.
 	movq	8(up,n,8), %mm2
 	psrlq	%mm4, %mm2
 	movq	16(up,n,8), %mm0
 	psllq	%mm5, %mm0
 	por	%mm0, %mm2
 	movq	%mm2, 8(rp,n,8)
 	inc	n
 	jmp	L(rol)

 L(1):	dec	%r8d
 	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
 C	n = 2, 6, 10, 14, ...
 	C Produce one result limb, leaving 1 mod 4 limbs.
 	movq	8(up,n,8), %mm2
 	psrlq	%mm4, %mm2
 	movq	16(up,n,8), %mm0
 	psllq	%mm5, %mm0
 	por	%mm0, %mm2
 	movq	%mm2, 8(rp,n,8)
 	inc	n
 L(1x):
 	C 1 mod 4 limbs are left.  If that is exactly one, only the last limb
 	C remains; otherwise produce two result limbs, leaving 3 mod 4.
 	cmp	$-1, n
 	je	L(ast)
 	movq	8(up,n,8), %mm2
 	psrlq	%mm4, %mm2
 	movq	16(up,n,8), %mm3
 	psrlq	%mm4, %mm3
 	movq	16(up,n,8), %mm0
 	movq	24(up,n,8), %mm1
 	psllq	%mm5, %mm0
 	por	%mm0, %mm2
 	psllq	%mm5, %mm1
 	por	%mm1, %mm3
 	movq	%mm2, 8(rp,n,8)
 	movq	%mm3, 16(rp,n,8)
 	add	$2, n

 	C Start the right-shifted parts of the next two result limbs.
 L(rol):	movq	8(up,n,8), %mm2
 	psrlq	%mm4, %mm2
 	movq	16(up,n,8), %mm3
 	psrlq	%mm4, %mm3

 	C The add sets carry once n becomes non-negative; then only the two
 	C pending limbs and the last limb remain, so skip the loop.
 	add	$4, n			C				      4
 	jb	L(end)			C				      2
 	ALIGN(32)
 L(top):
 	C On entry mm2 and mm3 hold the right-shifted (rsh) parts of two
 	C result limbs.  Finish them with the left-shifted (lsh) parts of
 	C the next source limbs and store them.
 	movq	-16(up,n,8), %mm0
 	movq	-8(up,n,8), %mm1
 	psllq	%mm5, %mm0
 	por	%mm0, %mm2
 	psllq	%mm5, %mm1
 	movq	(up,n,8), %mm0
 	por	%mm1, %mm3
 	movq	8(up,n,8), %mm1
 	movq	%mm2, -24(rp,n,8)
 	movq	%mm3, -16(rp,n,8)
 	C start the lsh parts of the next two result limbs
 	psllq	%mm5, %mm0
 	psllq	%mm5, %mm1

 	C finish them with their rsh parts and store them
 	movq	-8(up,n,8), %mm2
 	movq	(up,n,8), %mm3
 	psrlq	%mm4, %mm2
 	por	%mm2, %mm0
 	psrlq	%mm4, %mm3
 	movq	8(up,n,8), %mm2
 	por	%mm3, %mm1
 	movq	16(up,n,8), %mm3
 	movq	%mm0, -8(rp,n,8)
 	movq	%mm1, (rp,n,8)
 	C start the rsh parts of two more result limbs; they are finished
 	C by the next iteration or by L(end)
 	add	$4, n
 	psrlq	%mm4, %mm2
 	psrlq	%mm4, %mm3

 	C The MMX shifts do not modify the flags, so this tests the carry
 	C produced by the add above.
 	jae	L(top)			C				      2
 L(end):
 	C Finish and store the two pending result limbs.
 	movq	-16(up,n,8), %mm0
 	psllq	%mm5, %mm0
 	por	%mm0, %mm2
 	movq	-8(up,n,8), %mm1
 	psllq	%mm5, %mm1
 	por	%mm1, %mm3
 	movq	%mm2, -24(rp,n,8)
 	movq	%mm3, -16(rp,n,8)

 	C Last limb (index n-1): there is no higher limb to shift in.
 L(ast):	movq	(up), %mm2
 	psrlq	%mm4, %mm2
 	movq	%mm2, (rp)
 	C Empty the MMX state before returning.
 	emms
 	ret
 EPILOGUE()
	dnl x86-64 mpn_rshift optimized for Pentium 4.

	dnl Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
	dnl
	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or
	dnl modify it under the terms of the GNU Lesser General Public License as
	dnl published by the Free Software Foundation; either version 3 of the
	dnl License, or (at your option) any later version.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful,
	dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
	dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	dnl Lesser General Public License for more details.
	dnl
	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')


	C cycles/limb
	C K8,K9: 2.5
	C K10: ?
	C P4: 3.29
	C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
	C P6-28 (Atom): 14.3

	C INPUT PARAMETERS
	C   rp   pointer that mpn_rshift stores result limbs through
	C   up   pointer that mpn_rshift loads source limbs from
	C   n    number of 64-bit limbs
	C   cnt  shift count register.  The code visible here never uses this
	C        name; it refers to %ecx and %cl directly.
	define(`rp',`%rdi')
	define(`up',`%rsi')
	define(`n',`%rdx')
	define(`cnt',`%cl')

	C mpn_rshift -- shift an n-limb operand right by cnt bits using MMX.
	C
	C NOTE(review): this repeats the mpn_rshift definition that appears
	C earlier in this file; presumably one copy is an extraction artifact
	C and should be dropped -- confirm.
	C
	C Register use, as set up by the define()s and the code below:
	C   rp  (%rdi)  pointer the result limbs are stored through
	C   up  (%rsi)  pointer the source limbs are loaded from
	C   n   (%rdx)  number of 64-bit limbs; negated below and used as an index
	C   %ecx        right shift count on entry
	C   %rax        return value: up[0] shifted left by (-count) & 63
	C   %mm4        right shift count
	C   %mm5        left shift count, (-count) & 63
	C   %r8d        (n + 1) & 3, selects the entry path into the unrolled loop
	C
	C Result limb i is (up[i] >> count) | (up[i+1] << lsh), except for the
	C last one (index n-1), stored at L(ast), which is just up[n-1] >> count.
	C Limbs are handled from index 0 upwards.
	C
	C NOTE(review): no path handles n = 0 (it would take the n = 4, 8, 12
	C path and touch memory); presumably callers guarantee n >= 1 -- confirm.
	C NOTE(review): the shift count is presumably in 1..63 -- confirm.
	ASM_START()
	TEXT
	ALIGN(32)
	PROLOGUE(mpn_rshift)
	C Fetch up[0] before up is adjusted; it becomes the return value.
	mov (up), %rax
	C mm4 = right shift count.
	movd %ecx, %mm4
	neg %ecx C put lsh count in cl
	and $63, %ecx
	C mm5 = left shift count.
	movd %ecx, %mm5

	C Point up and rp at their last limb (index n-1), keep n + 1 in r8d
	C for the dispatch below, then negate n so that 8(up,n,8) addresses
	C limb 0 and n counts up towards zero.
	lea -8(up,n,8), up
	lea -8(rp,n,8), rp
	lea 1(n), %r8d
	neg n

	shl %cl, %rax C function return value

	C Dispatch on (n + 1) mod 4 so that the number of limbs still to be
	C stored is 3 mod 4 whenever L(rol) is reached.
	and $3, %r8d
	je L(rol) C jump for n = 3, 7, 11, ...

	dec %r8d
	jne L(1)
	C n = 4, 8, 12, ...
	C Produce one result limb, leaving 3 mod 4 limbs.
	movq 8(up,n,8), %mm2
	psrlq %mm4, %mm2
	movq 16(up,n,8), %mm0
	psllq %mm5, %mm0
	por %mm0, %mm2
	movq %mm2, 8(rp,n,8)
	inc n
	jmp L(rol)

	L(1): dec %r8d
	je L(1x) C jump for n = 1, 5, 9, 13, ...
	C n = 2, 6, 10, 14, ...
	C Produce one result limb, leaving 1 mod 4 limbs.
	movq 8(up,n,8), %mm2
	psrlq %mm4, %mm2
	movq 16(up,n,8), %mm0
	psllq %mm5, %mm0
	por %mm0, %mm2
	movq %mm2, 8(rp,n,8)
	inc n
	L(1x):
	C 1 mod 4 limbs are left.  If that is exactly one, only the last limb
	C remains; otherwise produce two result limbs, leaving 3 mod 4.
	cmp $-1, n
	je L(ast)
	movq 8(up,n,8), %mm2
	psrlq %mm4, %mm2
	movq 16(up,n,8), %mm3
	psrlq %mm4, %mm3
	movq 16(up,n,8), %mm0
	movq 24(up,n,8), %mm1
	psllq %mm5, %mm0
	por %mm0, %mm2
	psllq %mm5, %mm1
	por %mm1, %mm3
	movq %mm2, 8(rp,n,8)
	movq %mm3, 16(rp,n,8)
	add $2, n

	C Start the right-shifted parts of the next two result limbs.
	L(rol): movq 8(up,n,8), %mm2
	psrlq %mm4, %mm2
	movq 16(up,n,8), %mm3
	psrlq %mm4, %mm3

	C The add sets carry once n becomes non-negative; then only the two
	C pending limbs and the last limb remain, so skip the loop.
	add $4, n C 4
	jb L(end) C 2
	ALIGN(32)
	L(top):
	C On entry mm2 and mm3 hold the right-shifted (rsh) parts of two
	C result limbs.  Finish them with the left-shifted (lsh) parts of
	C the next source limbs and store them.
	movq -16(up,n,8), %mm0
	movq -8(up,n,8), %mm1
	psllq %mm5, %mm0
	por %mm0, %mm2
	psllq %mm5, %mm1
	movq (up,n,8), %mm0
	por %mm1, %mm3
	movq 8(up,n,8), %mm1
	movq %mm2, -24(rp,n,8)
	movq %mm3, -16(rp,n,8)
	C start the lsh parts of the next two result limbs
	psllq %mm5, %mm0
	psllq %mm5, %mm1

	C finish them with their rsh parts and store them
	movq -8(up,n,8), %mm2
	movq (up,n,8), %mm3
	psrlq %mm4, %mm2
	por %mm2, %mm0
	psrlq %mm4, %mm3
	movq 8(up,n,8), %mm2
	por %mm3, %mm1
	movq 16(up,n,8), %mm3
	movq %mm0, -8(rp,n,8)
	movq %mm1, (rp,n,8)
	C start the rsh parts of two more result limbs; they are finished
	C by the next iteration or by L(end)
	add $4, n
	psrlq %mm4, %mm2
	psrlq %mm4, %mm3

	C The MMX shifts do not modify the flags, so this tests the carry
	C produced by the add above.
	jae L(top) C 2
	L(end):
	C Finish and store the two pending result limbs.
	movq -16(up,n,8), %mm0
	psllq %mm5, %mm0
	por %mm0, %mm2
	movq -8(up,n,8), %mm1
	psllq %mm5, %mm1
	por %mm1, %mm3
	movq %mm2, -24(rp,n,8)
	movq %mm3, -16(rp,n,8)

	C Last limb (index n-1): there is no higher limb to shift in.
	L(ast): movq (up), %mm2
	psrlq %mm4, %mm2
	movq %mm2, (rp)
	C Empty the MMX state before returning.
	emms
	ret
	EPILOGUE()