dnl  gcc/gmp/mpn/x86_64/lshift.asm - native_client/nacl-toolchain - Git at Google

 dnl  AMD64 mpn_lshift -- mpn left shift.

 dnl  Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or
 dnl  modify it under the terms of the GNU Lesser General Public License as
 dnl  published by the Free Software Foundation; either version 3 of the
 dnl  License, or (at your option) any later version.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful,
 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 dnl  Lesser General Public License for more details.
 dnl
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')


 C	     cycles/limb   cycles/limb cnt=1
 C K8,K9:	 2.375		 1.375
 C K10:		 2.375		 1.375
 C P4:		 8		10.5
 C P6-15 (Core2): 2.11		 4.28
 C P6-28 (Atom):	 5.75		 3.5


 C INPUT PARAMETERS
 C rp  = %rdi  destination pointer; result limbs are stored through it
 C up  = %rsi  source pointer; operand limbs are loaded through it
 C n   = %rdx  limb count; also used as an index scaled by 8 bytes per limb
 C cnt = %rcx  shift count; the routine reads it as %cl / %ecx directly
 C             rather than through this macro
 define(`rp',	`%rdi')
 define(`up',	`%rsi')
 define(`n',	`%rdx')
 define(`cnt',	`%rcx')

 C mpn_lshift: shift the n-limb operand at up left by the count in %cl and
 C store the n result limbs at rp.  The bits shifted out of the most
 C significant limb are returned in %rax.
 C
 C Two code paths:
 C   * count = 1 without harmful overlap: a low-to-high add-with-carry chain
 C     (each limb is doubled; the carry flag moves its top bit into the next
 C     limb).
 C   * everything else, L(gen): high-to-low processing, each result limb
 C     being (up[i] << count) | (up[i-1] >> (64 - count)).
 C
 C Registers written by the visible body: %rax, %rcx, %rdx, %rsi, %rdi and
 C %r8-%r11.  No stack accesses appear between PROLOGUE and EPILOGUE.
 C
 C NOTE(review): the code appears to assume n >= 1 and 1 <= count <= 63 (with
 C a count of 0 the complementary right shift would be a shift by 0) --
 C confirm against the documented mpn_lshift contract.
 C NOTE(review): a second PROLOGUE(mpn_lshift) ... EPILOGUE() copy of this
 C routine appears later in this file; that looks like an extraction
 C artifact -- confirm and keep only one definition.
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_lshift)
 C Only the low byte of the count register is examined.  Any count other
 C than 1 goes straight to the general code.
 	cmp	$1, %cl
 	jne	L(gen)

 C For cnt=1 we want to work from lowest limb towards higher limbs.
 C Check for bad overlap (up=rp is OK!) rp=up+1..up+n-1 is bad.
 C FIXME: this could surely be done more cleverly.
 C
 C %rax = rp - up in bytes, converted to limbs by the shift by 3.  The compare
 C is unsigned: when rp is below up the wrapped difference is very large and
 C compares as not below n, so the forward path is taken.  Only a distance of
 C 1..n-1 limbs falls back to L(gen), which works from the top limb down.

 	mov    rp, %rax
 	sub    up, %rax
 	je     L(fwd)			C rp = up
 	shr    $3, %rax
 	cmp    n, %rax
 	jb     L(gen)

 C Forward cnt=1 path.  %eax keeps the low bits of n for the leftover count,
 C while n becomes the number of 4-limb loop iterations.  If that is zero then
 C n < 4 and %eax already is the leftover count.  The and leaves n mod 4 in
 C %eax and, as a side effect, clears the carry flag consumed by the first adc.
 L(fwd):	mov	R32(n), R32(%rax)
 	shr	$2, n
 	je	L(e1)
 	and	$3, R32(%rax)

 C NOTE(review): the two nops place L(t1) two bytes past the ALIGN(8)
 C boundary; presumably a deliberate tuning choice -- confirm before changing.
 	ALIGN(8)
 	nop
 	nop
 C Main cnt=1 loop, four limbs per iteration.  adc x,x computes 2*x + carry,
 C i.e. a one-bit left shift where the carry flag hands the top bit of each
 C limb to the next one.  lea (pointer updates) and dec (loop counter) leave
 C the carry flag untouched, so the chain survives across iterations.
 L(t1):	mov	(up), %r8
 	mov	8(up), %r9
 	mov	16(up), %r10
 	mov	24(up), %r11
 	lea	32(up), up
 	adc	%r8, %r8
 	mov	%r8, (rp)
 	adc	%r9, %r9
 	mov	%r9, 8(rp)
 	adc	%r10, %r10
 	mov	%r10, 16(rp)
 	adc	%r11, %r11
 	mov	%r11, 24(rp)
 	lea	32(rp), rp
 	dec	n
 	jne	L(t1)

 C inc followed by dec leaves %eax and the carry flag unchanged but sets the
 C zero flag according to %eax, i.e. whether any leftover limbs remain.
 	inc	%eax
 	dec	%eax
 	jne	L(n00)
 C No leftover limbs: %eax is 0, so this returns just the carry flag.
 	adc	%eax, %eax
 	ret
 C Entry when the loop is skipped (n < 4): %eax = n, and test clears the
 C carry flag so the first adc shifts in a zero bit.
 L(e1):	test	%eax, %eax			C clear cy
 C Leftover handling.  %eax = number of limbs left (1..3, assuming n >= 1),
 C carry flag = bit shifted out of the previous limb.
 L(n00):	mov	(up), %r8
 	dec	%eax
 	jne	L(n01)
 C One limb left.  %eax is 0 here, so the final adc returns the carry flag.
 C The L(ret) label is not referenced anywhere in the visible code.
 	adc	%r8, %r8
 	mov	%r8, (rp)
 L(ret):	adc	%eax, %eax
 	ret
 C Two or three limbs left.  mov does not alter flags, so the jne below still
 C tests the result of the dec.
 L(n01):	dec	%eax
 	mov	8(up), %r9
 	jne	L(n10)
 C Two limbs left; %eax is 0, so the final adc returns the carry flag.
 	adc	%r8, %r8
 	adc	%r9, %r9
 	mov	%r8, (rp)
 	mov	%r9, 8(rp)
 	adc	%eax, %eax
 	ret
 C Three limbs left.  %eax is 1 here, so adc $-1 yields 1 - 1 + carry, which
 C is again just the carry flag.
 L(n10):	mov	16(up), %r10
 	adc	%r8, %r8
 	adc	%r9, %r9
 	adc	%r10, %r10
 	mov	%r8, (rp)
 	mov	%r9, 8(rp)
 	mov	%r10, 16(rp)
 	adc	$-1, %eax
 	ret

 C General path: any count, limbs processed from most to least significant.
 C 64-bit shifts use the count in %cl modulo 64, so negating %ecx toggles
 C between the left-shift count (cnt) and the complementary right-shift count
 C (64 - cnt).  -8(up,n,8) is the most significant source limb; its top cnt
 C bits become the return value.
 L(gen):	neg	%ecx			C put rsh count in cl
 	mov	-8(up,n,8), %rax
 	shr	%cl, %rax		C function return value

 	neg	%ecx			C put lsh count in cl
 C %r8d = (n + 1) mod 4 selects how many top limbs to peel off so that the
 C pipelined code at L(rlx) is always entered with n = 3 mod 4.
 	lea	1(n), R32(%r8)
 	and	$3, R32(%r8)
 	je	L(rlx)			C jump for n = 3, 7, 11, ...

 	dec	R32(%r8)
 	jne	L(1)
 C	n = 4, 8, 12, ...
 C Peel one limb: rp[n-1] = (up[n-1] << cnt) | (up[n-2] >> (64 - cnt)).
 C cl is left holding the right-shift count; L(rll) negates it back.
 	mov	-8(up,n,8), %r10
 	shl	%cl, %r10
 	neg	%ecx			C put rsh count in cl
 	mov	-16(up,n,8), %r8
 	shr	%cl, %r8
 	or	%r8, %r10
 	mov	%r10, -8(rp,n,8)
 	dec	n
 	jmp	L(rll)

 L(1):	dec	R32(%r8)
 	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
 C	n = 2, 6, 10, 14, ...
 C Peel one limb so that n becomes 1 mod 4, then fall into L(1x).
 	mov	-8(up,n,8), %r10
 	shl	%cl, %r10
 	neg	%ecx			C put rsh count in cl
 	mov	-16(up,n,8), %r8
 	shr	%cl, %r8
 	or	%r8, %r10
 	mov	%r10, -8(rp,n,8)
 	dec	n
 	neg	%ecx			C put lsh count in cl
 C n is 1 mod 4 here.  If exactly one limb remains it is the lowest one, which
 C L(ast) handles.  Otherwise peel two limbs so that n becomes 3 mod 4.
 L(1x):
 	cmp	$1, n
 	je	L(ast)
 	mov	-8(up,n,8), %r10
 	shl	%cl, %r10
 	mov	-16(up,n,8), %r11
 	shl	%cl, %r11
 	neg	%ecx			C put rsh count in cl
 	mov	-16(up,n,8), %r8
 	mov	-24(up,n,8), %r9
 	shr	%cl, %r8
 	or	%r8, %r10
 	shr	%cl, %r9
 	or	%r9, %r11
 	mov	%r10, -8(rp,n,8)
 	mov	%r11, -16(rp,n,8)
 	sub	$2, n

 C Prime the pipeline: %r10 and %r11 receive the left-shifted parts of result
 C limbs n-1 and n-2.  Then n -= 4; a borrow means n was 3, so only the
 C wind-down code at L(end) is needed.
 L(rll):	neg	%ecx			C put lsh count in cl
 L(rlx):	mov	-8(up,n,8), %r10
 	shl	%cl, %r10
 	mov	-16(up,n,8), %r11
 	shl	%cl, %r11

 	sub	$4, n			C				      4
 	jb	L(end)			C				      2
 	ALIGN(16)
 C Main loop, four result limbs per iteration.  n has already been reduced by
 C 4, so offsets 24, 16, 8 and 0 from (rp,n,8) address the four limbs stored
 C in this iteration, highest first.  The left-shift halves of the first two
 C were computed before entry (or by the previous iteration).
 L(top):
 	C finish stuff from lsh block
 	neg	%ecx			C put rsh count in cl
 	mov	16(up,n,8), %r8
 	mov	8(up,n,8), %r9
 	shr	%cl, %r8
 	or	%r8, %r10
 	shr	%cl, %r9
 	or	%r9, %r11
 	mov	%r10, 24(rp,n,8)
 	mov	%r11, 16(rp,n,8)
 	C start two new rsh
 	mov	0(up,n,8), %r8
 	mov	-8(up,n,8), %r9
 	shr	%cl, %r8
 	shr	%cl, %r9

 	C finish stuff from rsh block
 	neg	%ecx			C put lsh count in cl
 	mov	8(up,n,8), %r10
 	mov	0(up,n,8), %r11
 	shl	%cl, %r10
 	or	%r10, %r8
 	shl	%cl, %r11
 	or	%r11, %r9
 	mov	%r8, 8(rp,n,8)
 	mov	%r9, 0(rp,n,8)
 	C start two new lsh
 	mov	-8(up,n,8), %r10
 	mov	-16(up,n,8), %r11
 	shl	%cl, %r10
 	shl	%cl, %r11

 	sub	$4, n
 	jae	L(top)			C				      2
 C Wind-down: complete the two pending result limbs.  n is -1 here (it was 3
 C before the last subtraction), so 24(rp,n,8) and 16(rp,n,8) are limbs 2
 C and 1.
 L(end):
 	neg	%ecx			C put rsh count in cl
 	mov	16(up,n,8), %r8
 	shr	%cl, %r8
 	or	%r8, %r10
 	mov	8(up,n,8), %r9
 	shr	%cl, %r9
 	or	%r9, %r11
 	mov	%r10, 24(rp,n,8)
 	mov	%r11, 16(rp,n,8)

 	neg	%ecx			C put lsh count in cl
 C Lowest limb: there is no lower neighbour, so the result is just
 C up[0] << cnt.  %rax still holds the return value computed at L(gen).
 L(ast):	mov	(up), %r10
 	shl	%cl, %r10
 	mov	%r10, (rp)
 	ret
 EPILOGUE()
	dnl AMD64 mpn_lshift -- mpn left shift.

	dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
	dnl
	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or
	dnl modify it under the terms of the GNU Lesser General Public License as
	dnl published by the Free Software Foundation; either version 3 of the
	dnl License, or (at your option) any later version.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful,
	dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
	dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	dnl Lesser General Public License for more details.
	dnl
	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')


	C cycles/limb cycles/limb cnt=1
	C K8,K9: 2.375 1.375
	C K10: 2.375 1.375
	C P4: 8 10.5
	C P6-15 (Core2): 2.11 4.28
	C P6-28 (Atom): 5.75 3.5


	C INPUT PARAMETERS
	C rp  = %rdi  destination pointer; result limbs are stored through it
	C up  = %rsi  source pointer; operand limbs are loaded through it
	C n   = %rdx  limb count; also used as an index scaled by 8 bytes per limb
	C cnt = %rcx  shift count; the routine reads it as %cl / %ecx directly
	C             rather than through this macro
	define(`rp', `%rdi')
	define(`up', `%rsi')
	define(`n', `%rdx')
	define(`cnt', `%rcx')

	C mpn_lshift: shift the n-limb operand at up left by the count in %cl and
	C store the n result limbs at rp.  The bits shifted out of the most
	C significant limb are returned in %rax.
	C
	C Two code paths:
	C   * count = 1 without harmful overlap: a low-to-high add-with-carry chain
	C     (each limb is doubled; the carry flag moves its top bit into the next
	C     limb).
	C   * everything else, L(gen): high-to-low processing, each result limb
	C     being (up[i] << count) | (up[i-1] >> (64 - count)).
	C
	C Registers written by the visible body: %rax, %rcx, %rdx, %rsi, %rdi and
	C %r8-%r11.  No stack accesses appear between PROLOGUE and EPILOGUE.
	C
	C NOTE(review): the code appears to assume n >= 1 and 1 <= count <= 63 (with
	C a count of 0 the complementary right shift would be a shift by 0) --
	C confirm against the documented mpn_lshift contract.
	C NOTE(review): an earlier PROLOGUE(mpn_lshift) ... EPILOGUE() copy of this
	C routine appears above in this file; this second definition looks like an
	C extraction artifact -- confirm and keep only one definition.
	ASM_START()
	TEXT
	ALIGN(32)
	PROLOGUE(mpn_lshift)
	C Only the low byte of the count register is examined.  Any count other
	C than 1 goes straight to the general code.
	cmp $1, %cl
	jne L(gen)

	C For cnt=1 we want to work from lowest limb towards higher limbs.
	C Check for bad overlap (up=rp is OK!) rp=up+1..up+n-1 is bad.
	C FIXME: this could surely be done more cleverly.
	C
	C %rax = rp - up in bytes, converted to limbs by the shift by 3.  The compare
	C is unsigned: when rp is below up the wrapped difference is very large and
	C compares as not below n, so the forward path is taken.  Only a distance of
	C 1..n-1 limbs falls back to L(gen), which works from the top limb down.

	mov rp, %rax
	sub up, %rax
	je L(fwd) C rp = up
	shr $3, %rax
	cmp n, %rax
	jb L(gen)

	C Forward cnt=1 path.  %eax keeps the low bits of n for the leftover count,
	C while n becomes the number of 4-limb loop iterations.  If that is zero then
	C n < 4 and %eax already is the leftover count.  The and leaves n mod 4 in
	C %eax and, as a side effect, clears the carry flag consumed by the first adc.
	L(fwd): mov R32(n), R32(%rax)
	shr $2, n
	je L(e1)
	and $3, R32(%rax)

	C NOTE(review): the two nops place L(t1) two bytes past the ALIGN(8)
	C boundary; presumably a deliberate tuning choice -- confirm before changing.
	ALIGN(8)
	nop
	nop
	C Main cnt=1 loop, four limbs per iteration.  adc x,x computes 2*x + carry,
	C i.e. a one-bit left shift where the carry flag hands the top bit of each
	C limb to the next one.  lea (pointer updates) and dec (loop counter) leave
	C the carry flag untouched, so the chain survives across iterations.
	L(t1): mov (up), %r8
	mov 8(up), %r9
	mov 16(up), %r10
	mov 24(up), %r11
	lea 32(up), up
	adc %r8, %r8
	mov %r8, (rp)
	adc %r9, %r9
	mov %r9, 8(rp)
	adc %r10, %r10
	mov %r10, 16(rp)
	adc %r11, %r11
	mov %r11, 24(rp)
	lea 32(rp), rp
	dec n
	jne L(t1)

	C inc followed by dec leaves %eax and the carry flag unchanged but sets the
	C zero flag according to %eax, i.e. whether any leftover limbs remain.
	inc %eax
	dec %eax
	jne L(n00)
	C No leftover limbs: %eax is 0, so this returns just the carry flag.
	adc %eax, %eax
	ret
	C Entry when the loop is skipped (n < 4): %eax = n, and test clears the
	C carry flag so the first adc shifts in a zero bit.
	L(e1): test %eax, %eax C clear cy
	C Leftover handling.  %eax = number of limbs left (1..3, assuming n >= 1),
	C carry flag = bit shifted out of the previous limb.
	L(n00): mov (up), %r8
	dec %eax
	jne L(n01)
	C One limb left.  %eax is 0 here, so the final adc returns the carry flag.
	C The L(ret) label is not referenced anywhere in the visible code.
	adc %r8, %r8
	mov %r8, (rp)
	L(ret): adc %eax, %eax
	ret
	C Two or three limbs left.  mov does not alter flags, so the jne below still
	C tests the result of the dec.
	L(n01): dec %eax
	mov 8(up), %r9
	jne L(n10)
	C Two limbs left; %eax is 0, so the final adc returns the carry flag.
	adc %r8, %r8
	adc %r9, %r9
	mov %r8, (rp)
	mov %r9, 8(rp)
	adc %eax, %eax
	ret
	C Three limbs left.  %eax is 1 here, so adc $-1 yields 1 - 1 + carry, which
	C is again just the carry flag.
	L(n10): mov 16(up), %r10
	adc %r8, %r8
	adc %r9, %r9
	adc %r10, %r10
	mov %r8, (rp)
	mov %r9, 8(rp)
	mov %r10, 16(rp)
	adc $-1, %eax
	ret

	C General path: any count, limbs processed from most to least significant.
	C 64-bit shifts use the count in %cl modulo 64, so negating %ecx toggles
	C between the left-shift count (cnt) and the complementary right-shift count
	C (64 - cnt).  -8(up,n,8) is the most significant source limb; its top cnt
	C bits become the return value.
	L(gen): neg %ecx C put rsh count in cl
	mov -8(up,n,8), %rax
	shr %cl, %rax C function return value

	neg %ecx C put lsh count in cl
	C %r8d = (n + 1) mod 4 selects how many top limbs to peel off so that the
	C pipelined code at L(rlx) is always entered with n = 3 mod 4.
	lea 1(n), R32(%r8)
	and $3, R32(%r8)
	je L(rlx) C jump for n = 3, 7, 11, ...

	dec R32(%r8)
	jne L(1)
	C n = 4, 8, 12, ...
	C Peel one limb: rp[n-1] = (up[n-1] << cnt) | (up[n-2] >> (64 - cnt)).
	C cl is left holding the right-shift count; L(rll) negates it back.
	mov -8(up,n,8), %r10
	shl %cl, %r10
	neg %ecx C put rsh count in cl
	mov -16(up,n,8), %r8
	shr %cl, %r8
	or %r8, %r10
	mov %r10, -8(rp,n,8)
	dec n
	jmp L(rll)

	L(1): dec R32(%r8)
	je L(1x) C jump for n = 1, 5, 9, 13, ...
	C n = 2, 6, 10, 14, ...
	C Peel one limb so that n becomes 1 mod 4, then fall into L(1x).
	mov -8(up,n,8), %r10
	shl %cl, %r10
	neg %ecx C put rsh count in cl
	mov -16(up,n,8), %r8
	shr %cl, %r8
	or %r8, %r10
	mov %r10, -8(rp,n,8)
	dec n
	neg %ecx C put lsh count in cl
	C n is 1 mod 4 here.  If exactly one limb remains it is the lowest one, which
	C L(ast) handles.  Otherwise peel two limbs so that n becomes 3 mod 4.
	L(1x):
	cmp $1, n
	je L(ast)
	mov -8(up,n,8), %r10
	shl %cl, %r10
	mov -16(up,n,8), %r11
	shl %cl, %r11
	neg %ecx C put rsh count in cl
	mov -16(up,n,8), %r8
	mov -24(up,n,8), %r9
	shr %cl, %r8
	or %r8, %r10
	shr %cl, %r9
	or %r9, %r11
	mov %r10, -8(rp,n,8)
	mov %r11, -16(rp,n,8)
	sub $2, n

	C Prime the pipeline: %r10 and %r11 receive the left-shifted parts of result
	C limbs n-1 and n-2.  Then n -= 4; a borrow means n was 3, so only the
	C wind-down code at L(end) is needed.
	L(rll): neg %ecx C put lsh count in cl
	L(rlx): mov -8(up,n,8), %r10
	shl %cl, %r10
	mov -16(up,n,8), %r11
	shl %cl, %r11

	sub $4, n C 4
	jb L(end) C 2
	ALIGN(16)
	C Main loop, four result limbs per iteration.  n has already been reduced by
	C 4, so offsets 24, 16, 8 and 0 from (rp,n,8) address the four limbs stored
	C in this iteration, highest first.  The left-shift halves of the first two
	C were computed before entry (or by the previous iteration).
	L(top):
	C finish stuff from lsh block
	neg %ecx C put rsh count in cl
	mov 16(up,n,8), %r8
	mov 8(up,n,8), %r9
	shr %cl, %r8
	or %r8, %r10
	shr %cl, %r9
	or %r9, %r11
	mov %r10, 24(rp,n,8)
	mov %r11, 16(rp,n,8)
	C start two new rsh
	mov 0(up,n,8), %r8
	mov -8(up,n,8), %r9
	shr %cl, %r8
	shr %cl, %r9

	C finish stuff from rsh block
	neg %ecx C put lsh count in cl
	mov 8(up,n,8), %r10
	mov 0(up,n,8), %r11
	shl %cl, %r10
	or %r10, %r8
	shl %cl, %r11
	or %r11, %r9
	mov %r8, 8(rp,n,8)
	mov %r9, 0(rp,n,8)
	C start two new lsh
	mov -8(up,n,8), %r10
	mov -16(up,n,8), %r11
	shl %cl, %r10
	shl %cl, %r11

	sub $4, n
	jae L(top) C 2
	C Wind-down: complete the two pending result limbs.  n is -1 here (it was 3
	C before the last subtraction), so 24(rp,n,8) and 16(rp,n,8) are limbs 2
	C and 1.
	L(end):
	neg %ecx C put rsh count in cl
	mov 16(up,n,8), %r8
	shr %cl, %r8
	or %r8, %r10
	mov 8(up,n,8), %r9
	shr %cl, %r9
	or %r9, %r11
	mov %r10, 24(rp,n,8)
	mov %r11, 16(rp,n,8)

	neg %ecx C put lsh count in cl
	C Lowest limb: there is no lower neighbour, so the result is just
	C up[0] << cnt.  %rax still holds the return value computed at L(gen).
	L(ast): mov (up), %r10
	shl %cl, %r10
	mov %r10, (rp)
	ret
	EPILOGUE()