dnl  AMD64 mpn_addmul_1 and mpn_submul_1.

dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C K8,K9:              2.5
C K10:                2.5
C P4:                14.9
C P6-15 (Core2):      5.09
C P6-28 (Atom):      21.3

C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO:
C  * The inner loop is great, but the prologue and epilogue code was
C    quickly written.  Tune it!
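C For reference, the operation these routines perform, as a rough C sketch
C (for orientation only, not the actual generic GMP code; umul_ppmm is the
C double-limb multiply macro from GMP longlong.h):
C
C	mp_limb_t
C	mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C	{
C	  mp_limb_t cy = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      mp_limb_t hi, lo;
C	      umul_ppmm (hi, lo, up[i], vl);	/* hi:lo = up[i] * vl */
C	      lo += cy;
C	      hi += (lo < cy);			/* carry from adding cy */
C	      rp[i] += lo;
C	      cy = hi + (rp[i] < lo);		/* carry out of the limb add */
C	    }
C	  return cy;				/* high limb of the result */
C	}
C
C mpn_submul_1 is identical except that rp[i] -= lo and cy accumulates the
C borrow instead of the carry.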

C INPUT PARAMETERS
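C These are the first four integer argument registers of the System V
C AMD64 calling convention used on ELF targets.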
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`vl',      `%rcx')

define(`n', `%r11')

ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',	`add')
      define(`func',	`mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',	`sub')
      define(`func',	`mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	mov	(up), %rax		C read first u limb early
	push	%rbx
	mov	n_param, %rbx		C move n away from rdx; mul clobbers rdx
	mul	vl
	mov	%rbx, n

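C Dispatch on n mod 4: the main loop below is unrolled four-way, so each
C residue class gets its own entry point (L(b0) through L(b3)) with the
C pointer adjustments and partial products set up to match.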
	and	$3, R32(%rbx)
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

L(b1):	dec	n
	jne	L(gt1)
	ADDSUB	%rax, (rp)
	jmp	L(ret)
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9
	mov	(up,n,8), %rax
	mov	%rdx, %r8
	jmp	L(L1)

L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%rdx, %r9
	jmp	L(L2)

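C Main loop: four limbs per iteration.  n is a negated index that counts
C up towards zero; %rbx, %r8, %r9 and %r10 carry the high halves of the
C products from one step to the next.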
	ALIGN(16)
L(top):	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, R32(%r10)
L(L1):	mul	vl
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	ADDSUB	%r8, 16(rp,n,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	ADDSUB	%rbx, 24(rp,n,8)
	mov	$0, R32(%r8)		C zero
	mov	%r8, %rbx		C zero
	adc	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)

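C Wind-down: apply the last two pending limbs and return the final carry
C (addmul) or borrow (submul) limb in %rax.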
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	adc	%r8, %rdx
	ADDSUB	%r9, 8(rp,n,8)
L(ret):	adc	$0, %rdx
	mov	%rdx, %rax

	pop	%rbx
	ret
EPILOGUE()