gcc/gmp/mpn/x86_64/addmul_2.asm - native_client/nacl-toolchain - Git at Google

 dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
 dnl  add the result to a third limb vector.

 dnl  Copyright 2008 Free Software Foundation, Inc.

 dnl  This file is part of the GNU MP Library.

 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
 dnl  it under the terms of the GNU Lesser General Public License as published
 dnl  by the Free Software Foundation; either version 3 of the License, or (at
 dnl  your option) any later version.

 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 dnl  License for more details.

 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')

 C	     cycles/limb
 C K8,K9:	 2.375
 C K10:		 2.375
 C P4:		 ?
 C P6-15:	 4.45

 C This code is the result of running a code generation and optimization tool
 C suite written by David Harvey and Torbjorn Granlund.

 C TODO
 C  * Work on feed-in and wind-down code.
 C  * Convert "mov $0" to "xor".
 C  * Adjust initial lea to save some bytes.
 C  * Perhaps adjust n from n_param&3 value?

 C INPUT PARAMETERS
 define(`rp',     `%rdi')
 define(`up',     `%rsi')
 define(`n_param',`%rdx')
 define(`vp',     `%rcx')

 define(`v0', `%r8')
 define(`v1', `%r9')
 define(`w0', `%rbx')
 define(`w1', `%rcx')
 define(`w2', `%rbp')
 define(`w3', `%r10')
 define(`n',  `%r11')

 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_addmul_2)
 	push	%rbx
 	push	%rbp

 	mov	(vp), v0
 	mov	8(vp), v1

 	mov	n_param, n
 	neg	n
 	lea	-32(up,n_param,8), up
 	lea	-32(rp,n_param,8), rp

 	and	$3, R32(n_param)
 	jz	L(am2p0)
 	cmp	$2, R32(n_param)
 	jc	L(am2p1)
 	jz	L(am2p2)
 L(am2p3):
 	mov	32(up,n,8), %rax
 	mul	v0
 	mov	%rax, w1
 	mov	32(up,n,8), %rax
 	mov	%rdx, w2
 	xor	R32(w3), R32(w3)
 	add	$2, n
 	jmp	L(am3)
 L(am2p0):
 	mov	32(up,n,8), %rax
 	mul	v0
 	mov	%rax, w0
 	mov	32(up,n,8), %rax
 	mov	%rdx, w1
 	xor	R32(w2), R32(w2)
 	add	$3, n
 	jmp	L(am0)
 L(am2p1):
 	mov	32(up,n,8), %rax
 	mul	v0
 	mov	%rax, w3
 	mov	32(up,n,8), %rax
 	mov	%rdx, w0
 	xor	R32(w1), R32(w1)
 	jmp	L(am1)
 L(am2p2):
 	mov	32(up,n,8), %rax
 	mul	v0
 	mov	%rax, w2
 	mov	32(up,n,8), %rax
 	mov	%rdx, w3
 	xor	R32(w0), R32(w0)
 	xor	R32(w1), R32(w1)
 	add	$1, n
 	jmp	L(am2)

 	ALIGN(32)
 L(top):
 	add	w3, (rp,n,8)
 	adc	%rax, w0
 	mov	8(up,n,8), %rax
 	adc	%rdx, w1
 	mov	$0, R32(w2)
 	mul	v0
 	add	%rax, w0
 	mov	8(up,n,8), %rax
 	adc	%rdx, w1
 	adc	$0, R32(w2)
 L(am0):	mul	v1
 	add	w0, 8(rp,n,8)
 	adc	%rax, w1
 	adc	%rdx, w2
 	mov	16(up,n,8), %rax
 	mov	$0, R32(w3)
 	mul	v0
 	add	%rax, w1
 	mov	16(up,n,8), %rax
 	adc	%rdx, w2
 	adc	$0, R32(w3)
 L(am3):	mul	v1
 	add	w1, 16(rp,n,8)
 	adc	%rax, w2
 	mov	24(up,n,8), %rax
 	adc	%rdx, w3
 	mul	v0
 	mov	$0, R32(w0)
 	add	%rax, w2
 	adc	%rdx, w3
 	mov	$0, R32(w1)
 	mov	24(up,n,8), %rax
 	adc	$0, R32(w0)
 L(am2):	mul	v1
 	add	w2, 24(rp,n,8)
 	adc	%rax, w3
 	adc	%rdx, w0
 	mov	32(up,n,8), %rax
 	mul	v0
 	add	%rax, w3
 	mov	32(up,n,8), %rax
 	adc	%rdx, w0
 	adc	$0, R32(w1)
 L(am1):	mul	v1
 	add	$4, n
 	js	L(top)

 	add	w3, (rp,n,8)
 	adc	%rax, w0
 	adc	%rdx, w1
 	mov	w0, 8(rp,n,8)
 	mov	w1, %rax

 	pop	%rbp
 	pop	%rbx
 	ret
 EPILOGUE()
	dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
	dnl add the result to a third limb vector.

	dnl Copyright 2008 Free Software Foundation, Inc.

	dnl This file is part of the GNU MP Library.

	dnl The GNU MP Library is free software; you can redistribute it and/or modify
	dnl it under the terms of the GNU Lesser General Public License as published
	dnl by the Free Software Foundation; either version 3 of the License, or (at
	dnl your option) any later version.

	dnl The GNU MP Library is distributed in the hope that it will be useful, but
	dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
	dnl License for more details.

	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')

	C cycles/limb
	C K8,K9: 2.375
	C K10: 2.375
	C P4: ?
	C P6-15: 4.45

	C This code is the result of running a code generation and optimization tool
	C suite written by David Harvey and Torbjorn Granlund.

	C TODO
	C * Work on feed-in and wind-down code.
	C * Convert "mov $0" to "xor".
	C * Adjust initial lea to save some bytes.
	C * Perhaps adjust n from n_param&3 value?

	C INPUT PARAMETERS
	define(`rp', `%rdi')
	define(`up', `%rsi')
	define(`n_param',`%rdx')
	define(`vp', `%rcx')

	define(`v0', `%r8')
	define(`v1', `%r9')
	define(`w0', `%rbx')
	define(`w1', `%rcx')
	define(`w2', `%rbp')
	define(`w3', `%r10')
	define(`n', `%r11')

	ASM_START()
	TEXT
	ALIGN(16)
	PROLOGUE(mpn_addmul_2)
	push %rbx
	push %rbp

	mov (vp), v0
	mov 8(vp), v1

	mov n_param, n
	neg n
	lea -32(up,n_param,8), up
	lea -32(rp,n_param,8), rp

	and $3, R32(n_param)
	jz L(am2p0)
	cmp $2, R32(n_param)
	jc L(am2p1)
	jz L(am2p2)
	L(am2p3):
	mov 32(up,n,8), %rax
	mul v0
	mov %rax, w1
	mov 32(up,n,8), %rax
	mov %rdx, w2
	xor R32(w3), R32(w3)
	add $2, n
	jmp L(am3)
	L(am2p0):
	mov 32(up,n,8), %rax
	mul v0
	mov %rax, w0
	mov 32(up,n,8), %rax
	mov %rdx, w1
	xor R32(w2), R32(w2)
	add $3, n
	jmp L(am0)
	L(am2p1):
	mov 32(up,n,8), %rax
	mul v0
	mov %rax, w3
	mov 32(up,n,8), %rax
	mov %rdx, w0
	xor R32(w1), R32(w1)
	jmp L(am1)
	L(am2p2):
	mov 32(up,n,8), %rax
	mul v0
	mov %rax, w2
	mov 32(up,n,8), %rax
	mov %rdx, w3
	xor R32(w0), R32(w0)
	xor R32(w1), R32(w1)
	add $1, n
	jmp L(am2)

	ALIGN(32)
	L(top):
	add w3, (rp,n,8)
	adc %rax, w0
	mov 8(up,n,8), %rax
	adc %rdx, w1
	mov $0, R32(w2)
	mul v0
	add %rax, w0
	mov 8(up,n,8), %rax
	adc %rdx, w1
	adc $0, R32(w2)
	L(am0): mul v1
	add w0, 8(rp,n,8)
	adc %rax, w1
	adc %rdx, w2
	mov 16(up,n,8), %rax
	mov $0, R32(w3)
	mul v0
	add %rax, w1
	mov 16(up,n,8), %rax
	adc %rdx, w2
	adc $0, R32(w3)
	L(am3): mul v1
	add w1, 16(rp,n,8)
	adc %rax, w2
	mov 24(up,n,8), %rax
	adc %rdx, w3
	mul v0
	mov $0, R32(w0)
	add %rax, w2
	adc %rdx, w3
	mov $0, R32(w1)
	mov 24(up,n,8), %rax
	adc $0, R32(w0)
	L(am2): mul v1
	add w2, 24(rp,n,8)
	adc %rax, w3
	adc %rdx, w0
	mov 32(up,n,8), %rax
	mul v0
	add %rax, w3
	mov 32(up,n,8), %rax
	adc %rdx, w0
	adc $0, R32(w1)
	L(am1): mul v1
	add $4, n
	js L(top)

	add w3, (rp,n,8)
	adc %rax, w0
	adc %rdx, w1
	mov w0, 8(rp,n,8)
	mov w1, %rax

	pop %rbp
	pop %rbx
	ret
	EPILOGUE()