dnl  gcc/gmp/mpn/x86_64/divrem_2.asm - native_client/nacl-toolchain - Git at Google

 dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

 dnl  Copyright 2007, 2008 Free Software Foundation, Inc.

 dnl  This file is part of the GNU MP Library.

 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
 dnl  it under the terms of the GNU Lesser General Public License as published
 dnl  by the Free Software Foundation; either version 3 of the License, or (at
 dnl  your option) any later version.

 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 dnl  License for more details.

 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')


 C		norm	frac
 C K8		20	20
 C P4		73	73
 C P6-15		37	37

 C TODO
 C  * Perhaps compute the inverse without relying on divq?  Could either use
 C    Newton's method and mulq, or perhaps the faster fdiv.
 C  * The loop has not been carefully tuned, nor analysed for critical path
 C    length.  It seems that 20 c/l is a bit long, compared to the 13 c/l for
 C    mpn_divrem_1.
 C  * Clean up.  This code is really crude.


 C INPUT PARAMETERS
 C Symbolic names for the registers that hold the five arguments on entry.
 C The instructions visible in this file spell the registers out directly
 C and never use these names, so they act as interface documentation only.
 C   qp        where the quotient limbs are stored
 C   fn        number of extra low quotient limbs developed from zero limbs
 C   up_param  the dividend limbs; the remainder is stored back in this area
 C   un_param  number of dividend limbs
 C   dp        the two divisor limbs, low limb first
 define(`qp',		`%rdi')
 define(`fn',		`%rsi')
 define(`up_param',	`%rdx')
 define(`un_param',	`%rcx')
 define(`dp',		`%r8')

 C Only the NEW variant of the loop keeps the inverse in r9; the default
 C loop multiplies by rdi instead.
 define(`dinv',		`%r9')


 C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
 C         cnt         qp      d  dinv

 C mpn_divrem_2 -- divide the un-limb number at up, extended below by fn
 C zero limbs, by the two-limb divisor at dp.
 C
 C On entry (register names as given by the INPUT PARAMETERS defines):
 C   rdi  qp  where quotient limbs are stored
 C   rsi  fn  number of low quotient limbs developed from zero limbs
 C   rdx  up  dividend limbs, least significant first
 C   rcx  un  number of dividend limbs
 C   r8   dp  divisor, d0 at offset 0 and d1 at offset 8
 C On return rax holds the most significant quotient limb, which this code
 C only ever sets to 0 or 1.  The remaining un+fn-2 quotient limbs are stored
 C at qp[0] .. qp[un+fn-3] and the two remainder limbs are written back into
 C the up area.
 C
 C The top bit of d1 must be set: the divq used to build the inverse
 C overflows otherwise.  The top two dividend limbs are read
 C unconditionally, so un >= 2 is assumed -- TODO confirm against callers.
 C rbx, rbp and r12-r15 are pushed on entry and popped before returning.
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_divrem_2)

 C Save callee-saved registers, interleaved with loading the operands.
 	push	%r15
 	lea	(%rdx,%rcx,8), %rax	C rax = up + 8*un, one past the top limb
 	push	%r14
 	push	%r13
 	mov	%rsi, %r13		C r13 = fn
 	push	%r12
 	lea	-24(%rax), %r12		C r12 = address of up[un-3], next limb to fetch
 	push	%rbp
 	mov	%rdi, %rbp		C rbp = qp
 	push	%rbx
 	mov	8(%r8), %r11		C r11 = d1, high divisor limb
 	mov	-8(%rax), %r9		C r9  = up[un-1]
 	mov	(%r8), %r8		C r8  = d0, low divisor limb
 	mov	-16(%rax), %r10		C r10 = up[un-2]
 	xor	R32(%r15), R32(%r15)	C r15 = 0, the return value so far
 C If the top two dividend limbs r9:r10 are >= the divisor r11:r8, branch
 C to L(23), which subtracts the divisor once and sets the return value to 1.
 	cmp	%r9, %r11
 	ja	L(2)			C d1 > up[un-1], nothing to subtract
 	setb	%dl			C dl = (d1 < up[un-1])
 	cmp	%r10, %r8
 	setbe	%al			C al = (d0 <= up[un-2])
 	or	%al, %dl
 	jne	L(23)
 L(2):
 	lea	-3(%rcx,%r13), %rbx	C un + fn - 3
 	test	%rbx, %rbx		C lea leaves the flags alone, so test explicitly
 	js	L(6)			C no more quotient limbs, store remainder and return
 C Build the inverse.  First divide (~d1):(all ones) by d1 with divq.  The
 C quotient fits in 64 bits only when ~d1 < d1, i.e. when the top bit of d1
 C is set.
 	mov	%r11, %rdx
 	mov	$-1, %rax
 	not	%rdx
 	div	%r11
 	mov	%r11, %rdx
 	mov	%rax, %rdi		C rdi = candidate inverse
 	imul	%rax, %rdx		C low limb of d1 * inverse
 	mov	%rdx, %r14
 	mul	%r8			C rdx:rax = inverse * d0
 	mov	%rdx, %rcx		C keep only the high limb of that product
 C Accumulate d0 and that high limb onto r14, with rdx starting at -1 and
 C collecting the carries.  Then decrement the inverse and subtract d1 from
 C rdx:r14 until the sign bit of rdx is set.
 C NOTE(review): presumably this corrects the one-limb inverse for the
 C presence of d0 -- confirm against the C reference implementation.
 	mov	$-1, %rdx
 	add	%r8, %r14
 	adc	$0, %rdx
 	add	%rcx, %r14
 	adc	$0, %rdx
 	js	L(8)
 L(18):
 	dec	%rdi
 	sub	%r11, %r14
 	sbb	$0, %rdx
 	jns	L(18)
 L(8):

 C Register use on entry to both loop variants:
 C   rdi inverse, rbx un+fn-3, rbp qp, r8 d0, r9:r10 top two remainder limbs,
 C   r11 d1, r12 next dividend limb to fetch, r13 fn, r15 return value (msl)
 C Inside the default loop:
 C   rax n2, rcx counter, rsi n1, rdi dinv, rbp qp, r8 d0, r11 d1, r12 up,
 C   r13 fn, r15 msl
 C Inside the NEW loop:
 C   rbx high remainder limb, r14 low remainder limb, rcx counter, rsi -d1,
 C   r9 dinv, rdi and r10 scratch for q and q0

 ifdef(`NEW',`
 C Alternative loop, used only when NEW is defined.
 	lea	(%rbp,%rbx,8), %rbp	C rbp = address of qp[un+fn-3]
 	mov	%rbx, %rcx		C un
 	mov	%r9, %rbx		C rbx = high remainder limb
 	mov	%rdi, %r9		C di
 	mov	%r10, %r14		C r14 = low remainder limb
 	mov	%r11, %rsi
 	neg	%rsi			C -d1
 	ALIGN(16)
 L(loop):
 C q:q0 = di * rbx + rbx:r14, where q is the quotient limb estimate.
 	mov	%r9, %rax		C di		ncp
 	mul	%rbx			C		0, 18
 	add	%r14, %rax		C		4
 	mov	%rax, %r10		C q0		5
 	adc	%rbx, %rdx		C		5
 	mov	%rdx, %rdi		C q		6
 	imul	%rsi, %rdx		C		6
 	mov	%r8, %rax		C		ncp
 	lea	(%rdx, %r14), %rbx	C n1 -= ...	7
 	mul	%rdi			C		7
 C Next low limb: zero while fn > counter, otherwise fetched from the
 C dividend, stepping r12 down by one limb.
 	xor	R32(%r14), R32(%r14)	C
 	cmp	%rcx, %r13		C
 	jg	L(19)			C
 	mov	(%r12), %r14		C
 	sub	$8, %r12		C
 C Subtract the divisor and q * d0 from rbx:r14, matching the q + 1 formed
 C by the inc below.
 L(19):	sub	%r8, %r14		C		ncp
 	sbb	%r11, %rbx		C		9
 	sub	%rax, %r14		C		11
 	sbb	%rdx, %rbx		C		12
 	inc	%rdi			C		7
 C rdx becomes all ones when rbx >= q0 and zero otherwise.  With the mask
 C set, q + 1 drops back to q and the divisor is added back.
 	xor	R32(%rdx), R32(%rdx)	C
 	cmp	%r10, %rbx		C		13
 	mov	%r8, %rax		C d0		ncp
 	adc	$-1, %rdx		C mask		14
 	add	%rdx, %rdi		C q--		15
 	and	%rdx, %rax		C d0 or 0	15
 	and	%r11, %rdx		C d1 or 0	15
 	add	%rax, %r14		C		16
 	adc	%rdx, %rbx		C		16
 	cmp	%r11, %rbx		C		17
 	jae	L(fix)			C high limb >= d1, take the out-of-line check
 L(bck):	mov	%rdi, (%rbp)		C store the quotient limb
 	sub	$8, %rbp		C
 	dec	%rcx
 	jns	L(loop)

 C Hand the remainder to the common exit: r10 low limb, r9 high limb.
 	mov	%r14, %r10
 	mov	%rbx, %r9
 ',`
 C Default loop.  rax:rsi hold the two-limb remainder n2:n1 and rdi holds
 C the inverse computed above.
 	lea	(%rbp,%rbx,8), %rbp	C rbp = address of qp[un+fn-3]
 	mov	%rbx, %rcx		C rcx = counter, un+fn-3 down to 0
 	mov	%r9, %rax		C rax = n2
 	mov	%r10, %rsi		C rsi = n1
 	ALIGN(16)
 L(loop):
 C q:q0 = dinv * n2 + n2:n1, where q is the quotient limb estimate and
 C r10 = q + 1 is the value stored unless the mask below takes one off.
 	mov	%rax, %r14		C		0, 19
 	mul	%rdi			C		0
 	mov	%r11, %r9		C		1
 	add	%rsi, %rax		C		4
 	mov	%rax, %rbx		C q0		5
 	adc	%r14, %rdx		C q		5
 	lea	1(%rdx), %r10		C		6
 	mov	%rdx, %rax		C		6
 	imul	%rdx, %r9		C		6
 	sub	%r9, %rsi		C		10
 C Next low limb: zero while fn > counter, otherwise fetched from the
 C dividend, stepping r12 down by one limb.
 	xor	R32(%r9), R32(%r9)	C
 	mul	%r8			C		7
 	cmp	%rcx, %r13		C
 	jg	L(13)			C
 	mov	(%r12), %r9		C
 	sub	$8, %r12		C
 C Subtract the divisor and q * d0 from rsi:r9.
 L(13):	sub	%r8, %r9		C		ncp
 	sbb	%r11, %rsi		C		11
 	sub	%rax, %r9		C		11
 	sbb	%rdx, %rsi		C		12
 C rax becomes all ones when rsi >= q0 and zero otherwise.  With the mask
 C set, q + 1 drops back to q and the divisor is added back.  The new high
 C remainder limb ends up in rax.
 	cmp	%rbx, %rsi		C		13
 	sbb	%rax, %rax		C		14
 	not	%rax			C		15
 	add	%rax, %r10		C		16
 	mov	%r8, %rbx		C		ncp
 	and	%rax, %rbx		C		16
 	and	%r11, %rax		C		16
 	add	%rbx, %r9		C		17
 	adc	%rsi, %rax		C		18
 	cmp	%rax, %r11		C		19
 	jbe	L(fix)			C high limb >= d1, take the out-of-line check
 L(bck):	mov	%r10, (%rbp)		C store the quotient limb
 	sub	$8, %rbp		C
 	mov	%r9, %rsi		C		18
 	dec	%rcx
 	jns	L(loop)

 C Hand the remainder to the common exit: r10 low limb, r9 high limb.
 	mov	%rsi, %r10
 	mov	%rax, %r9
 ')
 C Common exit.  Store the remainder just above the next unfetched dividend
 C limb: after a full run of the loop r12 has stepped down un-2 limbs, so
 C these are up[0] and up[1]; on the early exit they are up[un-2] and
 C up[un-1].  Then restore the saved registers and return r15 in rax.
 L(6):
 	mov	%r10, 8(%r12)
 	mov	%r9, 16(%r12)
 	pop	%rbx
 	pop	%rbp
 	pop	%r12
 	pop	%r13
 	pop	%r14
 	mov	%r15, %rax
 	pop	%r15
 	ret

 C Top two dividend limbs >= divisor: the most significant quotient limb is
 C 1, and the divisor is subtracted from r9:r10 before continuing.
 L(23):	inc	R32(%r15)
 	sub	%r8, %r10
 	sbb	%r11, %r9
 	jmp	L(2)

 ifdef(`NEW',`
 C Out-of-line check for the NEW loop, entered with the flags of rbx
 C compared against d1 and rbx >= d1.  When rbx > d1, or r14 >= d0, bump the
 C quotient limb and subtract the divisor once more.
 L(fix):	seta	%dl
 	cmp	%r8, %r14
 	setae	%al
 	orb	%dl, %al
 	je	L(bck)
 	inc	%rdi
 	sub	%r8, %r14
 	sbb	%r11, %rbx
 	jmp	L(bck)
 ',`
 C Out-of-line check for the default loop, entered with the flags of d1
 C compared against rax and rax >= d1.  Unless rax equals d1 and r9 < d0,
 C bump the quotient limb and subtract the divisor once more.
 L(fix):	jb	L(88)
 	cmp	%r8, %r9
 	jb	L(bck)
 L(88):	inc	%r10
 	sub	%r8, %r9
 	sbb	%r11, %rax
 	jmp	L(bck)
 ')
 EPILOGUE()
	dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

	dnl Copyright 2007, 2008 Free Software Foundation, Inc.

	dnl This file is part of the GNU MP Library.

	dnl The GNU MP Library is free software; you can redistribute it and/or modify
	dnl it under the terms of the GNU Lesser General Public License as published
	dnl by the Free Software Foundation; either version 3 of the License, or (at
	dnl your option) any later version.

	dnl The GNU MP Library is distributed in the hope that it will be useful, but
	dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
	dnl License for more details.

	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')


	C norm frac
	C K8 20 20
	C P4 73 73
	C P6-15 37 37

	C TODO
	C * Perhaps compute the inverse without relying on divq? Could either use
	C Newton's method and mulq, or perhaps the faster fdiv.
	C * The loop has not been carefully tuned, nor analysed for critical path
	C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for
	C mpn_divrem_1.
	C * Clean up. This code is really crude.


	C INPUT PARAMETERS
	C NOTE(review): these defines repeat, with whitespace collapsed, the
	C identical defines that appear earlier in this file.
	C Symbolic names for the registers that hold the five arguments on entry.
	C The instructions visible in this file spell the registers out directly
	C and never use these names, so they act as interface documentation only.
	C   qp        where the quotient limbs are stored
	C   fn        number of extra low quotient limbs developed from zero limbs
	C   up_param  the dividend limbs; the remainder is stored back in this area
	C   un_param  number of dividend limbs
	C   dp        the two divisor limbs, low limb first
	define(`qp', `%rdi')
	define(`fn', `%rsi')
	define(`up_param', `%rdx')
	define(`un_param', `%rcx')
	define(`dp', `%r8')

	C Only the NEW variant of the loop keeps the inverse in r9; the default
	C loop multiplies by rdi instead.
	define(`dinv', `%r9')


	C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
	C cnt qp d dinv

	C NOTE(review): everything from here to EPILOGUE repeats, with whitespace
	C collapsed, the mpn_divrem_2 definition that appears earlier in this
	C file.  Presumably assembling both would define the function and its
	C local labels twice -- confirm whether this copy is meant to exist.
	C
	C mpn_divrem_2 -- divide the un-limb number at up, extended below by fn
	C zero limbs, by the two-limb divisor at dp.
	C
	C On entry (register names as given by the INPUT PARAMETERS defines):
	C   rdi  qp  where quotient limbs are stored
	C   rsi  fn  number of low quotient limbs developed from zero limbs
	C   rdx  up  dividend limbs, least significant first
	C   rcx  un  number of dividend limbs
	C   r8   dp  divisor, d0 at offset 0 and d1 at offset 8
	C On return rax holds the most significant quotient limb, which this code
	C only ever sets to 0 or 1.  The remaining un+fn-2 quotient limbs are stored
	C at qp[0] .. qp[un+fn-3] and the two remainder limbs are written back into
	C the up area.
	C
	C The top bit of d1 must be set: the divq used to build the inverse
	C overflows otherwise.  The top two dividend limbs are read
	C unconditionally, so un >= 2 is assumed -- TODO confirm against callers.
	C rbx, rbp and r12-r15 are pushed on entry and popped before returning.
	ASM_START()
	TEXT
	ALIGN(16)
	PROLOGUE(mpn_divrem_2)

	C Save callee-saved registers, interleaved with loading the operands.
	push %r15
	lea (%rdx,%rcx,8), %rax C rax = up + 8*un, one past the top limb
	push %r14
	push %r13
	mov %rsi, %r13 C r13 = fn
	push %r12
	lea -24(%rax), %r12 C r12 = address of up[un-3], next limb to fetch
	push %rbp
	mov %rdi, %rbp C rbp = qp
	push %rbx
	mov 8(%r8), %r11 C r11 = d1, high divisor limb
	mov -8(%rax), %r9 C r9 = up[un-1]
	mov (%r8), %r8 C r8 = d0, low divisor limb
	mov -16(%rax), %r10 C r10 = up[un-2]
	xor R32(%r15), R32(%r15) C r15 = 0, the return value so far
	C If the top two dividend limbs r9:r10 are >= the divisor r11:r8, branch
	C to L(23), which subtracts the divisor once and sets the return value to 1.
	cmp %r9, %r11
	ja L(2) C d1 > up[un-1], nothing to subtract
	setb %dl C dl = (d1 < up[un-1])
	cmp %r10, %r8
	setbe %al C al = (d0 <= up[un-2])
	or %al, %dl
	jne L(23)
	L(2):
	lea -3(%rcx,%r13), %rbx C un + fn - 3
	test %rbx, %rbx C lea leaves the flags alone, so test explicitly
	js L(6) C no more quotient limbs, store remainder and return
	C Build the inverse.  First divide (~d1):(all ones) by d1 with divq.  The
	C quotient fits in 64 bits only when ~d1 < d1, i.e. when the top bit of d1
	C is set.
	mov %r11, %rdx
	mov $-1, %rax
	not %rdx
	div %r11
	mov %r11, %rdx
	mov %rax, %rdi C rdi = candidate inverse
	imul %rax, %rdx C low limb of d1 * inverse
	mov %rdx, %r14
	mul %r8 C rdx:rax = inverse * d0
	mov %rdx, %rcx C keep only the high limb of that product
	C Accumulate d0 and that high limb onto r14, with rdx starting at -1 and
	C collecting the carries.  Then decrement the inverse and subtract d1 from
	C rdx:r14 until the sign bit of rdx is set.
	C NOTE(review): presumably this corrects the one-limb inverse for the
	C presence of d0 -- confirm against the C reference implementation.
	mov $-1, %rdx
	add %r8, %r14
	adc $0, %rdx
	add %rcx, %r14
	adc $0, %rdx
	js L(8)
	L(18):
	dec %rdi
	sub %r11, %r14
	sbb $0, %rdx
	jns L(18)
	L(8):

	C Register use on entry to both loop variants:
	C   rdi inverse, rbx un+fn-3, rbp qp, r8 d0, r9:r10 top two remainder limbs,
	C   r11 d1, r12 next dividend limb to fetch, r13 fn, r15 return value (msl)
	C Inside the default loop:
	C   rax n2, rcx counter, rsi n1, rdi dinv, rbp qp, r8 d0, r11 d1, r12 up,
	C   r13 fn, r15 msl
	C Inside the NEW loop:
	C   rbx high remainder limb, r14 low remainder limb, rcx counter, rsi -d1,
	C   r9 dinv, rdi and r10 scratch for q and q0

	ifdef(`NEW',`
	C Alternative loop, used only when NEW is defined.
	lea (%rbp,%rbx,8), %rbp C rbp = address of qp[un+fn-3]
	mov %rbx, %rcx C un
	mov %r9, %rbx C rbx = high remainder limb
	mov %rdi, %r9 C di
	mov %r10, %r14 C r14 = low remainder limb
	mov %r11, %rsi
	neg %rsi C -d1
	ALIGN(16)
	L(loop):
	C q:q0 = di * rbx + rbx:r14, where q is the quotient limb estimate.
	mov %r9, %rax C di ncp
	mul %rbx C 0, 18
	add %r14, %rax C 4
	mov %rax, %r10 C q0 5
	adc %rbx, %rdx C 5
	mov %rdx, %rdi C q 6
	imul %rsi, %rdx C 6
	mov %r8, %rax C ncp
	lea (%rdx, %r14), %rbx C n1 -= ... 7
	mul %rdi C 7
	C Next low limb: zero while fn > counter, otherwise fetched from the
	C dividend, stepping r12 down by one limb.
	xor R32(%r14), R32(%r14) C
	cmp %rcx, %r13 C
	jg L(19) C
	mov (%r12), %r14 C
	sub $8, %r12 C
	C Subtract the divisor and q * d0 from rbx:r14, matching the q + 1 formed
	C by the inc below.
	L(19): sub %r8, %r14 C ncp
	sbb %r11, %rbx C 9
	sub %rax, %r14 C 11
	sbb %rdx, %rbx C 12
	inc %rdi C 7
	C rdx becomes all ones when rbx >= q0 and zero otherwise.  With the mask
	C set, q + 1 drops back to q and the divisor is added back.
	xor R32(%rdx), R32(%rdx) C
	cmp %r10, %rbx C 13
	mov %r8, %rax C d0 ncp
	adc $-1, %rdx C mask 14
	add %rdx, %rdi C q-- 15
	and %rdx, %rax C d0 or 0 15
	and %r11, %rdx C d1 or 0 15
	add %rax, %r14 C 16
	adc %rdx, %rbx C 16
	cmp %r11, %rbx C 17
	jae L(fix) C high limb >= d1, take the out-of-line check
	L(bck): mov %rdi, (%rbp) C store the quotient limb
	sub $8, %rbp C
	dec %rcx
	jns L(loop)

	C Hand the remainder to the common exit: r10 low limb, r9 high limb.
	mov %r14, %r10
	mov %rbx, %r9
	',`
	C Default loop.  rax:rsi hold the two-limb remainder n2:n1 and rdi holds
	C the inverse computed above.
	lea (%rbp,%rbx,8), %rbp C rbp = address of qp[un+fn-3]
	mov %rbx, %rcx C rcx = counter, un+fn-3 down to 0
	mov %r9, %rax C rax = n2
	mov %r10, %rsi C rsi = n1
	ALIGN(16)
	L(loop):
	C q:q0 = dinv * n2 + n2:n1, where q is the quotient limb estimate and
	C r10 = q + 1 is the value stored unless the mask below takes one off.
	mov %rax, %r14 C 0, 19
	mul %rdi C 0
	mov %r11, %r9 C 1
	add %rsi, %rax C 4
	mov %rax, %rbx C q0 5
	adc %r14, %rdx C q 5
	lea 1(%rdx), %r10 C 6
	mov %rdx, %rax C 6
	imul %rdx, %r9 C 6
	sub %r9, %rsi C 10
	C Next low limb: zero while fn > counter, otherwise fetched from the
	C dividend, stepping r12 down by one limb.
	xor R32(%r9), R32(%r9) C
	mul %r8 C 7
	cmp %rcx, %r13 C
	jg L(13) C
	mov (%r12), %r9 C
	sub $8, %r12 C
	C Subtract the divisor and q * d0 from rsi:r9.
	L(13): sub %r8, %r9 C ncp
	sbb %r11, %rsi C 11
	sub %rax, %r9 C 11
	sbb %rdx, %rsi C 12
	C rax becomes all ones when rsi >= q0 and zero otherwise.  With the mask
	C set, q + 1 drops back to q and the divisor is added back.  The new high
	C remainder limb ends up in rax.
	cmp %rbx, %rsi C 13
	sbb %rax, %rax C 14
	not %rax C 15
	add %rax, %r10 C 16
	mov %r8, %rbx C ncp
	and %rax, %rbx C 16
	and %r11, %rax C 16
	add %rbx, %r9 C 17
	adc %rsi, %rax C 18
	cmp %rax, %r11 C 19
	jbe L(fix) C high limb >= d1, take the out-of-line check
	L(bck): mov %r10, (%rbp) C store the quotient limb
	sub $8, %rbp C
	mov %r9, %rsi C 18
	dec %rcx
	jns L(loop)

	C Hand the remainder to the common exit: r10 low limb, r9 high limb.
	mov %rsi, %r10
	mov %rax, %r9
	')
	C Common exit.  Store the remainder just above the next unfetched dividend
	C limb: after a full run of the loop r12 has stepped down un-2 limbs, so
	C these are up[0] and up[1]; on the early exit they are up[un-2] and
	C up[un-1].  Then restore the saved registers and return r15 in rax.
	L(6):
	mov %r10, 8(%r12)
	mov %r9, 16(%r12)
	pop %rbx
	pop %rbp
	pop %r12
	pop %r13
	pop %r14
	mov %r15, %rax
	pop %r15
	ret

	C Top two dividend limbs >= divisor: the most significant quotient limb is
	C 1, and the divisor is subtracted from r9:r10 before continuing.
	L(23): inc R32(%r15)
	sub %r8, %r10
	sbb %r11, %r9
	jmp L(2)

	ifdef(`NEW',`
	C Out-of-line check for the NEW loop, entered with the flags of rbx
	C compared against d1 and rbx >= d1.  When rbx > d1, or r14 >= d0, bump the
	C quotient limb and subtract the divisor once more.
	L(fix): seta %dl
	cmp %r8, %r14
	setae %al
	orb %dl, %al
	je L(bck)
	inc %rdi
	sub %r8, %r14
	sbb %r11, %rbx
	jmp L(bck)
	',`
	C Out-of-line check for the default loop, entered with the flags of d1
	C compared against rax and rax >= d1.  Unless rax equals d1 and r9 < d0,
	C bump the quotient limb and subtract the divisor once more.
	L(fix): jb L(88)
	cmp %r8, %r9
	jb L(bck)
	L(88): inc %r10
	sub %r8, %r9
	sbb %r11, %rax
	jmp L(bck)
	')
	EPILOGUE()