dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t inverse,
C unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code. "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit
C here, since detecting an overflow of q1+1 when q1=0xFFFFFFFF would
C cost too much.
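C
C Illustrated in C, one division step of this method is roughly as
C follows (a sketch, not part of the build; uint32_t/uint64_t as in
C <stdint.h>, d the normalized divisor, m the inverse, n2 the running
C remainder with n2<d, n10 the next normalized limb, n1 its high bit):
C
C     uint32_t n1   = n10 >> 31;
C     uint32_t nadj = n10 + (-n1 & d);        /* n10+(n1?d:0) */
C     uint32_t q1   = (uint32_t)
C       ((((uint64_t) n2 << 32 | nadj) + (uint64_t) m * (n2 + n1)) >> 32);
C     uint64_t r    = ((uint64_t) n2 << 32 | n10) - d - (uint64_t) q1 * d;
C     uint32_t mask = (uint32_t) (r >> 32);   /* 0 if q1+1 was right, else ~0 */
C     q  = q1 + 1 + mask;                     /* quotient limb */
C     n2 = (uint32_t) r + (mask & d);         /* new remainder, with addback */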
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The value stored to the destination high limb is 0 when a divide is
C skipped, or a copy of the src high limb when it's not.  The latter is
C in case src==dst.
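C
C In C terms the skip logic is roughly (a sketch, using the parameter
C names from the prototypes above):
C
C     mp_limb_t high = src[size-1];
C     int       skip = (high < divisor);
C     dst[xsize+size-1] = skip ? 0 : high;    /* copy, in case src==dst */
C     carry = skip ? high : 0;
C     size -= skip;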
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain suggests it could
C be done in 30.  It's not clear where the extra two cycles come from.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case. It doesn't seem worth checking for this though, since it
C probably occurs infrequently.
dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
dnl
dnl The inverse takes about 80-90 cycles to calculate, but after that the
dnl multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl At 4 limbs the div is a touch faster than the mul (and of course
dnl simpler), so start the mul from 5 limbs.
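dnl
dnl For reference, equating total cycles 85 + 32*n = 58*n puts the
dnl crossover near n = 85/26, about 3.3 limbs; presumably the extra
dnl entry/exit code in the mul path accounts for the measured crossover
dnl being between 4 and 5.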
deflit(MUL_THRESHOLD, 5)
defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC, 12)
defframe(PARAM_XSIZE, 8)
defframe(PARAM_DST, 4)
dnl re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')
TEXT
ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
xorl %edx, %edx C carry if can't skip a div
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
movl -4(%esi,%ecx,4), %eax C src high limb
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
movd PARAM_PREINV_INVERSE, %mm4
movd PARAM_PREINV_SHIFT, %mm7 C l
cmpl %ebp, %eax C high cmp divisor
cmovc( %eax, %edx) C high is carry if high<divisor
movd %edx, %mm0 C carry
movd %edx, %mm1 C carry
movl $0, %edx C (movl not xorl, the carry flag is still needed)
movd %ebp, %mm5 C d
cmovnc( %eax, %edx) C 0 if skip div, src high if not
C (the latter in case src==dst)
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
movl %edx, (%edi,%ecx,4) C dst high limb
sbbl $0, %ecx C skip one division if high<divisor
movl $32, %eax
subl PARAM_PREINV_SHIFT, %eax
psllq %mm7, %mm5 C d normalized
leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
leal -4(%esi,%ecx,4), %esi C &src[size-1]
movd %eax, %mm6 C 32-l
jmp L(start_preinv)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
movl PARAM_CARRY, %edx
movl PARAM_SIZE, %ecx
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
jmp L(start_1c)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
xorl %edx, %edx C initial carry (if can't skip a div)
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
orl %ecx, %ecx C size
jz L(no_skip_div) C if size==0
movl -4(%esi,%ecx,4), %eax C src high limb
cmpl %ebp, %eax C high cmp divisor
cmovnc( %eax, %edx) C 0 if skip div, src high if not
movl %edx, (%edi,%ecx,4) C dst high limb
movl $0, %edx C (movl not xorl, the carry flag is still needed)
cmovc( %eax, %edx) C high is carry if high<divisor
sbbl $0, %ecx C size-1 if high<divisor
L(no_skip_div):
L(start_1c):
C eax
C ebx xsize
C ecx size
C edx carry
C esi src
C edi &dst[xsize-1]
C ebp divisor
leal (%ebx,%ecx), %eax C size+xsize
leal -4(%esi,%ecx,4), %esi C &src[size-1]
leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
cmpl $MUL_THRESHOLD, %eax
jae L(mul_by_inverse)
orl %ecx, %ecx
jz L(divide_no_integer) C if size==0
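C
C In C terms the two divl loops below are roughly (a sketch; udiv_qrnnd
C from longlong.h divides the two-limb value n1:n0 by d):
C
C     for (i = size-1; i >= 0; i--)           /* integer part */
C       udiv_qrnnd (dst[xsize+i], carry, carry, src[i], divisor);
C     for (i = xsize-1; i >= 0; i--)          /* fraction part */
C       udiv_qrnnd (dst[i], carry, carry, 0, divisor);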
L(divide_integer):
C eax scratch (quotient)
C ebx xsize
C ecx counter
C edx carry
C esi src, decrementing
C edi dst, decrementing
C ebp divisor
movl (%esi), %eax
subl $4, %esi
divl %ebp
movl %eax, (%edi)
subl $4, %edi
subl $1, %ecx
jnz L(divide_integer)
L(divide_no_integer):
orl %ebx, %ebx
jnz L(divide_fraction) C if xsize!=0
L(divide_done):
movl SAVE_ESI, %esi
movl SAVE_EDI, %edi
movl SAVE_EBX, %ebx
movl SAVE_EBP, %ebp
movl %edx, %eax
ret
L(divide_fraction):
C eax scratch (quotient)
C ebx counter
C ecx
C edx carry
C esi
C edi dst, decrementing
C ebp divisor
movl $0, %eax
divl %ebp
movl %eax, (%edi)
subl $4, %edi
subl $1, %ebx
jnz L(divide_fraction)
jmp L(divide_done)
C -----------------------------------------------------------------------------
L(mul_by_inverse):
C eax
C ebx xsize
C ecx size
C edx carry
C esi &src[size-1]
C edi &dst[size+xsize-1]
C ebp divisor
bsrl %ebp, %eax C 31-l
movd %edx, %mm0 C carry
movd %edx, %mm1 C carry
movl %ecx, %edx C size
movl $31, %ecx
C
xorl %eax, %ecx C l = leading zeros on d
addl $1, %eax
shll %cl, %ebp C d normalized
movd %ecx, %mm7 C l
movl %edx, %ecx C size
movd %eax, %mm6 C 32-l
movl $-1, %edx
movl $-1, %eax
C
subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
divl %ebp C floor (b*(b-d)-1 / d)
movd %ebp, %mm5 C d
C
movd %eax, %mm4 C m
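C
C In C terms the setup above computes roughly (a sketch; b = 2^32,
C count_leading_zeros as in longlong.h, d a uint32_t):
C
C     count_leading_zeros (l, d);
C     d <<= l;                                /* normalize */
C     m = (uint32_t) ((((uint64_t) -d << 32) - 1) / d);
C                                             /* floor((b*(b-d)-1)/d) */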
L(start_preinv):
C eax inverse
C ebx xsize
C ecx size
C edx
C esi &src[size-1]
C edi &dst[size+xsize-1]
C ebp
C
C mm0 carry
C mm1 carry
C mm2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
psllq %mm7, %mm0 C n2 = carry << l, for size==0
subl $1, %ecx
jb L(integer_none) C if size==0
movd (%esi), %mm0 C src high limb
punpckldq %mm1, %mm0
psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
jz L(integer_last) C if size==1 (eflags still from the subl above)
C The dependent chain here consists of
C
C 2 paddd n1+n2
C 8 pmuludq m*(n1+n2)
C 2 paddq n2:nadj + m*(n1+n2)
C 2 psrlq q1
C 8 pmuludq d*q1
C 2 psubq (n-d)-q1*d
C 2 psrlq high n-(q1+1)*d mask
C 2 pand d masked
C 2 paddd n2+d addback
C --
C 30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.
ALIGN(16)
L(integer_top):
C eax
C ebx
C ecx counter, size-1 to 0
C edx
C esi src, decrementing
C edi dst, decrementing
C
C mm0 n2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movd -4(%esi), %mm1 C next src limbs
movd (%esi), %mm2
leal -4(%esi), %esi
punpckldq %mm2, %mm1
psrlq %mm6, %mm1 C n10
movq %mm1, %mm2 C n10
movq %mm1, %mm3 C n10
psrad $31, %mm1 C -n1
pand %mm5, %mm1 C -n1 & d
paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
psrld $31, %mm2 C n1
paddd %mm0, %mm2 C n2+n1
punpckldq %mm0, %mm1 C n2:nadj
pmuludq %mm4, %mm2 C m*(n2+n1)
C
paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
pxor %mm2, %mm2 C break dependency, saves 4 cycles
pcmpeqd %mm2, %mm2 C FF...FF
psrlq $63, %mm2 C 1
psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
paddd %mm1, %mm2 C q1+1
pmuludq %mm5, %mm1 C q1*d
punpckldq %mm0, %mm3 C n = n2:n10
pxor %mm0, %mm0
psubq %mm5, %mm3 C n - d
C
psubq %mm1, %mm3 C n - (q1+1)*d
por %mm3, %mm0 C copy remainder -> new n2
psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm3, %eax
addl $1, %eax
cmpl $1, %eax')
paddd %mm3, %mm2 C q
pand %mm5, %mm3 C mask & d
paddd %mm3, %mm0 C addback if necessary
movd %mm2, (%edi)
leal -4(%edi), %edi
subl $1, %ecx
ja L(integer_top)
L(integer_last):
C eax
C ebx xsize
C ecx
C edx
C esi &src[0]
C edi &dst[xsize]
C
C mm0 n2
C mm4 m
C mm5 d
C mm6
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movd (%esi), %mm1 C src[0]
psllq %mm7, %mm1 C n10
movq %mm1, %mm2 C n10
movq %mm1, %mm3 C n10
psrad $31, %mm1 C -n1
pand %mm5, %mm1 C -n1 & d
paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
psrld $31, %mm2 C n1
paddd %mm0, %mm2 C n2+n1
punpckldq %mm0, %mm1 C n2:nadj
pmuludq %mm4, %mm2 C m*(n2+n1)
C
paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
pcmpeqd %mm2, %mm2 C FF...FF
psrlq $63, %mm2 C 1
psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
paddd %mm1, %mm2 C q1+1
pmuludq %mm5, %mm1 C q1*d
punpckldq %mm0, %mm3 C n
psubq %mm5, %mm3 C n - d
pxor %mm0, %mm0
C
psubq %mm1, %mm3 C n - (q1+1)*d
por %mm3, %mm0 C remainder -> n2
psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm3, %eax
addl $1, %eax
cmpl $1, %eax')
paddd %mm3, %mm2 C q
pand %mm5, %mm3 C mask & d
paddd %mm3, %mm0 C addback if necessary
movd %mm2, (%edi)
leal -4(%edi), %edi
L(integer_none):
C eax
C ebx xsize
orl %ebx, %ebx
jnz L(fraction_some) C if xsize!=0
L(fraction_done):
movl SAVE_EBP, %ebp
psrld %mm7, %mm0 C remainder, shifted back down by l
movl SAVE_EDI, %edi
movd %mm0, %eax
movl SAVE_ESI, %esi
movl SAVE_EBX, %ebx
emms
ret
C -----------------------------------------------------------------------------
C
L(fraction_some):
C eax
C ebx xsize
C ecx
C edx
C esi
C edi &dst[xsize-1]
C ebp
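C
C With the source limb implicitly zero (n10=0), each step below
C simplifies to roughly (C sketch, same conventions as the integer
C loop):
C
C     uint32_t q1   = n2 + (uint32_t) (((uint64_t) m * n2) >> 32);
C     uint64_t r    = ((uint64_t) n2 << 32) - d - (uint64_t) q1 * d;
C     uint32_t mask = (uint32_t) (r >> 32);   /* 0 or ~0 */
C     q  = q1 + 1 + mask;
C     n2 = (uint32_t) r + (mask & d);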
L(fraction_top):
C eax
C ebx counter, xsize iterations
C ecx
C edx
C esi src, decrementing
C edi dst, decrementing
C
C mm0 n2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movq %mm0, %mm1 C n2
pmuludq %mm4, %mm0 C m*n2
pcmpeqd %mm2, %mm2
psrlq $63, %mm2
C
psrlq $32, %mm0 C high(m*n2)
paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
paddd %mm0, %mm2 C q1+1
pmuludq %mm5, %mm0 C q1*d
psllq $32, %mm1 C n = n2:0
psubq %mm5, %mm1 C n - d
C
psubq %mm0, %mm1 C r = n - (q1+1)*d
pxor %mm0, %mm0
por %mm1, %mm0 C r -> n2
psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm1, %eax
addl $1, %eax
cmpl $1, %eax')
paddd %mm1, %mm2 C q
pand %mm5, %mm1 C mask & d
paddd %mm1, %mm0 C addback if necessary
movd %mm2, (%edi)
leal -4(%edi), %edi
subl $1, %ebx
jne L(fraction_top)
jmp L(fraction_done)
EPILOGUE()