gcc/gmp/mpn/x86/mod_1.asm - native_client/nacl-toolchain - Git at Google

 dnl  x86 mpn_mod_1 -- mpn by limb remainder.

 dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
 dnl  Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or
 dnl  modify it under the terms of the GNU Lesser General Public License as
 dnl  published by the Free Software Foundation; either version 3 of the
 dnl  License, or (at your option) any later version.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful,
 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 dnl  Lesser General Public License for more details.
 dnl
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')


 C      cycles/limb
 C 486     42 approx, maybe
 C P5      44
 C P6      39
 C K6      20
 C K7      41
 C P4      58


 C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
 C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
 C                       mp_limb_t carry);
 C
 C Essentially this code is the same as the division based part of
 C mpn/generic/mod_1.c, but has the advantage that we get the desired divl
 C instruction even when gcc is not being used (where longlong.h only has the
 C rather slow generic C udiv_qrnnd().
 C
 C A test is done to see if the high limb is less than the divisor, and if so
 C one less div is done.  A div is between 20 and 40 cycles on the various
 C x86s, so assuming high<divisor about half the time, then this test saves
 C half that amount.  The branch misprediction penalty on each chip is less
 C than half a div.
 C
 C
 C Notes for K6:
 C
 C Back-to-back div instructions take 20 cycles, the same as the loop here,
 C so it seems there's nothing to gain by rearranging.  Pairing the mov and
 C loop instructions was found to gain nothing.  Normally we use a loop
 C instruction rather than decl/jnz, but it gains nothing here.
 C
 C A multiply-by-inverse is used in mpn/x86/k6/pre_mod_1.asm, but it saves
 C only 2 c/l so currently we haven't bothered with the same for mpn_mod_1.
 C If an inverse takes about 40 cycles for normalized or perhaps 60 for
 C unnormalized (due to bsfl being slow on k6) then the threshold would be at
 C least 20 or 30 limbs.
 C

 defframe(PARAM_CARRY,  16)
 defframe(PARAM_DIVISOR,12)
 defframe(PARAM_SIZE,   8)
 defframe(PARAM_SRC,    4)

 	TEXT

 	ALIGN(16)
 PROLOGUE(mpn_mod_1)
 deflit(`FRAME',0)

 	movl	PARAM_SIZE, %ecx
 	pushl	%ebx		FRAME_pushl()

 	movl	PARAM_SRC, %ebx
 	pushl	%esi		FRAME_pushl()

 	orl	%ecx, %ecx
 	jz	L(done_zero)

 	movl	PARAM_DIVISOR, %esi
 	movl	-4(%ebx,%ecx,4), %eax	C src high limb

 	cmpl	%esi, %eax

 	sbbl	%edx, %edx		C -1 if high<divisor

 	addl	%edx, %ecx		C skip one division if high<divisor
 	jz	L(done_eax)

 	andl	%eax, %edx		C carry if high<divisor


 L(top):
 	C eax	scratch (quotient)
 	C ebx	src
 	C ecx	counter
 	C edx	carry (remainder)
 	C esi	divisor
 	C edi
 	C ebp

 	movl	-4(%ebx,%ecx,4), %eax

 	divl	%esi

 	decl	%ecx
 	jnz	L(top)


 	movl	%edx, %eax
 L(done_eax):
 	popl	%esi

 	popl	%ebx

 	ret

 EPILOGUE()


 	C This code located after mpn_mod_1, so the jump to L(top) here is
 	C back and hence will be predicted as taken.  (size==0 is considered
 	C unlikely.)

 	ALIGN(16)
 PROLOGUE(mpn_mod_1c)
 deflit(`FRAME',0)

 	movl	PARAM_SIZE, %ecx
 	pushl	%ebx		FRAME_pushl()

 	movl	PARAM_SRC, %ebx
 	pushl	%esi		FRAME_pushl()

 	movl	PARAM_DIVISOR, %esi
 	orl	%ecx, %ecx

 	movl	PARAM_CARRY, %edx
 	jnz	L(top)

 	popl	%esi
 	movl	%edx, %eax

 	popl	%ebx

 	ret


 	C This code is for mpn_mod_1, but is positioned here to save some
 	C space in the alignment padding.
 	C
 L(done_zero):
 	popl	%esi
 	xorl	%eax, %eax

 	popl	%ebx

 	ret

 EPILOGUE()
	dnl x86 mpn_mod_1 -- mpn by limb remainder.

	dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
	dnl Inc.
	dnl
	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or
	dnl modify it under the terms of the GNU Lesser General Public License as
	dnl published by the Free Software Foundation; either version 3 of the
	dnl License, or (at your option) any later version.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful,
	dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
	dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	dnl Lesser General Public License for more details.
	dnl
	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')


	C cycles/limb
	C 486 42 approx, maybe
	C P5 44
	C P6 39
	C K6 20
	C K7 41
	C P4 58


	C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
	C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
	C mp_limb_t carry);
	C
	C Essentially this code is the same as the division based part of
	C mpn/generic/mod_1.c, but has the advantage that we get the desired divl
	C instruction even when gcc is not being used (where longlong.h only has the
	C rather slow generic C udiv_qrnnd().
	C
	C A test is done to see if the high limb is less than the divisor, and if so
	C one less div is done. A div is between 20 and 40 cycles on the various
	C x86s, so assuming high<divisor about half the time, then this test saves
	C half that amount. The branch misprediction penalty on each chip is less
	C than half a div.
	C
	C
	C Notes for K6:
	C
	C Back-to-back div instructions take 20 cycles, the same as the loop here,
	C so it seems there's nothing to gain by rearranging. Pairing the mov and
	C loop instructions was found to gain nothing. Normally we use a loop
	C instruction rather than decl/jnz, but it gains nothing here.
	C
	C A multiply-by-inverse is used in mpn/x86/k6/pre_mod_1.asm, but it saves
	C only 2 c/l so currently we haven't bothered with the same for mpn_mod_1.
	C If an inverse takes about 40 cycles for normalized or perhaps 60 for
	C unnormalized (due to bsfl being slow on k6) then the threshold would be at
	C least 20 or 30 limbs.
	C

	defframe(PARAM_CARRY, 16)
	defframe(PARAM_DIVISOR,12)
	defframe(PARAM_SIZE, 8)
	defframe(PARAM_SRC, 4)

	TEXT

	ALIGN(16)
	PROLOGUE(mpn_mod_1)
	deflit(`FRAME',0)

	movl PARAM_SIZE, %ecx
	pushl %ebx FRAME_pushl()

	movl PARAM_SRC, %ebx
	pushl %esi FRAME_pushl()

	orl %ecx, %ecx
	jz L(done_zero)

	movl PARAM_DIVISOR, %esi
	movl -4(%ebx,%ecx,4), %eax C src high limb

	cmpl %esi, %eax

	sbbl %edx, %edx C -1 if high<divisor

	addl %edx, %ecx C skip one division if high<divisor
	jz L(done_eax)

	andl %eax, %edx C carry if high<divisor


	L(top):
	C eax scratch (quotient)
	C ebx src
	C ecx counter
	C edx carry (remainder)
	C esi divisor
	C edi
	C ebp

	movl -4(%ebx,%ecx,4), %eax

	divl %esi

	decl %ecx
	jnz L(top)


	movl %edx, %eax
	L(done_eax):
	popl %esi

	popl %ebx

	ret

	EPILOGUE()


	C This code located after mpn_mod_1, so the jump to L(top) here is
	C back and hence will be predicted as taken. (size==0 is considered
	C unlikely.)

	ALIGN(16)
	PROLOGUE(mpn_mod_1c)
	deflit(`FRAME',0)

	movl PARAM_SIZE, %ecx
	pushl %ebx FRAME_pushl()

	movl PARAM_SRC, %ebx
	pushl %esi FRAME_pushl()

	movl PARAM_DIVISOR, %esi
	orl %ecx, %ecx

	movl PARAM_CARRY, %edx
	jnz L(top)

	popl %esi
	movl %edx, %eax

	popl %ebx

	ret


	C This code is for mpn_mod_1, but is positioned here to save some
	C space in the alignment padding.
	C
	L(done_zero):
	popl %esi
	xorl %eax, %eax

	popl %ebx

	ret

	EPILOGUE()