gcc/gmp/mpn/x86/k6/gcd_1.asm - native_client/nacl-toolchain - Git at Google

 dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.

 dnl  Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or
 dnl  modify it under the terms of the GNU Lesser General Public License as
 dnl  published by the Free Software Foundation; either version 3 of the
 dnl  License, or (at your option) any later version.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful,
 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 dnl  Lesser General Public License for more details.
 dnl
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')


 C K6: 9.5 cycles/bit (approx)   1x1 gcd
 C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)


 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
 C
 C This code is nothing very special, but offers a speedup over what gcc 2.95
 C can do with mpn/generic/gcd_1.c.
 C
 C Future:
 C
 C Using a lookup table to count trailing zeros seems a touch quicker, but
 C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
 C it too.


 dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
 dnl  bigger than y, then a division x%y is done to reduce it.
 dnl
 dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
 dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
 dnl  what's found.

 deflit(DIV_THRESHOLD, 5)


 defframe(PARAM_LIMB, 12)
 defframe(PARAM_SIZE,  8)
 defframe(PARAM_SRC,   4)

 	TEXT
 	ALIGN(16)

 PROLOGUE(mpn_gcd_1)
 deflit(`FRAME',0)

 	ASSERT(ne, `cmpl $0, PARAM_LIMB')
 	ASSERT(ae, `cmpl $1, PARAM_SIZE')


 	movl	PARAM_SRC, %eax
 	pushl	%ebx			FRAME_pushl()

 	movl	PARAM_LIMB, %edx
 	movl	$-1, %ecx

 	movl	(%eax), %ebx		C src low limb

 	movl	%ebx, %eax		C src low limb
 	orl	%edx, %ebx

 L(common_twos):
 	shrl	%ebx
 	incl	%ecx

 	jnc	L(common_twos)		C 1/4 chance on random data
 	shrl	%cl, %edx		C y

 	cmpl	$1, PARAM_SIZE
 	ja	L(size_two_or_more)


 	ASSERT(nz, `orl	%eax, %eax')	C should have src limb != 0

 	shrl	%cl, %eax		C x


 	C Swap if necessary to make x>=y.  Measures a touch quicker as a
 	C jump than a branch free calculation.
 	C
 	C eax	x
 	C ebx
 	C ecx	common twos
 	C edx	y

 	movl	%eax, %ebx
 	cmpl	%eax, %edx

 	jb	L(noswap)
 	movl	%edx, %eax

 	movl	%ebx, %edx
 	movl	%eax, %ebx
 L(noswap):


 	C See if it's worth reducing x with a divl.
 	C
 	C eax	x
 	C ebx	x
 	C ecx	common twos
 	C edx	y

 	shrl	$DIV_THRESHOLD, %ebx

 	cmpl	%ebx, %edx
 	ja	L(nodiv)


 	C Reduce x to x%y.
 	C
 	C eax	x
 	C ebx
 	C ecx	common twos
 	C edx	y

 	movl	%edx, %ebx
 	xorl	%edx, %edx

 	divl	%ebx

 	orl	%edx, %edx	C y
 	nop	C code alignment

 	movl	%ebx, %eax	C x
 	jz	L(done_shll)
 L(nodiv):


 	C eax	x
 	C ebx
 	C ecx	common twos
 	C edx	y
 	C esi
 	C edi
 	C ebp

 L(strip_y):
 	shrl	%edx
 	jnc	L(strip_y)

 	leal	1(%edx,%edx), %edx
 	movl	%ecx, %ebx	C common twos

 	leal	1(%eax), %ecx
 	jmp	L(strip_x_and)


 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
 C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
 C only 1/4.
 C
 C A second computed %cl shift was tried, but that measured a touch slower
 C than branching back.
 C
 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
 C measured about 1 cycle/bit slower.

 	C eax	x
 	C ebx	common twos
 	C ecx	scratch
 	C edx	y

 	ALIGN(4)
 L(swap):
 	addl	%eax, %edx	C x-y+y = x
 	negl	%eax		C -(x-y) = y-x

 L(strip_x):
 	shrl	%eax		C odd-odd = even, so always one to strip
 	ASSERT(nz)

 L(strip_x_leal):
 	leal	1(%eax), %ecx

 L(strip_x_and):
 	andl	$1, %ecx	C (x^1)&1

 	shrl	%cl, %eax	C shift if x even

 	testb	$1, %al
 	jz	L(strip_x)

 	ASSERT(nz,`testl $1, %eax')	C x, y odd
 	ASSERT(nz,`testl $1, %edx')

 	subl	%edx, %eax
 	jb	L(swap)
 	ja	L(strip_x)


 	movl	%edx, %eax
 	movl	%ebx, %ecx

 L(done_shll):
 	shll	%cl, %eax
 	popl	%ebx

 	ret


 C -----------------------------------------------------------------------------
 C Two or more limbs.
 C
 C x={src,size} is reduced modulo y using either a plain mod_1 style
 C remainder, or a modexact_1 style exact division.

 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))

 	ALIGN(8)
 L(size_two_or_more):
 	C eax
 	C ebx
 	C ecx	common twos
 	C edx	y, without common twos
 	C esi
 	C edi
 	C ebp

 deflit(FRAME_TWO_OR_MORE, FRAME)

 	pushl	%edi		defframe_pushl(SAVE_EDI)
 	movl	PARAM_SRC, %ebx

 L(y_twos):
 	shrl	%edx
 	jnc	L(y_twos)

 	movl	%ecx, %edi		C common twos
 	movl	PARAM_SIZE, %ecx

 	pushl	%esi		defframe_pushl(SAVE_ESI)
 	leal	1(%edx,%edx), %esi	C y (odd)

 	movl	-4(%ebx,%ecx,4), %eax	C src high limb

 	cmpl	%edx, %eax		C carry if high<divisor

 	sbbl	%edx, %edx		C -1 if high<divisor

 	addl	%edx, %ecx		C skip one limb if high<divisor
 	andl	%eax, %edx

 	cmpl	$MODEXACT_THRESHOLD, %ecx
 	jae	L(modexact)


 L(divide_top):
 	C eax	scratch (quotient)
 	C ebx	src
 	C ecx	counter, size-1 to 1
 	C edx	carry (remainder)
 	C esi	divisor (odd)
 	C edi
 	C ebp

 	movl	-4(%ebx,%ecx,4), %eax
 	divl	%esi
 	loop	L(divide_top)


 	movl	%edx, %eax	C x
 	movl	%esi, %edx	C y (odd)

 	movl	%edi, %ebx	C common twos
 	popl	%esi

 	popl	%edi
 	leal	1(%eax), %ecx

 	orl	%eax, %eax
 	jnz	L(strip_x_and)


 	movl	%ebx, %ecx
 	movl	%edx, %eax

 	shll	%cl, %eax
 	popl	%ebx

 	ret


 	ALIGN(8)
 L(modexact):
 	C eax
 	C ebx	src ptr
 	C ecx	size or size-1
 	C edx
 	C esi	y odd
 	C edi	common twos
 	C ebp

 	movl	PARAM_SIZE, %eax
 	pushl	%esi		FRAME_pushl()

 	pushl	%eax		FRAME_pushl()

 	pushl	%ebx		FRAME_pushl()

 ifdef(`PIC',`
 	nop	C code alignment
 	call	L(movl_eip_ebx)
 L(here):
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	call	GSYM_PREFIX`'mpn_modexact_1_odd@PLT
 ',`
 	call	GSYM_PREFIX`'mpn_modexact_1_odd
 ')

 	movl	%esi, %edx		C y odd
 	movl	SAVE_ESI, %esi

 	movl	%edi, %ebx		C common twos
 	movl	SAVE_EDI, %edi

 	addl	$eval(FRAME - FRAME_TWO_OR_MORE), %esp
 	orl	%eax, %eax

 	leal	1(%eax), %ecx
 	jnz	L(strip_x_and)


 	movl	%ebx, %ecx
 	movl	%edx, %eax

 	shll	%cl, %eax
 	popl	%ebx

 	ret


 ifdef(`PIC',`
 L(movl_eip_ebx):
 	movl	(%esp), %ebx
 	ret_internal
 ')

 EPILOGUE()
	dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.

	dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
	dnl
	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or
	dnl modify it under the terms of the GNU Lesser General Public License as
	dnl published by the Free Software Foundation; either version 3 of the
	dnl License, or (at your option) any later version.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful,
	dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
	dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	dnl Lesser General Public License for more details.
	dnl
	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')


	C K6: 9.5 cycles/bit (approx) 1x1 gcd
	C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)


	C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
	C
	C This code is nothing very special, but offers a speedup over what gcc 2.95
	C can do with mpn/generic/gcd_1.c.
	C
	C Future:
	C
	C Using a lookup table to count trailing zeros seems a touch quicker, but
	C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
	C it too.


	dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
	dnl bigger than y, then a division x%y is done to reduce it.
	dnl
	dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
	dnl there should be an advantage in the divl at about 4 or 5 bits, which is
	dnl what's found.

	deflit(DIV_THRESHOLD, 5)


	defframe(PARAM_LIMB, 12)
	defframe(PARAM_SIZE, 8)
	defframe(PARAM_SRC, 4)

	TEXT
	ALIGN(16)

	PROLOGUE(mpn_gcd_1)
	deflit(`FRAME',0)

	ASSERT(ne, `cmpl $0, PARAM_LIMB')
	ASSERT(ae, `cmpl $1, PARAM_SIZE')


	movl PARAM_SRC, %eax
	pushl %ebx FRAME_pushl()

	movl PARAM_LIMB, %edx
	movl $-1, %ecx

	movl (%eax), %ebx C src low limb

	movl %ebx, %eax C src low limb
	orl %edx, %ebx

	L(common_twos):
	shrl %ebx
	incl %ecx

	jnc L(common_twos) C 1/4 chance on random data
	shrl %cl, %edx C y

	cmpl $1, PARAM_SIZE
	ja L(size_two_or_more)


	ASSERT(nz, `orl %eax, %eax') C should have src limb != 0

	shrl %cl, %eax C x


	C Swap if necessary to make x>=y. Measures a touch quicker as a
	C jump than a branch free calculation.
	C
	C eax x
	C ebx
	C ecx common twos
	C edx y

	movl %eax, %ebx
	cmpl %eax, %edx

	jb L(noswap)
	movl %edx, %eax

	movl %ebx, %edx
	movl %eax, %ebx
	L(noswap):


	C See if it's worth reducing x with a divl.
	C
	C eax x
	C ebx x
	C ecx common twos
	C edx y

	shrl $DIV_THRESHOLD, %ebx

	cmpl %ebx, %edx
	ja L(nodiv)


	C Reduce x to x%y.
	C
	C eax x
	C ebx
	C ecx common twos
	C edx y

	movl %edx, %ebx
	xorl %edx, %edx

	divl %ebx

	orl %edx, %edx C y
	nop C code alignment

	movl %ebx, %eax C x
	jz L(done_shll)
	L(nodiv):


	C eax x
	C ebx
	C ecx common twos
	C edx y
	C esi
	C edi
	C ebp

	L(strip_y):
	shrl %edx
	jnc L(strip_y)

	leal 1(%edx,%edx), %edx
	movl %ecx, %ebx C common twos

	leal 1(%eax), %ecx
	jmp L(strip_x_and)


	C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
	C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
	C only 1/4.
	C
	C A second computed %cl shift was tried, but that measured a touch slower
	C than branching back.
	C
	C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
	C measured about 1 cycle/bit slower.

	C eax x
	C ebx common twos
	C ecx scratch
	C edx y

	ALIGN(4)
	L(swap):
	addl %eax, %edx C x-y+y = x
	negl %eax C -(x-y) = y-x

	L(strip_x):
	shrl %eax C odd-odd = even, so always one to strip
	ASSERT(nz)

	L(strip_x_leal):
	leal 1(%eax), %ecx

	L(strip_x_and):
	andl $1, %ecx C (x^1)&1

	shrl %cl, %eax C shift if x even

	testb $1, %al
	jz L(strip_x)

	ASSERT(nz,`testl $1, %eax') C x, y odd
	ASSERT(nz,`testl $1, %edx')

	subl %edx, %eax
	jb L(swap)
	ja L(strip_x)


	movl %edx, %eax
	movl %ebx, %ecx

	L(done_shll):
	shll %cl, %eax
	popl %ebx

	ret


	C -----------------------------------------------------------------------------
	C Two or more limbs.
	C
	C x={src,size} is reduced modulo y using either a plain mod_1 style
	C remainder, or a modexact_1 style exact division.

	deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))

	ALIGN(8)
	L(size_two_or_more):
	C eax
	C ebx
	C ecx common twos
	C edx y, without common twos
	C esi
	C edi
	C ebp

	deflit(FRAME_TWO_OR_MORE, FRAME)

	pushl %edi defframe_pushl(SAVE_EDI)
	movl PARAM_SRC, %ebx

	L(y_twos):
	shrl %edx
	jnc L(y_twos)

	movl %ecx, %edi C common twos
	movl PARAM_SIZE, %ecx

	pushl %esi defframe_pushl(SAVE_ESI)
	leal 1(%edx,%edx), %esi C y (odd)

	movl -4(%ebx,%ecx,4), %eax C src high limb

	cmpl %edx, %eax C carry if high<divisor

	sbbl %edx, %edx C -1 if high<divisor

	addl %edx, %ecx C skip one limb if high<divisor
	andl %eax, %edx

	cmpl $MODEXACT_THRESHOLD, %ecx
	jae L(modexact)


	L(divide_top):
	C eax scratch (quotient)
	C ebx src
	C ecx counter, size-1 to 1
	C edx carry (remainder)
	C esi divisor (odd)
	C edi
	C ebp

	movl -4(%ebx,%ecx,4), %eax
	divl %esi
	loop L(divide_top)


	movl %edx, %eax C x
	movl %esi, %edx C y (odd)

	movl %edi, %ebx C common twos
	popl %esi

	popl %edi
	leal 1(%eax), %ecx

	orl %eax, %eax
	jnz L(strip_x_and)


	movl %ebx, %ecx
	movl %edx, %eax

	shll %cl, %eax
	popl %ebx

	ret


	ALIGN(8)
	L(modexact):
	C eax
	C ebx src ptr
	C ecx size or size-1
	C edx
	C esi y odd
	C edi common twos
	C ebp

	movl PARAM_SIZE, %eax
	pushl %esi FRAME_pushl()

	pushl %eax FRAME_pushl()

	pushl %ebx FRAME_pushl()

	ifdef(`PIC',`
	nop C code alignment
	call L(movl_eip_ebx)
	L(here):
	addl $_GLOBAL_OFFSET_TABLE_, %ebx
	call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
	',`
	call GSYM_PREFIX`'mpn_modexact_1_odd
	')

	movl %esi, %edx C y odd
	movl SAVE_ESI, %esi

	movl %edi, %ebx C common twos
	movl SAVE_EDI, %edi

	addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
	orl %eax, %eax

	leal 1(%eax), %ecx
	jnz L(strip_x_and)


	movl %ebx, %ecx
	movl %edx, %eax

	shll %cl, %eax
	popl %ebx

	ret


	ifdef(`PIC',`
	L(movl_eip_ebx):
	movl (%esp), %ebx
	ret_internal
	')

	EPILOGUE()