gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm - native_client/nacl-toolchain - Git at Google

 dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.

 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or
 dnl  modify it under the terms of the GNU Lesser General Public License as
 dnl  published by the Free Software Foundation; either version 3 of the
 dnl  License, or (at your option) any later version.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful,
 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 dnl  Lesser General Public License for more details.
 dnl
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

 include(`../config.m4')


 C    cycles/limb
 C P5:   12.0   for 32-bit multiplier
 C        7.0   for 16-bit multiplier


 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
 C                      mp_limb_t multiplier);
 C
 C When the multiplier is 16 bits some special case MMX code is used.  Small
 C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
 C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
 C size==8 end up being quite close.  If src isn't aligned to an 8 byte
 C boundary then one limb is processed separately with roughly a 5 cycle
 C penalty, so in that case it's say size==8 and size==9 which are close.
 C
 C Alternatives:
 C
 C MMX is not believed to be of any use for 32-bit multipliers, since for
 C instance the current method would just have to be more or less duplicated
 C for the high and low halves of the multiplier, and would probably
 C therefore run at about 14 cycles, which is slower than the plain integer
 C at 12.
 C
 C Adding the high and low MMX products using integer code seems best.  An
 C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
 C any joy.  Perhaps something could be done keeping the values signed and
 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
 C perhaps not.
 C
 C Future:
 C
 C An mpn_mul_1c entrypoint would need a double carry out of the low result
 C limb in the 16-bit code, unless it could be assumed the carry fits in 16
 C bits, possibly as carry<multiplier, this being true of a big calculation
 C done piece by piece.  But let's worry about that if/when mul_1c is
 C actually used.

 defframe(PARAM_MULTIPLIER,16)
 defframe(PARAM_SIZE,      12)
 defframe(PARAM_SRC,       8)
 defframe(PARAM_DST,       4)

 	TEXT

 	ALIGN(8)
 PROLOGUE(mpn_mul_1)
 deflit(`FRAME',0)

 	movl	PARAM_SIZE, %ecx
 	movl	PARAM_SRC, %edx

 	cmpl	$1, %ecx
 	jne	L(two_or_more)

 	C one limb only

 	movl	PARAM_MULTIPLIER, %eax
 	movl	PARAM_DST, %ecx

 	mull	(%edx)

 	movl	%eax, (%ecx)
 	movl	%edx, %eax

 	ret


 L(two_or_more):
 	C eax	size
 	C ebx
 	C ecx	carry
 	C edx
 	C esi	src
 	C edi
 	C ebp

 	pushl	%esi		FRAME_pushl()
 	pushl	%edi		FRAME_pushl()

 	movl	%edx, %esi		C src
 	movl	PARAM_DST, %edi

 	movl	PARAM_MULTIPLIER, %eax
 	pushl	%ebx		FRAME_pushl()

 	leal	(%esi,%ecx,4), %esi	C src end
 	leal	(%edi,%ecx,4), %edi	C dst end

 	negl	%ecx			C -size

 	pushl	%ebp		FRAME_pushl()
 	cmpl	$65536, %eax

 	jb	L(small)


 L(big):
 	xorl	%ebx, %ebx		C carry limb
 	sarl	%ecx			C -size/2

 	jnc	L(top)			C with carry flag clear


 	C size was odd, process one limb separately

 	mull	4(%esi,%ecx,8)		C m * src[0]

 	movl	%eax, 4(%edi,%ecx,8)
 	incl	%ecx

 	orl	%edx, %ebx		C carry limb, and clear carry flag


 L(top):
 	C eax
 	C ebx	carry
 	C ecx	counter, negative
 	C edx
 	C esi	src end
 	C edi	dst end
 	C ebp	(scratch carry)

 	adcl	$0, %ebx
 	movl	(%esi,%ecx,8), %eax

 	mull	PARAM_MULTIPLIER

 	movl	%edx, %ebp
 	addl	%eax, %ebx

 	adcl	$0, %ebp
 	movl	4(%esi,%ecx,8), %eax

 	mull	PARAM_MULTIPLIER

 	movl	%ebx, (%edi,%ecx,8)
 	addl	%ebp, %eax

 	movl	%eax, 4(%edi,%ecx,8)
 	incl	%ecx

 	movl	%edx, %ebx
 	jnz	L(top)


 	adcl	$0, %ebx
 	popl	%ebp

 	movl	%ebx, %eax
 	popl	%ebx

 	popl	%edi
 	popl	%esi

 	ret


 L(small):
 	C Special case for 16-bit multiplier.
 	C
 	C eax	multiplier
 	C ebx
 	C ecx	-size
 	C edx	src
 	C esi	src end
 	C edi	dst end
 	C ebp	multiplier

 	C size<3 not supported here.  At size==3 we're already a couple of
 	C cycles faster, so there's no threshold as such, just use the MMX
 	C as soon as possible.

 	cmpl	$-3, %ecx
 	ja	L(big)

 	movd	%eax, %mm7		C m
 	pxor	%mm6, %mm6		C initial carry word

 	punpcklwd %mm7, %mm7		C m replicated 2 times
 	addl	$2, %ecx		C -size+2

 	punpckldq %mm7, %mm7		C m replicated 4 times
 	andl	$4, %edx		C test alignment, clear carry flag

 	movq	%mm7, %mm0		C m
 	jz	L(small_entry)


 	C Source is unaligned, process one limb separately.
 	C
 	C Plain integer code is used here, since it's smaller and is about
 	C the same 13 cycles as an mmx block would be.
 	C
 	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
 	C the use of separate incl and orl.

 	mull	-8(%esi,%ecx,4)		C m * src[0]

 	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
 	incl	%ecx			C one limb processed

 	movd	%edx, %mm6		C initial carry

 	orl	%eax, %eax		C clear carry flag
 	jmp	L(small_entry)


 C The scheduling here is quite tricky, since so many instructions have
 C pairing restrictions.  In particular the js won't pair with a movd, and
 C can't be paired with an adc since it wants flags from the inc, so
 C instructions are rotated to the top of the loop to find somewhere useful
 C for it.
 C
 C Trouble has been taken to avoid overlapping successive loop iterations,
 C since that would greatly increase the size of the startup and finishup
 C code.  Actually there's probably not much advantage to be had from
 C overlapping anyway, since the difficulties are mostly with pairing, not
 C with latencies as such.
 C
 C In the comments x represents the src data and m the multiplier (16
 C bits, but replicated 4 times).
 C
 C The m signs calculated in %mm3 are a loop invariant and could be held in
 C say %mm5, but that would save only one instruction and hence be no faster.

 L(small_top):
 	C eax	l.low, then l.high
 	C ebx	(h.low)
 	C ecx	counter, -size+2 to 0 or 1
 	C edx	(h.high)
 	C esi	&src[size]
 	C edi	&dst[size]
 	C ebp
 	C
 	C %mm0	(high products)
 	C %mm1	(low products)
 	C %mm2	(adjust for m using x signs)
 	C %mm3	(adjust for x using m signs)
 	C %mm4
 	C %mm5
 	C %mm6	h.low, then carry
 	C %mm7	m replicated 4 times

 	movd	%mm6, %ebx		C h.low
 	psrlq	$32, %mm1		C l.high

 	movd	%mm0, %edx		C h.high
 	movq	%mm0, %mm6		C new c

 	adcl	%eax, %ebx
 	incl	%ecx

 	movd	%mm1, %eax		C l.high
 	movq	%mm7, %mm0

 	adcl	%eax, %edx
 	movl	%ebx, -16(%edi,%ecx,4)

 	movl	%edx, -12(%edi,%ecx,4)
 	psrlq	$32, %mm6		C c

 L(small_entry):
 	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
 	movq	%mm7, %mm1

 	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
 	movq	%mm7, %mm3

 	movq	-8(%esi,%ecx,4), %mm2	C x
 	psraw	$15, %mm3		C m signs

 	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
 	psraw	$15, %mm2		C x signs

 	paddw	%mm3, %mm0		C add x to h if m neg
 	pand	%mm7, %mm2		C m selected by x signs

 	paddw	%mm2, %mm0		C add m to h if x neg
 	incl	%ecx

 	movd	%mm1, %eax		C l.low
 	punpcklwd %mm0, %mm6		C c + h.low << 16

 	psrlq	$16, %mm0		C h.high
 	js	L(small_top)


 	movd	%mm6, %ebx		C h.low
 	psrlq	$32, %mm1		C l.high

 	adcl	%eax, %ebx
 	popl	%ebp		FRAME_popl()

 	movd	%mm0, %edx		C h.high
 	psrlq	$32, %mm0		C l.high

 	movd	%mm1, %eax		C l.high

 	adcl	%eax, %edx
 	movl	%ebx, -12(%edi,%ecx,4)

 	movd	%mm0, %eax		C c

 	adcl	$0, %eax
 	movl	%edx, -8(%edi,%ecx,4)

 	orl	%ecx, %ecx
 	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd


 	C Size odd, one extra limb to process.
 	C Plain integer code is used here, since it's smaller and is about
 	C the same speed as another mmx block would be.

 	movl	%eax, %ecx
 	movl	PARAM_MULTIPLIER, %eax

 	mull	-4(%esi)

 	addl	%ecx, %eax

 	adcl	$0, %edx
 	movl	%eax, -4(%edi)

 	movl	%edx, %eax
 L(small_done):
 	popl	%ebx

 	popl	%edi
 	popl	%esi

 	emms

 	ret

 EPILOGUE()
	dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.

	dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
	dnl
	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or
	dnl modify it under the terms of the GNU Lesser General Public License as
	dnl published by the Free Software Foundation; either version 3 of the
	dnl License, or (at your option) any later version.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful,
	dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
	dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	dnl Lesser General Public License for more details.
	dnl
	dnl You should have received a copy of the GNU Lesser General Public License
	dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

	include(`../config.m4')


	C cycles/limb
	C P5: 12.0 for 32-bit multiplier
	C 7.0 for 16-bit multiplier


	C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
	C mp_limb_t multiplier);
	C
	C When the multiplier is 16 bits some special case MMX code is used. Small
	C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
	C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
	C size==8 end up being quite close. If src isn't aligned to an 8 byte
	C boundary then one limb is processed separately with roughly a 5 cycle
	C penalty, so in that case it's say size==8 and size==9 which are close.
	C
	C Alternatives:
	C
	C MMX is not believed to be of any use for 32-bit multipliers, since for
	C instance the current method would just have to be more or less duplicated
	C for the high and low halves of the multiplier, and would probably
	C therefore run at about 14 cycles, which is slower than the plain integer
	C at 12.
	C
	C Adding the high and low MMX products using integer code seems best. An
	C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
	C any joy. Perhaps something could be done keeping the values signed and
	C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
	C perhaps not.
	C
	C Future:
	C
	C An mpn_mul_1c entrypoint would need a double carry out of the low result
	C limb in the 16-bit code, unless it could be assumed the carry fits in 16
	C bits, possibly as carry<multiplier, this being true of a big calculation
	C done piece by piece. But let's worry about that if/when mul_1c is
	C actually used.

	defframe(PARAM_MULTIPLIER,16)
	defframe(PARAM_SIZE, 12)
	defframe(PARAM_SRC, 8)
	defframe(PARAM_DST, 4)

	TEXT

	ALIGN(8)
	PROLOGUE(mpn_mul_1)
	deflit(`FRAME',0)

	movl PARAM_SIZE, %ecx
	movl PARAM_SRC, %edx

	cmpl $1, %ecx
	jne L(two_or_more)

	C one limb only

	movl PARAM_MULTIPLIER, %eax
	movl PARAM_DST, %ecx

	mull (%edx)

	movl %eax, (%ecx)
	movl %edx, %eax

	ret


	L(two_or_more):
	C eax size
	C ebx
	C ecx carry
	C edx
	C esi src
	C edi
	C ebp

	pushl %esi FRAME_pushl()
	pushl %edi FRAME_pushl()

	movl %edx, %esi C src
	movl PARAM_DST, %edi

	movl PARAM_MULTIPLIER, %eax
	pushl %ebx FRAME_pushl()

	leal (%esi,%ecx,4), %esi C src end
	leal (%edi,%ecx,4), %edi C dst end

	negl %ecx C -size

	pushl %ebp FRAME_pushl()
	cmpl $65536, %eax

	jb L(small)


	L(big):
	xorl %ebx, %ebx C carry limb
	sarl %ecx C -size/2

	jnc L(top) C with carry flag clear


	C size was odd, process one limb separately

	mull 4(%esi,%ecx,8) C m * src[0]

	movl %eax, 4(%edi,%ecx,8)
	incl %ecx

	orl %edx, %ebx C carry limb, and clear carry flag


	L(top):
	C eax
	C ebx carry
	C ecx counter, negative
	C edx
	C esi src end
	C edi dst end
	C ebp (scratch carry)

	adcl $0, %ebx
	movl (%esi,%ecx,8), %eax

	mull PARAM_MULTIPLIER

	movl %edx, %ebp
	addl %eax, %ebx

	adcl $0, %ebp
	movl 4(%esi,%ecx,8), %eax

	mull PARAM_MULTIPLIER

	movl %ebx, (%edi,%ecx,8)
	addl %ebp, %eax

	movl %eax, 4(%edi,%ecx,8)
	incl %ecx

	movl %edx, %ebx
	jnz L(top)


	adcl $0, %ebx
	popl %ebp

	movl %ebx, %eax
	popl %ebx

	popl %edi
	popl %esi

	ret


	L(small):
	C Special case for 16-bit multiplier.
	C
	C eax multiplier
	C ebx
	C ecx -size
	C edx src
	C esi src end
	C edi dst end
	C ebp multiplier

	C size<3 not supported here. At size==3 we're already a couple of
	C cycles faster, so there's no threshold as such, just use the MMX
	C as soon as possible.

	cmpl $-3, %ecx
	ja L(big)

	movd %eax, %mm7 C m
	pxor %mm6, %mm6 C initial carry word

	punpcklwd %mm7, %mm7 C m replicated 2 times
	addl $2, %ecx C -size+2

	punpckldq %mm7, %mm7 C m replicated 4 times
	andl $4, %edx C test alignment, clear carry flag

	movq %mm7, %mm0 C m
	jz L(small_entry)


	C Source is unaligned, process one limb separately.
	C
	C Plain integer code is used here, since it's smaller and is about
	C the same 13 cycles as an mmx block would be.
	C
	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
	C the use of separate incl and orl.

	mull -8(%esi,%ecx,4) C m * src[0]

	movl %eax, -8(%edi,%ecx,4) C dst[0]
	incl %ecx C one limb processed

	movd %edx, %mm6 C initial carry

	orl %eax, %eax C clear carry flag
	jmp L(small_entry)


	C The scheduling here is quite tricky, since so many instructions have
	C pairing restrictions. In particular the js won't pair with a movd, and
	C can't be paired with an adc since it wants flags from the inc, so
	C instructions are rotated to the top of the loop to find somewhere useful
	C for it.
	C
	C Trouble has been taken to avoid overlapping successive loop iterations,
	C since that would greatly increase the size of the startup and finishup
	C code. Actually there's probably not much advantage to be had from
	C overlapping anyway, since the difficulties are mostly with pairing, not
	C with latencies as such.
	C
	C In the comments x represents the src data and m the multiplier (16
	C bits, but replicated 4 times).
	C
	C The m signs calculated in %mm3 are a loop invariant and could be held in
	C say %mm5, but that would save only one instruction and hence be no faster.

	L(small_top):
	C eax l.low, then l.high
	C ebx (h.low)
	C ecx counter, -size+2 to 0 or 1
	C edx (h.high)
	C esi &src[size]
	C edi &dst[size]
	C ebp
	C
	C %mm0 (high products)
	C %mm1 (low products)
	C %mm2 (adjust for m using x signs)
	C %mm3 (adjust for x using m signs)
	C %mm4
	C %mm5
	C %mm6 h.low, then carry
	C %mm7 m replicated 4 times

	movd %mm6, %ebx C h.low
	psrlq $32, %mm1 C l.high

	movd %mm0, %edx C h.high
	movq %mm0, %mm6 C new c

	adcl %eax, %ebx
	incl %ecx

	movd %mm1, %eax C l.high
	movq %mm7, %mm0

	adcl %eax, %edx
	movl %ebx, -16(%edi,%ecx,4)

	movl %edx, -12(%edi,%ecx,4)
	psrlq $32, %mm6 C c

	L(small_entry):
	pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
	movq %mm7, %mm1

	pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
	movq %mm7, %mm3

	movq -8(%esi,%ecx,4), %mm2 C x
	psraw $15, %mm3 C m signs

	pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
	psraw $15, %mm2 C x signs

	paddw %mm3, %mm0 C add x to h if m neg
	pand %mm7, %mm2 C m selected by x signs

	paddw %mm2, %mm0 C add m to h if x neg
	incl %ecx

	movd %mm1, %eax C l.low
	punpcklwd %mm0, %mm6 C c + h.low << 16

	psrlq $16, %mm0 C h.high
	js L(small_top)




	movd %mm6, %ebx C h.low
	psrlq $32, %mm1 C l.high

	adcl %eax, %ebx
	popl %ebp FRAME_popl()

	movd %mm0, %edx C h.high
	psrlq $32, %mm0 C l.high

	movd %mm1, %eax C l.high

	adcl %eax, %edx
	movl %ebx, -12(%edi,%ecx,4)

	movd %mm0, %eax C c

	adcl $0, %eax
	movl %edx, -8(%edi,%ecx,4)

	orl %ecx, %ecx
	jnz L(small_done) C final %ecx==1 means even, ==0 odd


	C Size odd, one extra limb to process.
	C Plain integer code is used here, since it's smaller and is about
	C the same speed as another mmx block would be.

	movl %eax, %ecx
	movl PARAM_MULTIPLIER, %eax

	mull -4(%esi)

	addl %ecx, %eax

	adcl $0, %edx
	movl %eax, -4(%edi)

	movl %edx, %eax
	L(small_done):
	popl %ebx

	popl %edi
	popl %esi

	emms

	ret

	EPILOGUE()