dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t inverse,
C unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code. "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit
C here, since detecting an overflow of q1+1 when q1=0xFFFFFFFF would
C cost too much.
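C
C Illustrated in C, one division step of this method is roughly as
C follows (a sketch, not part of the build; uint32_t/uint64_t as in
C <stdint.h>, d the normalized divisor, m the inverse, n2 the running
C remainder with n2<d, n10 the next normalized limb, n1 its high bit):
C
C     uint32_t n1   = n10 >> 31;
C     uint32_t nadj = n10 + (-n1 & d);        /* n10+(n1?d:0) */
C     uint32_t q1   = (uint32_t)
C       ((((uint64_t) n2 << 32 | nadj) + (uint64_t) m * (n2 + n1)) >> 32);
C     uint64_t r    = ((uint64_t) n2 << 32 | n10) - d - (uint64_t) q1 * d;
C     uint32_t mask = (uint32_t) (r >> 32);   /* 0 if q1+1 was right, else ~0 */
C     q  = q1 + 1 + mask;                     /* quotient limb */
C     n2 = (uint32_t) r + (mask & d);         /* new remainder, with addback */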
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The value stored to the destination high limb is 0 when a divide is
C skipped, or a copy of the src high limb when it's not.  The latter is
C in case src==dst.
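C
C In C terms the skip logic is roughly (a sketch, using the parameter
C names from the prototypes above):
C
C     mp_limb_t high = src[size-1];
C     int       skip = (high < divisor);
C     dst[xsize+size-1] = skip ? 0 : high;    /* copy, in case src==dst */
C     carry = skip ? high : 0;
C     size -= skip;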
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain suggests it could
C be done in 30.  It's not clear where the extra two cycles come from.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case. It doesn't seem worth checking for this though, since it
C probably occurs infrequently.
dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
dnl
dnl The inverse takes about 80-90 cycles to calculate, but after that the
dnl multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl At 4 limbs the div is a touch faster than the mul (and of course
dnl simpler), so start the mul from 5 limbs.
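dnl
dnl For reference, equating total cycles 85 + 32*n = 58*n puts the
dnl crossover near n = 85/26, about 3.3 limbs; presumably the extra
dnl entry/exit code in the mul path accounts for the measured crossover
dnl being between 4 and 5.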
deflit(MUL_THRESHOLD, 5)
defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC, 12)
defframe(PARAM_XSIZE, 8)
defframe(PARAM_DST, 4)
dnl re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')
TEXT
ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
xorl %edx, %edx C carry if can't skip a div
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
movl -4(%esi,%ecx,4), %eax C src high limb
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
movd PARAM_PREINV_INVERSE, %mm4
movd PARAM_PREINV_SHIFT, %mm7 C l
cmpl %ebp, %eax C high cmp divisor
cmovc( %eax, %edx) C high is carry if high<divisor
movd %edx, %mm0 C carry
movd %edx, %mm1 C carry
movl $0, %edx C (movl not xorl, the carry flag is still needed)
movd %ebp, %mm5 C d
cmovnc( %eax, %edx) C 0 if skip div, src high if not
C (the latter in case src==dst)
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
movl %edx, (%edi,%ecx,4) C dst high limb
sbbl $0, %ecx C skip one division if high<divisor
movl $32, %eax
subl PARAM_PREINV_SHIFT, %eax
psllq %mm7, %mm5 C d normalized
leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
leal -4(%esi,%ecx,4), %esi C &src[size-1]
movd %eax, %mm6 C 32-l
jmp L(start_preinv)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
movl PARAM_CARRY, %edx
movl PARAM_SIZE, %ecx
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
jmp L(start_1c)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
xorl %edx, %edx C initial carry (if can't skip a div)
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
orl %ecx, %ecx C size
jz L(no_skip_div) C if size==0
movl -4(%esi,%ecx,4), %eax C src high limb
cmpl %ebp, %eax C high cmp divisor
cmovnc( %eax, %edx) C 0 if skip div, src high if not
movl %edx, (%edi,%ecx,4) C dst high limb
movl $0, %edx C (movl not xorl, the carry flag is still needed)
cmovc( %eax, %edx) C high is carry if high<divisor
sbbl $0, %ecx C size-1 if high<divisor
L(no_skip_div):
L(start_1c):
C eax
C ebx xsize
C ecx size
C edx carry
C esi src
C edi &dst[xsize-1]
C ebp divisor
leal (%ebx,%ecx), %eax C size+xsize
leal -4(%esi,%ecx,4), %esi C &src[size-1]
leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
cmpl $MUL_THRESHOLD, %eax
jae L(mul_by_inverse)
orl %ecx, %ecx
jz L(divide_no_integer) C if size==0
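C
C In C terms the two divl loops below are roughly (a sketch; udiv_qrnnd
C from longlong.h divides the two-limb value n1:n0 by d):
C
C     for (i = size-1; i >= 0; i--)           /* integer part */
C       udiv_qrnnd (dst[xsize+i], carry, carry, src[i], divisor);
C     for (i = xsize-1; i >= 0; i--)          /* fraction part */
C       udiv_qrnnd (dst[i], carry, carry, 0, divisor);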
L(divide_integer):
C eax scratch (quotient)
C ebx xsize
C ecx counter
C edx carry
C esi src, decrementing
C edi dst, decrementing
C ebp divisor
movl (%esi), %eax
subl $4, %esi
divl %ebp
movl %eax, (%edi)
subl $4, %edi
subl $1, %ecx
jnz L(divide_integer)
L(divide_no_integer):
orl %ebx, %ebx
jnz L(divide_fraction) C if xsize!=0
L(divide_done):
movl SAVE_ESI, %esi
movl SAVE_EDI, %edi
movl SAVE_EBX, %ebx
movl SAVE_EBP, %ebp
movl %edx, %eax
ret
L(divide_fraction):
C eax scratch (quotient)
C ebx counter
C ecx
C edx carry
C esi
C edi dst, decrementing
C ebp divisor
movl $0, %eax
divl %ebp
movl %eax, (%edi)
subl $4, %edi
subl $1, %ebx
jnz L(divide_fraction)
jmp L(divide_done)
C -----------------------------------------------------------------------------
L(mul_by_inverse):
C eax
C ebx xsize
C ecx size
C edx carry
C esi &src[size-1]
C edi &dst[size+xsize-1]
C ebp divisor
bsrl %ebp, %eax C 31-l
movd %edx, %mm0 C carry
movd %edx, %mm1 C carry
movl %ecx, %edx C size
movl $31, %ecx
C
xorl %eax, %ecx C l = leading zeros on d
addl $1, %eax
shll %cl, %ebp C d normalized
movd %ecx, %mm7 C l
movl %edx, %ecx C size
movd %eax, %mm6 C 32-l
movl $-1, %edx
movl $-1, %eax
C
subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
divl %ebp C floor (b*(b-d)-1 / d)
movd %ebp, %mm5 C d
C
movd %eax, %mm4 C m
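C
C In C terms the setup above computes roughly (a sketch; b = 2^32,
C count_leading_zeros as in longlong.h, d a uint32_t):
C
C     count_leading_zeros (l, d);
C     d <<= l;                                /* normalize */
C     m = (uint32_t) ((((uint64_t) -d << 32) - 1) / d);
C                                             /* floor((b*(b-d)-1)/d) */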
L(start_preinv):
C eax inverse
C ebx xsize
C ecx size
C edx
C esi &src[size-1]
C edi &dst[size+xsize-1]
C ebp
C
C mm0 carry
C mm1 carry
C mm2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
psllq %mm7, %mm0 C n2 = carry << l, for size==0
subl $1, %ecx
jb L(integer_none) C if size==0
movd (%esi), %mm0 C src high limb
punpckldq %mm1, %mm0
psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
jz L(integer_last) C if size==1 (eflags still from the subl above)
C The dependent chain here consists of
C
C 2 paddd n1+n2
C 8 pmuludq m*(n1+n2)
C 2 paddq n2:nadj + m*(n1+n2)
C 2 psrlq q1
C 8 pmuludq d*q1
C 2 psubq (n-d)-q1*d
C 2 psrlq high n-(q1+1)*d mask
C 2 pand d masked
C 2 paddd n2+d addback
C --
C 30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.
ALIGN(16)
L(integer_top):
C eax
C ebx
C ecx counter, size-1 to 0
C edx
C esi src, decrementing
C edi dst, decrementing
C
C mm0 n2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movd -4(%esi), %mm1 C next src limbs
movd (%esi), %mm2
leal -4(%esi), %esi
punpckldq %mm2, %mm1
psrlq %mm6, %mm1 C n10
movq %mm1, %mm2 C n10
movq %mm1, %mm3 C n10
psrad $31, %mm1 C -n1
pand %mm5, %mm1 C -n1 & d
paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
psrld $31, %mm2 C n1
paddd %mm0, %mm2 C n2+n1
punpckldq %mm0, %mm1 C n2:nadj
pmuludq %mm4, %mm2 C m*(n2+n1)
C
paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
pxor %mm2, %mm2 C break dependency, saves 4 cycles
pcmpeqd %mm2, %mm2 C FF...FF
psrlq $63, %mm2 C 1
psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
paddd %mm1, %mm2 C q1+1
pmuludq %mm5, %mm1 C q1*d
punpckldq %mm0, %mm3 C n = n2:n10
pxor %mm0, %mm0
psubq %mm5, %mm3 C n - d
C
psubq %mm1, %mm3 C n - (q1+1)*d
por %mm3, %mm0 C copy remainder -> new n2
psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm3, %eax
addl $1, %eax
cmpl $1, %eax')
paddd %mm3, %mm2 C q
pand %mm5, %mm3 C mask & d
paddd %mm3, %mm0 C addback if necessary
movd %mm2, (%edi)
leal -4(%edi), %edi
subl $1, %ecx
ja L(integer_top)
L(integer_last):
C eax
C ebx xsize
C ecx
C edx
C esi &src[0]
C edi &dst[xsize]
C
C mm0 n2
C mm4 m
C mm5 d
C mm6
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movd (%esi), %mm1 C src[0]
psllq %mm7, %mm1 C n10
movq %mm1, %mm2 C n10
movq %mm1, %mm3 C n10
psrad $31, %mm1 C -n1
pand %mm5, %mm1 C -n1 & d
paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
psrld $31, %mm2 C n1
paddd %mm0, %mm2 C n2+n1
punpckldq %mm0, %mm1 C n2:nadj
pmuludq %mm4, %mm2 C m*(n2+n1)
C
paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
pcmpeqd %mm2, %mm2 C FF...FF
psrlq $63, %mm2 C 1
psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
paddd %mm1, %mm2 C q1+1
pmuludq %mm5, %mm1 C q1*d
punpckldq %mm0, %mm3 C n
psubq %mm5, %mm3 C n - d
pxor %mm0, %mm0
C
psubq %mm1, %mm3 C n - (q1+1)*d
por %mm3, %mm0 C remainder -> n2
psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm3, %eax
addl $1, %eax
cmpl $1, %eax')
paddd %mm3, %mm2 C q
pand %mm5, %mm3 C mask & d
paddd %mm3, %mm0 C addback if necessary
movd %mm2, (%edi)
leal -4(%edi), %edi
L(integer_none):
C eax
C ebx xsize
orl %ebx, %ebx
jnz L(fraction_some) C if xsize!=0
L(fraction_done):
movl SAVE_EBP, %ebp
psrld %mm7, %mm0 C remainder, shifted back down by l
movl SAVE_EDI, %edi
movd %mm0, %eax
movl SAVE_ESI, %esi
movl SAVE_EBX, %ebx
emms
ret
C -----------------------------------------------------------------------------
C
L(fraction_some):
C eax
C ebx xsize
C ecx
C edx
C esi
C edi &dst[xsize-1]
C ebp
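C
C With the source limb implicitly zero (n10=0), each step below
C simplifies to roughly (C sketch, same conventions as the integer
C loop):
C
C     uint32_t q1   = n2 + (uint32_t) (((uint64_t) m * n2) >> 32);
C     uint64_t r    = ((uint64_t) n2 << 32) - d - (uint64_t) q1 * d;
C     uint32_t mask = (uint32_t) (r >> 32);   /* 0 or ~0 */
C     q  = q1 + 1 + mask;
C     n2 = (uint32_t) r + (mask & d);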
L(fraction_top):
C eax
C ebx counter, xsize iterations
C ecx
C edx
C esi src, decrementing
C edi dst, decrementing
C
C mm0 n2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movq %mm0, %mm1 C n2
pmuludq %mm4, %mm0 C m*n2
pcmpeqd %mm2, %mm2
psrlq $63, %mm2
C
psrlq $32, %mm0 C high(m*n2)
paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
paddd %mm0, %mm2 C q1+1
pmuludq %mm5, %mm0 C q1*d
psllq $32, %mm1 C n = n2:0
psubq %mm5, %mm1 C n - d
C
psubq %mm0, %mm1 C r = n - (q1+1)*d
pxor %mm0, %mm0
por %mm1, %mm0 C r -> n2
psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm1, %eax
addl $1, %eax
cmpl $1, %eax')
paddd %mm1, %mm2 C q
pand %mm5, %mm1 C mask & d
paddd %mm1, %mm0 C addback if necessary
movd %mm2, (%edi)
leal -4(%edi), %edi
subl $1, %ebx
jne L(fraction_top)
jmp L(fraction_done)
EPILOGUE()