dnl  blob: 37053ba88d92abc86b51c29b824dd739e07d6df5 [file] [log] [blame]
dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')

C norm frac
C K8 20 20
C P4 73 73
C P6-15 37 37
C TODO
C  * Perhaps compute the inverse without relying on divq?  Could either use
C    Newton's method and mulq, or perhaps the faster fdiv.
C  * The loop has not been carefully tuned, nor analysed for critical path
C    length.  It seems that 20 c/l is a bit long, compared to the 13 c/l for
C    mpn_divrem_1.
C  * Clean up.  This code is really crude.
dnl  INPUT PARAMETERS
dnl  m4 names for the registers holding the incoming arguments.  The first
dnl  five are, in order, the first five System V AMD64 integer argument
dnl  registers.  The instructions in this file use the raw register names
dnl  rather than these macros, so the defines serve as documentation.
dnl  qp: moved to rbp on entry; quotient limbs are stored through it.
define(`qp', `%rdi')
dnl  fn: moved to r13 on entry; compared against the loop counter to decide
dnl  whether the next low limb is read from memory or taken as zero.
define(`fn', `%rsi')
dnl  up_param: base address of the dividend limbs.
define(`up_param', `%rdx')
dnl  un_param: number of dividend limbs (scaled by 8 to find the top limbs).
define(`un_param', `%rcx')
dnl  dp: address of the two divisor limbs, read at offsets 0 and 8.
define(`dp', `%r8')
dnl  NOTE(review): dinv names r9, but the code computes the inverse in rdi
dnl  and r9 first receives the top dividend limb; confirm whether this
dnl  define is still meaningful.
define(`dinv', `%r9')
C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
C cnt qp d dinv
C mpn_divrem_2 -- divide the un limbs at up, extended below by fn zero limbs,
C by the two-limb divisor at dp (d1 = dp[1], d0 = dp[0]; the file header
C states that the divisor is normalized).
C
C ABI:      System V AMD64.  Arguments arrive in rdi, rsi, rdx, rcx, r8 as
C           named by the defines above.
C Stores:   un+fn-2 quotient limbs at qp[0 .. un+fn-3], developed from the
C           most significant end, and the two remainder limbs over up[0]
C           (low) and up[1] (high).
C Returns:  rax = 0 or 1, the quotient limb above qp[un+fn-3].
C Saves:    rbx, rbp, r12, r13, r14, r15 are pushed on entry and popped
C           before the return.
C
C Two interchangeable main loops (each with its own fix-up stub) are kept.
C Defining the m4 symbol NEW selects the first one; otherwise the second is
C assembled.  Trailing numbers in the loop comments are the original
C scheduling annotations (ncp presumably marks instructions that are not on
C the critical path).  B stands for 2^64 below.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divrem_2)

	push	%r15
	lea	(%rdx,%rcx,8), %rax	C rax = up + 8*un, one past the top limb
	push	%r14
	push	%r13
	mov	%rsi, %r13		C r13 = fn
	push	%r12
	lea	-24(%rax), %r12		C r12 = &up[un-3], next limb to consume
	push	%rbp
	mov	%rdi, %rbp		C rbp = qp
	push	%rbx
	mov	8(%r8), %r11		C r11 = d1
	mov	-8(%rax), %r9		C r9  = up[un-1]
	mov	(%r8), %r8		C r8  = d0 (dp is not needed after this)
	mov	-16(%rax), %r10		C r10 = up[un-2]
	xor	R32(%r15), R32(%r15)	C r15 = 0, the returned top quotient limb
	cmp	%r9, %r11
	ja	L(2)			C d1 > up[un-1]: top two limbs are < d
	setb	%dl			C dl = (d1 < up[un-1])
	cmp	%r10, %r8
	setbe	%al			C al = (d0 <= up[un-2])
	or	%al, %dl
	jne	L(23)			C top two limbs >= d: subtract d once
L(2):
	lea	-3(%rcx,%r13), %rbx	C un + fn - 3
	test	%rbx, %rbx
	js	L(6)			C nothing to develop below the top limb

C Inverse of d1: divide (B-1-d1)*B + (B-1) by d1.
	mov	%r11, %rdx
	mov	$-1, %rax
	not	%rdx
	div	%r11			C rax = floor((B^2-1)/d1) - B
	mov	%r11, %rdx
	mov	%rax, %rdi		C rdi = di
	imul	%rax, %rdx		C rdx = low limb of d1*di
	mov	%rdx, %r14
	mul	%r8			C rdx:rax = di*d0
	mov	%rdx, %rcx		C rcx = high limb of di*d0
	mov	$-1, %rdx
	add	%r8, %r14
	adc	$0, %rdx
	add	%rcx, %r14
	adc	$0, %rdx		C rdx:r14 = -B + low(d1*di) + d0 + high(di*d0)
	js	L(8)			C still negative: di is used unchanged
L(18):
	dec	%rdi			C otherwise lower di by one ...
	sub	%r11, %r14		C ... and take d1 off the sum ...
	sbb	$0, %rdx
	jns	L(18)			C ... until the sum turns negative
L(8):

ifdef(`NEW',`
C Loop registers: rbx:r14 = partial remainder (high:low), r9 = di,
C rsi = -d1, r8 = d0, r11 = d1, rcx = limb counter, rbp = quotient
C store pointer, r12 = dividend read pointer, r13 = fn.
	lea	(%rbp,%rbx,8), %rbp	C rbp = &qp[un+fn-3]
	mov	%rbx, %rcx		C un
	mov	%r9, %rbx		C rbx = high remainder limb
	mov	%rdi, %r9		C di
	mov	%r10, %r14		C r14 = low remainder limb
	mov	%r11, %rsi
	neg	%rsi			C -d1
	ALIGN(16)
L(loop):
C q:q0 = di*rbx + rbx:r14 is the candidate quotient and its low limb.
	mov	%r9, %rax		C di		ncp
	mul	%rbx			C		0, 18
	add	%r14, %rax		C		4
	mov	%rax, %r10		C q0		5
	adc	%rbx, %rdx		C		5
	mov	%rdx, %rdi		C q		6
	imul	%rsi, %rdx		C		6
	mov	%r8, %rax		C		ncp
	lea	(%rdx, %r14), %rbx	C n1 -= ...	7
	mul	%rdi			C rdx:rax = q*d0	7
C Bring in the next low limb: zero while rcx < fn and otherwise the next
C dividend limb from memory.
	xor	R32(%r14), R32(%r14)	C
	cmp	%rcx, %r13		C
	jg	L(19)			C fn > rcx: keep the zero limb
	mov	(%r12), %r14		C
	sub	$8, %r12		C
C Form the remainder for quotient q+1: subtract d1:d0 and then q*d0.
L(19):	sub	%r8, %r14		C		ncp
	sbb	%r11, %rbx		C		9
	sub	%rax, %r14		C		11
	sbb	%rdx, %rbx		C		12
	inc	%rdi			C		7
C When rbx >= q0 (unsigned) the mask is all ones: take q back down by one
C and add d1:d0 back.  Otherwise the mask is zero and nothing changes.
	xor	R32(%rdx), R32(%rdx)	C
	cmp	%r10, %rbx		C		13
	mov	%r8, %rax		C d1		ncp
	adc	$-1, %rdx		C mask		14
	add	%rdx, %rdi		C q--		15
	and	%rdx, %rax		C d0 or 0	15
	and	%r11, %rdx		C d1 or 0	15
	add	%rax, %r14		C		16
	adc	%rdx, %rbx		C		16
	cmp	%r11, %rbx		C		17
	jae	L(fix)			C high limb >= d1: remainder may be >= d
L(bck):	mov	%rdi, (%rbp)		C store the quotient limb
	sub	$8, %rbp		C
	dec	%rcx
	jns	L(loop)

	mov	%r14, %r10		C hand the remainder to the common exit
	mov	%rbx, %r9
',`
C Loop registers: rax:rsi = partial remainder (high:low), rdi = di,
C r8 = d0, r11 = d1, rcx = limb counter, rbp = quotient store pointer,
C r12 = dividend read pointer, r13 = fn.
	lea	(%rbp,%rbx,8), %rbp	C rbp = &qp[un+fn-3]
	mov	%rbx, %rcx		C rcx = un+fn-3
	mov	%r9, %rax		C rax = high remainder limb
	mov	%r10, %rsi		C rsi = low remainder limb
	ALIGN(16)
L(loop):
C q:q0 = di*rax + rax:rsi is the candidate quotient and its low limb.
	mov	%rax, %r14		C		0, 19
	mul	%rdi			C		0
	mov	%r11, %r9		C		1
	add	%rsi, %rax		C		4
	mov	%rax, %rbx		C q0		5
	adc	%r14, %rdx		C q		5
	lea	1(%rdx), %r10		C r10 = q+1	6
	mov	%rdx, %rax		C		6
	imul	%rdx, %r9		C r9 = q*d1	6
	sub	%r9, %rsi		C		10
C Bring in the next low limb: zero while rcx < fn and otherwise the next
C dividend limb from memory.
	xor	R32(%r9), R32(%r9)	C
	mul	%r8			C rdx:rax = q*d0	7
	cmp	%rcx, %r13		C
	jg	L(13)			C fn > rcx: keep the zero limb
	mov	(%r12), %r9		C
	sub	$8, %r12		C
C Form the remainder for quotient q+1: subtract d1:d0 and then q*d0.
L(13):	sub	%r8, %r9		C		ncp
	sbb	%r11, %rsi		C		11
	sub	%rax, %r9		C		11
	sbb	%rdx, %rsi		C		12
C When rsi >= q0 (unsigned) the mask is all ones: take q+1 back down by one
C and add d1:d0 back.  The new high remainder limb ends up in rax.
	cmp	%rbx, %rsi		C		13
	sbb	%rax, %rax		C		14
	not	%rax			C mask		15
	add	%rax, %r10		C		16
	mov	%r8, %rbx		C		ncp
	and	%rax, %rbx		C d0 or 0	16
	and	%r11, %rax		C d1 or 0	16
	add	%rbx, %r9		C		17
	adc	%rsi, %rax		C		18
	cmp	%rax, %r11		C		19
	jbe	L(fix)			C high limb >= d1: remainder may be >= d
L(bck):	mov	%r10, (%rbp)		C store the quotient limb
	sub	$8, %rbp		C
	mov	%r9, %rsi		C		18
	dec	%rcx
	jns	L(loop)

	mov	%rsi, %r10		C hand the remainder to the common exit
	mov	%rax, %r9
')

C Common exit.  r12 has stepped down one limb for every dividend limb read;
C once all un-2 limbs below the top two are consumed it equals up-8, so the
C two stores below land on up[0] and up[1].
L(6):
	mov	%r10, 8(%r12)		C low remainder limb
	mov	%r9, 16(%r12)		C high remainder limb
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	mov	%r15, %rax		C return the top quotient limb
	pop	%r15
	ret

C Top two dividend limbs were >= d: the top quotient limb is 1 and d is
C subtracted from them before the main division starts.
L(23):	inc	R32(%r15)
	sub	%r8, %r10
	sbb	%r11, %r9
	jmp	L(2)

C Out-of-line fix-up, entered with the flags of the final compare of the
C high remainder limb against d1.  When the remainder is still >= d1:d0 the
C quotient limb is raised by one and d subtracted once more.
ifdef(`NEW',`
L(fix):	seta	%dl			C dl = (rbx > d1)
	cmp	%r8, %r14
	setae	%al			C al = (r14 >= d0)
	orb	%dl, %al
	je	L(bck)			C rbx == d1 and r14 < d0: nothing to fix
	inc	%rdi
	sub	%r8, %r14
	sbb	%r11, %rbx
	jmp	L(bck)
',`
L(fix):	jb	L(88)			C d1 < rax: certainly >= d
	cmp	%r8, %r9
	jb	L(bck)			C rax == d1 and r9 < d0: nothing to fix
L(88):	inc	%r10
	sub	%r8, %r9
	sbb	%r11, %rax
	jmp	L(bck)
')
EPILOGUE()