| dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. |
| |
| dnl Copyright 2007, 2008 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C norm frac |
| C K8 20 20 |
| C P4 73 73 |
| C P6-15 37 37 |
| |
| C TODO |
| C * Perhaps compute the inverse without relying on divq? Could either use |
| C Newton's method and mulq, or perhaps the faster fdiv. |
| C * The loop has not been carefully tuned, nor analysed for critical path |
| C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for |
| C mpn_divrem_1. |
| C * Clean up. This code is really crude. |
| |
| |
| C INPUT PARAMETERS |
| C m4 names for the five argument registers, in the order rdi, rsi, rdx, rcx, r8. |
| C NOTE(review): the function body in this file refers to these registers by |
| C their raw names and never expands these macros. |
| define(`qp', `%rdi') |
| define(`fn', `%rsi') |
| define(`up_param', `%rdx') |
| define(`un_param', `%rcx') |
| define(`dp', `%r8') |
| |
| C NOTE(review): the body keeps the computed reciprocal in rdi, and copies it to |
| C r9 only in the loop variant selected by NEW; in the other variant r9 is |
| C reused as a temporary. Confirm before relying on this name. |
| define(`dinv', `%r9') |
| |
| |
| C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 |
| C cnt qp d dinv |
| |
| ASM_START() |
| TEXT |
| ALIGN(16) |
| C mpn_divrem_2 |
| C |
| C Arguments as named by the defines under INPUT PARAMETERS: |
| C   rdi = qp, rsi = fn, rdx = up, rcx = un, r8 = dp |
| C The divisor limbs are read as d0 = dp[0] and d1 = dp[1]. |
| C Quotient limbs are stored downward starting at qp[un+fn-3]. |
| C The final two remainder limbs are written back through r12, which is |
| C derived from up. |
| C Return value in rax: the contents of r15, which is 0, or 1 when the top two |
| C dividend limbs were not smaller than the divisor on entry. |
| C Saves and restores rbx, rbp, r12, r13, r14 and r15. |
| PROLOGUE(mpn_divrem_2) |
| |
| C Register saves are interleaved with the operand loads. |
| C   rax = up + 8*un, the address just past the top dividend limb |
| C   r13 = fn |
| C   r12 = address of up[un-3], the next dividend limb to be consumed |
| C   rbp = qp |
| C   r11 = d1, r8 = d0 |
| C   r9  = up[un-1], r10 = up[un-2]: the running remainder, high and low |
| push %r15 |
| lea (%rdx,%rcx,8), %rax |
| push %r14 |
| push %r13 |
| mov %rsi, %r13 |
| push %r12 |
| lea -24(%rax), %r12 |
| push %rbp |
| mov %rdi, %rbp |
| push %rbx |
| mov 8(%r8), %r11 |
| mov -8(%rax), %r9 |
| mov (%r8), %r8 |
| mov -16(%rax), %r10 |
| C r15 = 0 is the default return value. |
| xor R32(%r15), R32(%r15) |
| C Test whether the remainder r9:r10 is >= the divisor d1:d0. |
| C ja skips the test when d1 > r9. Otherwise dl = (d1 < r9) and |
| C al = (d0 <= r10); if either is set, L(23) subtracts the divisor once. |
| cmp %r9, %r11 |
| ja L(2) |
| setb %dl |
| cmp %r10, %r8 |
| setbe %al |
| or %al, %dl |
| jne L(23) |
| L(2): |
| C rbx is the index of the top quotient limb to produce. When it is negative |
| C there is nothing to divide and only the remainder is stored. |
| lea -3(%rcx,%r13), %rbx C un + fn - 3 |
| test %rbx, %rbx |
| js L(6) |
| C Compute a reciprocal of d1 with a single div. The dividend rdx:rax has |
| C high word ~d1 and low word all ones. div requires rdx < d1, that is |
| C ~d1 < d1, which holds when the top bit of d1 is set; the file header says |
| C the divisor is normalized. The quotient is kept in rdi. |
| mov %r11, %rdx |
| mov $-1, %rax |
| not %rdx |
| div %r11 |
| C Adjust the reciprocal to take d0 into account. rdx:r14 is a two-word signed |
| C accumulator: r14 starts as the low word of d1*di and rdx as -1, then d0 and |
| C the high word of di*d0 are added with carry into rdx. rcx is reused as a |
| C temporary here; un has already been folded into rbx. |
| mov %r11, %rdx |
| mov %rax, %rdi |
| imul %rax, %rdx |
| mov %rdx, %r14 |
| mul %r8 |
| mov %rdx, %rcx |
| mov $-1, %rdx |
| add %r8, %r14 |
| adc $0, %rdx |
| add %rcx, %r14 |
| adc $0, %rdx |
| js L(8) |
| C While the accumulator is non-negative, decrement di and subtract d1. |
| L(18): |
| dec %rdi |
| sub %r11, %r14 |
| sbb $0, %rdx |
| jns L(18) |
| L(8): |
| |
| C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 |
| C n2 un n1 dinv qp d0 d1 up fn msl |
| C n2 un -d1 n1 dinv XX XX |
| |
| C Register contents at this point, as established by the code above: |
| C   rbx = un+fn-3   rdi = di        rbp = qp |
| C   r8  = d0        r11 = d1        r13 = fn |
| C   r9  = remainder high limb       r10 = remainder low limb |
| C   r12 = address of the next dividend limb to load |
| C   r15 = return value |
| C Two loop variants follow. The first is assembled when the m4 symbol NEW is |
| C defined, the second otherwise. |
| ifdef(`NEW',` |
| C Loop registers in this variant: |
| C   rbp = quotient store pointer, starting at qp[un+fn-3] and moving down |
| C   rcx = limb index, counting down to 0 |
| C   rbx = remainder high limb, r14 = remainder low limb |
| C   r9  = di, rsi = -d1, r8 = d0, r11 = d1 |
| C   rdi = quotient limb being formed, r10 = low word of the quotient estimate |
| lea (%rbp,%rbx,8), %rbp |
| mov %rbx, %rcx C un |
| mov %r9, %rbx |
| mov %rdi, %r9 C di |
| mov %r10, %r14 |
| mov %r11, %rsi |
| neg %rsi C -d1 |
| ALIGN(16) |
| L(loop): |
| C Quotient estimate: rdx:rax = di * high + high:low, giving q in rdx and |
| C q0 in rax. |
| mov %r9, %rax C di ncp |
| mul %rbx C 0, 18 |
| add %r14, %rax C 4 |
| mov %rax, %r10 C q0 5 |
| adc %rbx, %rdx C 5 |
| mov %rdx, %rdi C q 6 |
| C rbx = old low limb - q*d1, which becomes the new high limb. |
| imul %rsi, %rdx C 6 |
| mov %r8, %rax C ncp |
| lea (%rdx, %r14), %rbx C n1 -= ... 7 |
| mul %rdi C 7 |
| C New low limb: zero while fn > rcx (signed compare), otherwise the next |
| C dividend limb, after which r12 steps down by one limb. |
| xor R32(%r14), R32(%r14) C |
| cmp %rcx, %r13 C |
| jg L(19) C |
| mov (%r12), %r14 C |
| sub $8, %r12 C |
| C Subtract the divisor d1:d0 and then q*d0 from rbx:r14. Together with the |
| C lea above this forms the remainder for a quotient limb of q+1. |
| L(19): sub %r8, %r14 C ncp |
| sbb %r11, %rbx C 9 |
| sub %rax, %r14 C 11 |
| sbb %rdx, %rbx C 12 |
| inc %rdi C 7 |
| C rdx becomes a mask: all ones when rbx >= q0 (unsigned), else zero. With the |
| C mask set, the increment of q is undone and the divisor is added back. |
| xor R32(%rdx), R32(%rdx) C |
| cmp %r10, %rbx C 13 |
| mov %r8, %rax C d0 ncp |
| adc $-1, %rdx C mask 14 |
| add %rdx, %rdi C q-- 15 |
| and %rdx, %rax C d0 or 0 15 |
| and %r11, %rdx C d1 or 0 15 |
| add %rax, %r14 C 16 |
| adc %rdx, %rbx C 16 |
| C If the high remainder limb is >= d1, L(fix) decides whether one more |
| C subtraction of the divisor is needed. The flags are consumed there. |
| cmp %r11, %rbx C 17 |
| jae L(fix) C |
| C Store the quotient limb and step to the next lower limb. |
| L(bck): mov %rdi, (%rbp) C |
| sub $8, %rbp C |
| dec %rcx |
| jns L(loop) |
| |
| C Move the final remainder into r9:r10 for the shared exit code at L(6). |
| mov %r14, %r10 |
| mov %rbx, %r9 |
| ',` |
| C Loop registers in this variant: |
| C   rbp = quotient store pointer, starting at qp[un+fn-3] and moving down |
| C   rcx = limb index, counting down to 0 |
| C   rax = remainder high limb at the loop top, rsi = remainder low limb |
| C   rdi = di, r8 = d0, r11 = d1 |
| C   r10 = quotient limb being formed, rbx = low word of the quotient estimate |
| C   r9  = temporary, then the new low remainder limb |
| lea (%rbp,%rbx,8), %rbp |
| mov %rbx, %rcx |
| mov %r9, %rax |
| mov %r10, %rsi |
| ALIGN(16) |
| L(loop): |
| C Quotient estimate: rdx:rax = di * high + high:low, giving q in rdx and |
| C q0 in rax. r10 = q+1 is the candidate quotient limb. |
| mov %rax, %r14 C 0, 19 |
| mul %rdi C 0 |
| mov %r11, %r9 C 1 |
| add %rsi, %rax C 4 |
| mov %rax, %rbx C q0 5 |
| adc %r14, %rdx C q 5 |
| lea 1(%rdx), %r10 C 6 |
| mov %rdx, %rax C 6 |
| C rsi = old low limb - q*d1, which becomes the new high limb. |
| imul %rdx, %r9 C 6 |
| sub %r9, %rsi C 10 |
| C New low limb in r9: zero while fn > rcx (signed compare), otherwise the |
| C next dividend limb, after which r12 steps down by one limb. |
| xor R32(%r9), R32(%r9) C |
| mul %r8 C 7 |
| cmp %rcx, %r13 C |
| jg L(13) C |
| mov (%r12), %r9 C |
| sub $8, %r12 C |
| C Subtract the divisor d1:d0 and then q*d0 from rsi:r9. Together with the |
| C imul and sub above this forms the remainder for a quotient limb of q+1. |
| L(13): sub %r8, %r9 C ncp |
| sbb %r11, %rsi C 11 |
| sub %rax, %r9 C 11 |
| sbb %rdx, %rsi C 12 |
| C rax becomes a mask: all ones when rsi >= q0 (unsigned), else zero. With the |
| C mask set, r10 drops back to q and the divisor is added back. The new high |
| C remainder limb ends up in rax. |
| cmp %rbx, %rsi C 13 |
| sbb %rax, %rax C 14 |
| not %rax C 15 |
| add %rax, %r10 C 16 |
| mov %r8, %rbx C ncp |
| and %rax, %rbx C 16 |
| and %r11, %rax C 16 |
| add %rbx, %r9 C 17 |
| adc %rsi, %rax C 18 |
| C If d1 <= the high remainder limb, L(fix) decides whether one more |
| C subtraction of the divisor is needed. The flags are consumed there. |
| cmp %rax, %r11 C 19 |
| jbe L(fix) C |
| C Store the quotient limb and step to the next lower limb. |
| L(bck): mov %r10, (%rbp) C |
| sub $8, %rbp C |
| mov %r9, %rsi C 18 |
| dec %rcx |
| jns L(loop) |
| |
| C Move the final remainder into r9:r10 for the shared exit code at L(6). |
| mov %rsi, %r10 |
| mov %rax, %r9 |
| ') |
| L(6): |
| C Store the remainder: low limb at 8(%r12), high limb at 16(%r12). r12 began |
| C at up[un-3] and moved down one limb per dividend limb loaded, which works |
| C out to these two stores landing on up[0] and up[1]. |
| mov %r10, 8(%r12) |
| mov %r9, 16(%r12) |
| C Restore the saved registers in reverse push order and return r15 in rax. |
| pop %rbx |
| pop %rbp |
| pop %r12 |
| pop %r13 |
| pop %r14 |
| mov %r15, %rax |
| pop %r15 |
| ret |
| |
| C The top two dividend limbs were >= the divisor: set the return value to 1 |
| C and subtract the divisor once from r9:r10 before continuing at L(2). |
| L(23): inc R32(%r15) |
| sub %r8, %r10 |
| sbb %r11, %r9 |
| jmp L(2) |
| |
| ifdef(`NEW',` |
| C Reached with the flags of cmp r11,rbx still live and rbx >= d1. |
| C dl = (rbx > d1), al = (r14 >= d0). If neither holds the remainder is below |
| C the divisor and nothing changes; otherwise bump the quotient limb and |
| C subtract the divisor once more. |
| L(fix): seta %dl |
| cmp %r8, %r14 |
| setae %al |
| orb %dl, %al |
| je L(bck) |
| inc %rdi |
| sub %r8, %r14 |
| sbb %r11, %rbx |
| jmp L(bck) |
| ',` |
| C Reached with the flags of cmp rax,r11 still live and d1 <= rax. |
| C jb means d1 < rax, so the adjustment is always needed. Otherwise rax equals |
| C d1 and the adjustment is needed only when r9 >= d0. The adjustment bumps |
| C the quotient limb and subtracts the divisor once more. |
| L(fix): jb L(88) |
| cmp %r8, %r9 |
| jb L(bck) |
| L(88): inc %r10 |
| sub %r8, %r9 |
| sbb %r11, %rax |
| jmp L(bck) |
| ') |
| EPILOGUE() |