| dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division. |
| |
| dnl Copyright 2006 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C POWER3/PPC630: 13-19 |
| C POWER4/PPC970: 16 |
| C POWER5: 16 |
| |
| C TODO |
| C * Check if n=1 code is really an improvement. It probably isn't. |
| C * Perhaps remove L(norm) code, it is currently unreachable. |
| C * Make more similar to mode1o.asm. |
| |
| C INPUT PARAMETERS |
| C rp (r3): destination pointer; quotient limbs are stored through it. |
| define(`rp', `r3') |
| C up (r4): source pointer; dividend limbs are loaded through it. |
| define(`up', `r4') |
| C n (r5): limb count; decremented once on entry and then moved to CTR. |
| C Note that r5 is reused as a scratch register after that point. |
| define(`n', `r5') |
| C d (r6): divisor limb; shifted right in place when its low bit is clear. |
| define(`d', `r6') |
| |
| |
| ASM_START() |
| |
| C Byte table indexed below with bits 1..7 of the (odd) divisor to seed the |
| C inverse computation. Presumably defined elsewhere in the library. |
| EXTERN(binvert_limb_table) |
| |
| C mpn_divexact_1 -- exact division of the n-limb number at up by the limb d, |
| C storing n quotient limbs at rp. |
| C |
| C Outline: |
| C 1. n = 1 is handled directly with a hardware divide. |
| C 2. Otherwise trailing zero bits of d are counted and stripped (r10 = count). |
| C 3. r7 = inverse of the odd d modulo 2^64, from a table seed refined by |
| C three Newton steps. |
| C 4. Each quotient limb is (shifted source limb - borrow limb - CA borrow) |
| C times r7; the next borrow limb is the high half of quotient limb times d. |
| C |
| C Register roles in the main loop: |
| C r7 inverse of d r10 right shift count r8 64 - r10 |
| C r12 current source limb r5 borrow limb from previous step |
| C CTR n - 1 iterations CA borrow chain between subfe instructions |
| PROLOGUE(mpn_divexact_1) |
| C n = n - 1, with cr0 recording whether the result is zero. |
| addic. n, n, -1 |
| C r12 = up[0], needed by every path below. |
| ld r12, 0(up) |
| bne cr0, L(2) |
| C Single limb case: rp[0] = up[0] / d using unsigned divide, then return. |
| divdu r0, r12, d |
| std r0, 0(rp) |
| blr |
| L(2): |
| C r0 = d & 1 (rotate by 0, keep only bit 63 in IBM numbering), setting cr0. |
| rldicl. r0, d, 0, 63 |
| C Default shift count of 0 for an odd divisor. |
| li r10, 0 |
| bne cr0, L(7) |
| C Even divisor: isolate the lowest set bit with d & -d, then turn its |
| C leading zero count into a bit index, i.e. the number of trailing zeros. |
| neg r0, d |
| and r0, d, r0 |
| cntlzd r0, r0 |
| subfic r0, r0, 63 |
| C r10 = shift count (low 32 bits of r0); d is shifted right to become odd. |
| rldicl r10, r0, 0, 32 |
| srd d, d, r0 |
| L(7): |
| C CTR = n - 1: the loop produces all but the last quotient limb. |
| mtctr n |
| C r5 (no longer needed as n) = address of the inverse seed table. |
| LEA( r5, binvert_limb_table) |
| C r11 = (d >> 1) & 0x7f, the table index (rotate left 63, keep low 7 bits). |
| rldicl r11, d, 63, 57 |
| C cmpdi cr7, r0, 0 |
| C r0 = seed byte from the table. Presumably an inverse of d valid to 8 bits, |
| C which the three doubling steps below would extend to 64 bits. |
| lbzx r0, r5, r11 |
| C Newton step 1: inv = 2*inv - inv*inv*d. |
| mulld r9, r0, r0 |
| sldi r0, r0, 1 |
| mulld r9, d, r9 |
| subf r0, r9, r0 |
| C Newton step 2, same formula (r5 is scratch here, table address is dead). |
| mulld r5, r0, r0 |
| sldi r0, r0, 1 |
| mulld r5, d, r5 |
| subf r0, r5, r0 |
| C Newton step 3, result goes to r7. |
| mulld r9, r0, r0 |
| sldi r0, r0, 1 |
| mulld r9, d, r9 |
| subf r7, r9, r0 C r7 = 1/d mod 2^64 |
| C beq cr7, L(norm) |
| C r8 = 64 - r10, the left shift that brings in bits from the next limb. |
| C NOTE(review): for odd d, r10 = 0 and r8 = 64; the loop then depends on |
| C sld by 64 yielding 0 -- confirm against the ISA description of sld. |
| subfic r8, r10, 64 C set carry as side effect |
| C No borrow limb before the first quotient limb. |
| li r5, 0 |
| |
| ALIGN(16) |
| L(loop0): |
| C r11 = low part of the shifted source limb, taken from the current limb. |
| srd r11, r12, r10 |
| C Fetch the next source limb and advance up. |
| ld r12, 8(up) |
| addi up, up, 8 |
| C Merge in the bits shifted down from the next limb. |
| sld r0, r12, r8 |
| or r11, r11, r0 |
| C r9 = r11 - r5 with borrow taken from and returned in CA. |
| subfe r9, r5, r11 |
| C Quotient limb = r9 * inverse (low 64 bits); store it and advance rp. |
| mulld r0, r7, r9 |
| std r0, 0(rp) |
| addi rp, rp, 8 |
| C r5 = high half of quotient limb * d, subtracted from the next limb. |
| mulhdu r5, r0, d |
| bdnz L(loop0) |
| |
| C Last limb: there is no following limb to shift bits in from. |
| srd r0, r12, r10 |
| subfe r0, r5, r0 |
| mulld r0, r7, r0 |
| std r0, 0(rp) |
| blr |
| |
| C L(norm) and L(loop1) are not reachable: the only branch to L(norm) is the |
| C commented-out beq above. The code is a variant of the loop without the |
| C limb shifting (it never uses r10 or r8). |
| ALIGN(16) |
| L(norm): |
| C First quotient limb = up[0] * inverse, stored at rp[0]. |
| mulld r11, r12, r7 |
| std r11, 0(rp) |
| ALIGN(16) |
| L(loop1): |
| C r5 = high half of previous quotient limb * d. |
| mulhdu r5, r11, d |
| C Fetch the next source limb and advance up. |
| ld r9, 8(up) |
| addi up, up, 8 |
| C r5 = source limb - r5 with borrow through CA; quotient limb = r5 * inverse. |
| subfe r5, r5, r9 |
| mulld r11, r7, r5 |
| C Store at the next destination slot, then advance rp. |
| std r11, 8(rp) |
| addi rp, rp, 8 |
| bdnz L(loop1) |
| blr |
| EPILOGUE() |
| ASM_END() |