dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                    16-byte coaligned      unaligned
C                       cycles/limb        cycles/limb
C 7400,7410 (G4):           0.5               0.64
C 744x,745x (G4+):          0.75              0.82
C 970 (G5):                 0.78              1.02        (64-bit limbs)

C  STATUS
C   * Works for all sizes and alignments.

C  TODO
C   * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C     indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C     c/l for 970.
C   * Consider using VMX instructions also for head and tail, by using some
C     read-modify-write tricks.
C   * The VMX code is used from the smallest sizes it handles, but measurements
C     show a large speed bump at the cutoff points.  Small copying (perhaps
C     using some read-modify-write technique) should be optimized.
C   * Make an mpn_com_n based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',   eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR',  eval(32/GMP_LIMB_BYTES))
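C With 32-bit limbs a limb is 4 bytes, so one 16-byte vector register holds
C 4 limbs and a 2-vector loop iteration moves 8; with 64-bit limbs the
C corresponding counts are 8 bytes, 2 limbs, and 4 limbs.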


ifelse(GMP_LIMB_BITS,32,`
        define(`LIMB32',`       $1')
        define(`LIMB64',`')
',`
        define(`LIMB32',`')
        define(`LIMB64',`       $1')
')
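
C In a 32-bit limb build the LIMB32 wrapper emits its argument and LIMB64
C emits nothing; in a 64-bit build the roles are reversed.  This lets the
C 32-bit and 64-bit instruction sequences share one source file.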

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`us', `v4')
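
C This implements the C-level operation
C   void mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C copying n limbs from up to rp in decreasing address order, which makes it
C safe for overlapping operands when rp >= up.  Per the PowerPC ABIs, the
C first three integer arguments arrive in r3, r4, and r5.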


ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi.   r0, n, 2        ')      C r0 = n in bytes; cr0 tests n == 0
LIMB64(`sldi.   r0, n, 3        ')      C r0 = n in bytes; cr0 tests n == 0
        add     rp, rp, r0              C point rp past the last limb
        add     up, up, r0              C point up past the last limb

LIMB32(`cmpi    cr7, n, 11      ')
LIMB64(`cmpdi   cr7, n, 5       ')
        bge     cr7, L(big)             C big enough for the VMX path?

        beqlr   cr0                     C n == 0, nothing to copy
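
C Control flow: small n takes the plain limb loop at L(topS).  Larger n
C first copies limbs at L(top0) until rp is 16-byte aligned, then runs a
C 32-byte-per-iteration vector loop, L(lpa) when up is coaligned with rp
C and L(lpu) (lvsl/vperm) otherwise, and finally copies the few leftover
C limbs at L(top2).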

C Handle small cases with plain operations
        mtctr   n                       C loop n times
L(topS):
LIMB32(`lwz     r0, -4(up)      ')
LIMB64(`ld      r0, -8(up)      ')
        addi    up, up, -GMP_LIMB_BYTES
LIMB32(`stw     r0, -4(rp)      ')
LIMB64(`std     r0, -8(rp)      ')
        addi    rp, rp, -GMP_LIMB_BYTES
        bdnz    L(topS)
        blr

C Handle large cases with VMX operations
L(big):
        addi    rp, rp, -16             C step rp back one vector (16 bytes)
        addi    up, up, -16             C step up back one vector (16 bytes)
        mfspr   r12, 256                C save VRSAVE
        oris    r0, r12, 0xf800         C set VRSAVE bits 0-4 (v0-v4 in use)
        mtspr   256, r0

LIMB32(`rlwinm. r7, rp, 30,30,31')      C r7 = (rp >> 2) mod 4
LIMB64(`rlwinm. r7, rp, 29,31,31')      C r7 = (rp >> 3) mod 2
        beq     L(rp_aligned)
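
C The rlwinm above computes how many limbs sit between rp and the next lower
C 16-byte boundary.  Worked example with 32-bit limbs: if, after the
C adjustments above, rp mod 16 = 8, then r7 = 2 and L(top0) copies the two
C highest remaining limbs, after which rp is 16-byte aligned.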

        subf    n, r7, n                C n -= limbs peeled off here
C Copy the topmost limbs one at a time until rp is 16-byte aligned.  With
C 64-bit limbs at most one limb is needed, so the branch is 32-bit only.
L(top0):
LIMB32(`lwz     r0, 12(up)      ')
LIMB64(`ld      r0, 8(up)       ')
        addi    up, up, -GMP_LIMB_BYTES
LIMB32(`addic.  r7, r7, -1      ')
LIMB32(`stw     r0, 12(rp)      ')
LIMB64(`std     r0, 8(rp)       ')
        addi    rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne     L(top0) ')

L(rp_aligned):

LIMB32(`rlwinm. r0, up, 30,30,31')      C (up >> 2) mod 4
LIMB64(`rlwinm. r0, up, 29,31,31')      C (up >> 3) mod 2

LIMB64(`srdi    r7, n, 2        ')      C loop count: n / LIMBS_PER_2VR
LIMB32(`srwi    r7, n, 3        ')      C loop count: n / LIMBS_PER_2VR
        mtctr   r7                      C move loop count into CTR

        li      r10, -16                C offset for the second vector of a pair

        beq     L(up_aligned)           C cr0 still holds the up alignment test

        lvsl    us, 0, up               C permute mask for up's misalignment
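
C lvsl yields the AltiVec permute control {s, s+1, ..., s+15}, where
C s = up mod 16.  Given the two aligned vectors that straddle an unaligned
C address a, vLo from a & ~15 and vHi from (a & ~15) + 16, the instruction
C   vperm vD, vLo, vHi, us
C selects 16 consecutive bytes of the concatenation vLo||vHi starting at
C offset s, i.e. it reconstructs the unaligned 16 bytes at a.  The loop
C below keeps the previously loaded vector around, so each vperm needs
C only one fresh load.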

        addi    up, up, 16              C point up just past the remaining limbs
LIMB32(`andi.   r0, n, 0x4      ')      C odd number of 16-byte blocks left?
LIMB64(`andi.   r0, n, 0x2      ')      C odd number of 16-byte blocks left?
        beq     L(1)
        lvx     v0, 0, up               C load the two vectors straddling...
        lvx     v2, r10, up             C ...the topmost unaligned block
        vperm   v3, v2, v0, us
        stvx    v3, 0, rp               C copy the odd block
        addi    up, up, -32
        addi    rp, rp, -16
        b       L(lpu)
L(1):   lvx     v2, 0, up               C prime v2 for the main loop
        addi    up, up, -16
        b       L(lpu)
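
C The 2-way unrolled loop below is software pipelined: v2 always carries the
C aligned vector just above the current read position, loaded either by the
C priming code above or by the previous iteration, so each vperm pairs one
C fresh load with a vector already in registers.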

        ALIGN(32)
L(lpu): lvx     v0, 0, up
        vperm   v3, v0, v2, us          C extract the 16 bytes at up
        stvx    v3, 0, rp
        lvx     v2, r10, up
        addi    up, up, -32
        vperm   v3, v2, v0, us          C ...and the 16 bytes just below
        stvx    v3, r10, rp
        addi    rp, rp, -32
        bdnz    L(lpu)

        b       L(tail)

L(up_aligned):

LIMB32(`andi.   r0, n, 0x4      ')      C odd number of 16-byte blocks left?
LIMB64(`andi.   r0, n, 0x2      ')      C odd number of 16-byte blocks left?
        beq     L(lpa)
        lvx     v0, 0, up               C copy the odd block...
        stvx    v0, 0, rp               C ...before entering the main loop
        addi    up, up, -16
        addi    rp, rp, -16
        b       L(lpa)

        ALIGN(32)
L(lpa): lvx     v0, 0, up
        lvx     v1, r10, up
        addi    up, up, -32
        nop
        stvx    v0, 0, rp
        stvx    v1, r10, rp
        addi    rp, rp, -32
        bdnz    L(lpa)

L(tail):
LIMB32(`rlwinm. r7, n, 0,30,31  ')      C r7 = n mod 4 (leftover limbs)
LIMB64(`rlwinm. r7, n, 0,31,31  ')      C r7 = n mod 2 (leftover limbs)
        beq     L(ret)
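
C At most three 32-bit limbs (or a single 64-bit limb) remain here, so the
C 32-bit code walks the r10 byte offset down from 12, while the 64-bit code
C needs just one load/store and no loop.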
LIMB32(`li      r10, 12         ')      C offset of the topmost leftover limb
L(top2):
LIMB32(`lwzx    r0, r10, up     ')
LIMB64(`ld      r0, 8(up)       ')
LIMB32(`addic.  r7, r7, -1      ')
LIMB32(`stwx    r0, r10, rp     ')
LIMB64(`std     r0, 8(rp)       ')
LIMB32(`addi    r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne     L(top2) ')

L(ret): mtspr   256, r12                C restore VRSAVE
        blr
EPILOGUE()