| dnl IA-64 mpn_lshift/mpn_rshift. |
| |
| dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, |
| dnl Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C Itanium: 2.0 |
| C Itanium 2: 1.0 |
| |
| C This code is scheduled deeply since the plain shift instructions shr and shl |
| C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of |
| C these instructions cause a 10 cycle replay trap on Itanium. |
| |
| C TODO |
| C * Optimize function entry and feed-in code. |
| |
| C INPUT PARAMETERS |
| define(`rp',`r32') |
| define(`up',`r33') |
| define(`n',`r34') |
| define(`cnt',`r35') |
| |
| define(`tnc',`r9') |
| |
| ifdef(`OPERATION_lshift',` |
| define(`FSH',`shl') |
| define(`BSH',`shr.u') |
| define(`UPD',`-8') |
| define(`POFF',`-512') |
| define(`PUPD',`-32') |
| define(`func',`mpn_lshift') |
| ') |
| ifdef(`OPERATION_rshift',` |
| define(`FSH',`shr.u') |
| define(`BSH',`shl') |
| define(`UPD',`8') |
| define(`POFF',`512') |
| define(`PUPD',`32') |
| define(`func',`mpn_rshift') |
| ') |
| |
| MULFUNC_PROLOGUE(mpn_lshift mpn_rshift) |
| |
| ASM_START() |
| PROLOGUE(func) |
| .prologue |
| .save ar.lc, r2 |
| .body |
| ifdef(`HAVE_ABI_32', |
| ` addp4 rp = 0, rp C M I |
| addp4 up = 0, up C M I |
| sxt4 n = n C M I |
| zxt4 cnt = cnt C I |
| ;; |
| ') |
| |
| {.mmi; cmp.lt p14, p15 = 4, n C M I |
| and r14 = 3, n C M I |
| mov.i r2 = ar.lc C I0 |
| }{.mmi; add r15 = -1, n C M I |
| sub tnc = 64, cnt C M I |
| add r16 = -5, n |
| ;; |
| }{.mmi; cmp.eq p6, p0 = 1, r14 C M I |
| cmp.eq p7, p0 = 2, r14 C M I |
| shr.u n = r16, 2 C I0 |
| }{.mmi; cmp.eq p8, p0 = 3, r14 C M I |
| ifdef(`OPERATION_lshift', |
| ` shladd up = r15, 3, up C M I |
| shladd rp = r15, 3, rp') C M I |
| ;; |
| }{.mmi; add r11 = POFF, up C M I |
| ld8 r10 = [up], UPD C M01 |
| mov.i ar.lc = n C I0 |
| }{.bbb; |
| (p6) br.dptk .Lb01 |
| (p7) br.dptk .Lb10 |
| (p8) br.dptk .Lb11 |
| ;; |
| } |
| |
| .Lb00: ld8 r19 = [up], UPD |
| ;; |
| ld8 r16 = [up], UPD |
| ;; |
| ld8 r17 = [up], UPD |
| BSH r8 = r10, tnc C function return value |
| (p14) br.cond.dptk .grt4 |
| |
| FSH r24 = r10, cnt |
| BSH r25 = r19, tnc |
| ;; |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| ;; |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| ;; |
| or r14 = r25, r24 |
| FSH r22 = r17, cnt |
| BSH r23 = r10, tnc |
| br .Lr4 |
| |
| .grt4: FSH r24 = r10, cnt |
| BSH r25 = r19, tnc |
| ;; |
| ld8 r18 = [up], UPD |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| ;; |
| ld8 r19 = [up], UPD |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| ;; |
| ld8 r16 = [up], UPD |
| FSH r22 = r17, cnt |
| BSH r23 = r18, tnc |
| ;; |
| or r14 = r25, r24 |
| ld8 r17 = [up], UPD |
| br.cloop.dpnt .Ltop |
| br .Lbot |
| |
| .Lb01: |
| (p15) BSH r8 = r10, tnc C function return value I |
| (p15) FSH r22 = r10, cnt C I |
| (p15) br.cond.dptk .Lr1 C return B |
| |
| .grt1: ld8 r18 = [up], UPD |
| ;; |
| ld8 r19 = [up], UPD |
| BSH r8 = r10, tnc C function return value |
| ;; |
| ld8 r16 = [up], UPD |
| FSH r22 = r10, cnt |
| BSH r23 = r18, tnc |
| ;; |
| ld8 r17 = [up], UPD |
| br.cloop.dpnt .grt5 |
| ;; |
| |
| FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| ;; |
| or r15 = r23, r22 |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| ;; |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| br .Lr5 |
| |
| .grt5: FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| ;; |
| ld8 r18 = [up], UPD |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| ;; |
| ld8 r19 = [up], UPD |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| ;; |
| or r15 = r23, r22 |
| ld8 r16 = [up], UPD |
| br .LL01 |
| |
| |
| .Lb10: ld8 r17 = [up], UPD |
| (p14) br.cond.dptk .grt2 |
| |
| BSH r8 = r10, tnc C function return value |
| ;; |
| FSH r20 = r10, cnt |
| BSH r21 = r17, tnc |
| ;; |
| or r14 = r21, r20 |
| FSH r22 = r17, cnt |
| br .Lr2 C return |
| |
| .grt2: ld8 r18 = [up], UPD |
| BSH r8 = r10, tnc C function return value |
| ;; |
| ld8 r19 = [up], UPD |
| FSH r20 = r10, cnt |
| BSH r21 = r17, tnc |
| ;; |
| ld8 r16 = [up], UPD |
| FSH r22 = r17, cnt |
| BSH r23 = r18, tnc |
| ;; |
| ld8 r17 = [up], UPD |
| br.cloop.dpnt .grt6 |
| ;; |
| |
| or r14 = r21, r20 |
| FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| ;; |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| br .Lr6 |
| |
| .grt6: or r14 = r21, r20 |
| FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| ;; |
| ld8 r18 = [up], UPD |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| ;; |
| ld8 r19 = [up], UPD |
| br .LL10 |
| |
| |
| .Lb11: ld8 r16 = [up], UPD |
| ;; |
| ld8 r17 = [up], UPD |
| BSH r8 = r10, tnc C function return value |
| (p14) br.cond.dptk .grt3 |
| ;; |
| |
| FSH r26 = r10, cnt |
| BSH r27 = r16, tnc |
| ;; |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| ;; |
| or r15 = r27, r26 |
| FSH r22 = r17, cnt |
| br .Lr3 C return |
| |
| .grt3: ld8 r18 = [up], UPD |
| FSH r26 = r10, cnt |
| BSH r27 = r16, tnc |
| ;; |
| ld8 r19 = [up], UPD |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| ;; |
| ld8 r16 = [up], UPD |
| FSH r22 = r17, cnt |
| BSH r23 = r18, tnc |
| ;; |
| ld8 r17 = [up], UPD |
| br.cloop.dpnt .grt7 |
| |
| or r15 = r27, r26 |
| FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| br .Lr7 |
| |
| .grt7: or r15 = r27, r26 |
| FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| ld8 r18 = [up], UPD |
| br .LL11 |
| |
| C *** MAIN LOOP START *** |
| ALIGN(32) |
| .Ltop: |
| {.mmi; st8 [rp] = r14, UPD C M2 |
| or r15 = r27, r26 C M3 |
| FSH r24 = r18, cnt C I0 |
| }{.mmi; ld8 r18 = [up], UPD C M1 |
| lfetch [r11], PUPD |
| BSH r25 = r19, tnc C I1 |
| ;; } |
| .LL11: |
| {.mmi; st8 [rp] = r15, UPD |
| or r14 = r21, r20 |
| FSH r26 = r19, cnt |
| }{.mmi; ld8 r19 = [up], UPD |
| nop.m 0 |
| BSH r27 = r16, tnc |
| ;; } |
| .LL10: |
| {.mmi; st8 [rp] = r14, UPD |
| or r15 = r23, r22 |
| FSH r20 = r16, cnt |
| }{.mmi; ld8 r16 = [up], UPD |
| nop.m 0 |
| BSH r21 = r17, tnc |
| ;; } |
| .LL01: |
| {.mmi; st8 [rp] = r15, UPD |
| or r14 = r25, r24 |
| FSH r22 = r17, cnt |
| }{.mib; ld8 r17 = [up], UPD |
| BSH r23 = r18, tnc |
| br.cloop.dptk .Ltop |
| ;; } |
| |
| C *** MAIN LOOP END *** |
| |
| .Lbot: or r15 = r27, r26 |
| FSH r24 = r18, cnt |
| BSH r25 = r19, tnc |
| st8 [rp] = r14, UPD |
| ;; |
| .Lr7: or r14 = r21, r20 |
| FSH r26 = r19, cnt |
| BSH r27 = r16, tnc |
| st8 [rp] = r15, UPD |
| ;; |
| .Lr6: or r15 = r23, r22 |
| FSH r20 = r16, cnt |
| BSH r21 = r17, tnc |
| st8 [rp] = r14, UPD |
| ;; |
| .Lr5: st8 [rp] = r15, UPD |
| or r14 = r25, r24 |
| FSH r22 = r17, cnt |
| ;; |
| .Lr4: or r15 = r27, r26 |
| st8 [rp] = r14, UPD |
| ;; |
| .Lr3: or r14 = r21, r20 |
| st8 [rp] = r15, UPD |
| ;; |
| .Lr2: st8 [rp] = r14, UPD |
| ;; |
| .Lr1: st8 [rp] = r22, UPD C M23 |
| mov ar.lc = r2 C I0 |
| br.ret.sptk.many b0 C B |
| EPILOGUE(func) |
| ASM_END() |