| dnl IA-64 mpn_bdiv_dbm1c. |
| |
| dnl Copyright 2008, 2009 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C Itanium: 4 |
| C Itanium 2: 2 |
| |
| C TODO |
| C * Optimize feed-in and wind-down code, both for speed and code size. |
| |
| C INPUT PARAMETERS |
| C   rp (r32): destination; one 64-bit result is stored per source limb. |
| C   up (r33): source; limbs are fetched with post-incrementing ldf8. |
| C   n  (r34): limb count.  NOTE(review): the n-1 and n mod 4 arithmetic |
| C             below appears to assume n >= 1 -- confirm against callers. |
| C   bd (r35): multiplier applied to every source limb. |
| C   r36:      fifth argument, used as the initial running value. |
| define(`rp', `r32') |
| define(`up', `r33') |
| define(`n', `r34') |
| define(`bd', `r35') |
| |
| ASM_START() |
| C For each source limb the code forms the 128-bit product limb * bd, |
| C subtracts the low half from the running value, stores that difference to |
| C rp, then subtracts the high half and the borrow to form the next running |
| C value.  The last running value is returned in r8. |
| C The work is unrolled four ways: n mod 4 picks a feed-in sequence and |
| C ar.lc is loaded with (n-1)/4. |
| PROLOGUE(mpn_bdiv_dbm1c) |
| .prologue |
| .save ar.lc, r2 |
| .body |
| |
| C For the 32-bit ABI, addp4 converts the two pointers and zxt4 zero-extends n. |
| ifdef(`HAVE_ABI_32', |
| ` addp4 rp = 0, rp C M I |
| addp4 up = 0, up C M I |
| zxt4 n = n C I |
| ;; |
| ') |
| C r15 = initial running value (fifth argument); f9 = first source limb. |
| {.mmb |
| mov r15 = r36 C M I |
| ldf8 f9 = [up], 8 C M |
| nop.b 0 C B |
| } |
| C r16 = n-1, r2 = caller ar.lc (restored before the return), r14 = n mod 4. |
| .Lcommon: |
| {.mii |
| adds r16 = -1, n C M I |
| mov r2 = ar.lc C I0 |
| and r14 = 3, n C M I |
| ;; |
| } |
| C f6 = bd for the xma multiplies; r31 = (n-1)/4 is the loop count; |
| C p10, p11, p12 are set when n mod 4 is 0, 2, 3 respectively. |
| {.mii |
| setf.sig f6 = bd C M2 M3 |
| shr.u r31 = r16, 2 C I0 |
| cmp.eq p10, p0 = 0, r14 C M I |
| } |
| {.mii |
| nop.m 0 C M |
| cmp.eq p11, p0 = 2, r14 C M I |
| cmp.eq p12, p0 = 3, r14 C M I |
| ;; |
| } |
| C cmp.ne r0, r0 is always false, so this clears p6 and p8 and sets p7 and p9. |
| C NOTE(review): every path in the visible code appears to execute a cmp.ltu |
| C into p6/p7 before the first predicated sub, and p8/p9 are not referenced |
| C again, so these initialisations may be leftovers -- confirm before removing. |
| {.mii |
| cmp.ne p6, p7 = r0, r0 C M I |
| mov.i ar.lc = r31 C I0 |
| cmp.ne p8, p9 = r0, r0 C M I |
| } |
| C Dispatch on n mod 4; the n mod 4 == 1 case falls through to .Lb01. |
| {.bbb |
| (p10) br.dptk .Lb00 C B |
| (p11) br.dptk .Lb10 C B |
| (p12) br.dptk .Lb11 C B |
| ;; |
| } |
| |
| C Feed-in for n mod 4 == 1.  ar.lc holds (n-1)/4, so this br.cloop falls |
| C through only for n == 1: multiply the single limb held in f9, move the low |
| C and high product halves to r26/r27 and finish at .Lcj1. |
| .Lb01: br.cloop.dptk .grt1 |
| ;; |
| xma.l f38 = f9, f6, f0 |
| xma.hu f39 = f9, f6, f0 |
| ;; |
| getf.sig r26 = f38 |
| getf.sig r27 = f39 |
| br .Lcj1 |
| |
| C n >= 5: load four more limbs (f10..f13) and start the products of the first |
| C two limbs (f38/f39 from f9, f32/f33 from f10). |
| .grt1: ldf8 f10 = [r33], 8 |
| ;; |
| ldf8 f11 = [r33], 8 |
| ;; |
| ldf8 f12 = [r33], 8 |
| ;; |
| xma.l f38 = f9, f6, f0 |
| xma.hu f39 = f9, f6, f0 |
| ;; |
| ldf8 f13 = [r33], 8 |
| ;; |
| xma.l f32 = f10, f6, f0 |
| xma.hu f33 = f10, f6, f0 |
| br.cloop.dptk .grt5 |
| |
| C n == 5: no further loads.  Issue the remaining multiplies, move finished |
| C product halves to general registers, and join the wind-down at .Lcj5. |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| ;; |
| getf.sig r27 = f39 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| ;; |
| getf.sig r21 = f33 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| br .Lcj5 |
| |
| C n >= 9: same multiply and getf sequence as above, interleaved with loads |
| C of the next three limbs, then enter the main loop at .LL01. |
| .grt5: ldf8 f10 = [r33], 8 |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| ;; |
| getf.sig r27 = f39 |
| ldf8 f11 = [r33], 8 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| ;; |
| getf.sig r21 = f33 |
| ldf8 f12 = [r33], 8 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| br .LL01 |
| |
| C Feed-in for n mod 4 == 2.  The second limb is loaded into f13.  ar.lc holds |
| C (n-1)/4, so the br.cloop falls through only for n == 2. |
| .Lb10: ldf8 f13 = [r33], 8 |
| br.cloop.dptk .grt2 |
| ;; |
| |
| C n == 2: products of f9 and f13 go to r24/r25 and r26/r27 (low/high), |
| C then the last two wind-down steps run from .Lcj2. |
| xma.l f36 = f9, f6, f0 |
| xma.hu f37 = f9, f6, f0 |
| ;; |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| ;; |
| getf.sig r24 = f36 |
| ;; |
| getf.sig r25 = f37 |
| ;; |
| getf.sig r26 = f38 |
| ;; |
| getf.sig r27 = f39 |
| br .Lcj2 |
| |
| C n >= 6: load four more limbs while multiplying the first two.  f13 is |
| C reloaded only after its old value has been issued to the xma pair. |
| .grt2: ldf8 f10 = [r33], 8 |
| ;; |
| ldf8 f11 = [r33], 8 |
| ;; |
| xma.l f36 = f9, f6, f0 |
| xma.hu f37 = f9, f6, f0 |
| ;; |
| ldf8 f12 = [r33], 8 |
| ;; |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| ;; |
| ldf8 f13 = [r33], 8 |
| ;; |
| getf.sig r24 = f36 |
| xma.l f32 = f10, f6, f0 |
| xma.hu f33 = f10, f6, f0 |
| br.cloop.dptk .grt6 |
| |
| C n == 6: no further loads; finish priming and join the wind-down at .Lcj6. |
| getf.sig r25 = f37 |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| ;; |
| getf.sig r27 = f39 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| br .Lcj6 |
| |
| C n >= 10: same sequence with two more loads, then enter the loop at .LL10. |
| .grt6: getf.sig r25 = f37 |
| ldf8 f10 = [r33], 8 |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| ;; |
| getf.sig r27 = f39 |
| ldf8 f11 = [r33], 8 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| br .LL10 |
| |
| |
| C Feed-in for n mod 4 == 3.  The second and third limbs are loaded into f12 |
| C and f13.  ar.lc holds (n-1)/4, so the br.cloop falls through only for n == 3. |
| .Lb11: ldf8 f12 = [r33], 8 |
| ;; |
| ldf8 f13 = [r33], 8 |
| br.cloop.dptk .grt3 |
| ;; |
| |
| C n == 3: products of f9, f12, f13 go to r22/r23, r24/r25 and r26 (the high |
| C half of the last product is fetched into r27 at .Lcj3). |
| xma.l f34 = f9, f6, f0 |
| xma.hu f35 = f9, f6, f0 |
| ;; |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| ;; |
| getf.sig r23 = f35 |
| ;; |
| getf.sig r24 = f36 |
| ;; |
| getf.sig r25 = f37 |
| ;; |
| getf.sig r26 = f38 |
| br .Lcj3 |
| |
| C n >= 7: load four more limbs while multiplying the first three.  f12 and |
| C f13 are reloaded only after their old values have been issued to xma. |
| .grt3: ldf8 f10 = [r33], 8 |
| ;; |
| xma.l f34 = f9, f6, f0 |
| xma.hu f35 = f9, f6, f0 |
| ;; |
| ldf8 f11 = [r33], 8 |
| ;; |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| ;; |
| ldf8 f12 = [r33], 8 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| ;; |
| getf.sig r23 = f35 |
| ldf8 f13 = [r33], 8 |
| ;; |
| getf.sig r24 = f36 |
| xma.l f32 = f10, f6, f0 |
| xma.hu f33 = f10, f6, f0 |
| br.cloop.dptk .grt7 |
| |
| C n == 7: no further loads; join the wind-down at .Lcj7.  r22/r23 already |
| C hold the first product, so f34/f35 can be reused for the f11 product. |
| getf.sig r25 = f37 |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| br .Lcj7 |
| |
| C n >= 11: same sequence with one more load, then enter the loop at .LL11. |
| .grt7: getf.sig r25 = f37 |
| ldf8 f10 = [r33], 8 |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| br .LL11 |
| |
| |
| C Feed-in for n mod 4 == 0.  Limbs two to four are loaded into f11, f12, f13. |
| C ar.lc holds (n-1)/4, so the br.cloop falls through only for n == 4. |
| .Lb00: ldf8 f11 = [r33], 8 |
| ;; |
| ldf8 f12 = [r33], 8 |
| ;; |
| ldf8 f13 = [r33], 8 |
| br.cloop.dptk .grt4 |
| ;; |
| |
| C n == 4: multiply all four limbs; product halves for the first two limbs |
| C and the low half of the third are moved to r20..r24 here, the rest are |
| C fetched by the wind-down code starting at .Lcj4. |
| xma.l f32 = f9, f6, f0 |
| xma.hu f33 = f9, f6, f0 |
| ;; |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| ;; |
| getf.sig r21 = f33 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| ;; |
| getf.sig r23 = f35 |
| ;; |
| getf.sig r24 = f36 |
| br .Lcj4 |
| |
| C n >= 8: multiply the first four limbs while loading the next four into |
| C f10..f13.  Each of f11..f13 is reloaded only after its old value has been |
| C issued to xma. |
| .grt4: xma.l f32 = f9, f6, f0 |
| xma.hu f33 = f9, f6, f0 |
| ;; |
| ldf8 f10 = [r33], 8 |
| ;; |
| xma.l f34 = f11, f6, f0 |
| xma.hu f35 = f11, f6, f0 |
| ;; |
| ldf8 f11 = [r33], 8 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| xma.hu f37 = f12, f6, f0 |
| ;; |
| getf.sig r21 = f33 |
| ldf8 f12 = [r33], 8 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| xma.hu f39 = f13, f6, f0 |
| ;; |
| getf.sig r23 = f35 |
| ldf8 f13 = [r33], 8 |
| ;; |
| getf.sig r24 = f36 |
| xma.l f32 = f10, f6, f0 |
| xma.hu f33 = f10, f6, f0 |
| C n >= 12 enters the main loop at .LL00; n == 8 goes straight to the |
| C wind-down at .Lcj8. |
| br.cloop.dptk .LL00 |
| br .Lcj8 |
| |
| C *** MAIN LOOP START *** |
| C One iteration handles four limbs.  Register roles in the loop: |
| C   f10..f13           source limbs loaded ahead by ldf8 |
| C   f32/f33 .. f38/f39 low/high product halves of f10 .. f13 times f6 |
| C   r20/r21 .. r26/r27 the same halves after getf.sig |
| C   r15                running value |
| C   r16..r19           results (r15 minus a low half), stored to [r32] |
| C   p6/p7              borrow / no borrow from the most recent cmp.ltu |
| C Each step: cmp.ltu records whether r15 minus the low half borrows, the |
| C difference is stored, and the next r15 is difference minus high half, |
| C minus one more when p6 is set (the three-operand sub form). |
| C .LL00, .LL11, .LL10 and .LL01 are the entry points used by the feed-in code. |
| ALIGN(32) |
| .Ltop: |
| .pred.rel "mutex",p6,p7 |
| C .mfi |
| getf.sig r24 = f36 |
| xma.l f32 = f10, f6, f0 |
| (p6) sub r15 = r19, r27, 1 |
| C .mfi |
| st8 [r32] = r19, 8 |
| xma.hu f33 = f10, f6, f0 |
| (p7) sub r15 = r19, r27 |
| ;; |
| .LL00: |
| C .mfi |
| getf.sig r25 = f37 |
| nop.f 0 |
| cmp.ltu p6, p7 = r15, r20 |
| C .mib |
| ldf8 f10 = [r33], 8 |
| sub r16 = r15, r20 |
| nop.b 0 |
| ;; |
| |
| C .mfi |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| (p6) sub r15 = r16, r21, 1 |
| C .mfi |
| st8 [r32] = r16, 8 |
| xma.hu f35 = f11, f6, f0 |
| (p7) sub r15 = r16, r21 |
| ;; |
| .LL11: |
| C .mfi |
| getf.sig r27 = f39 |
| nop.f 0 |
| cmp.ltu p6, p7 = r15, r22 |
| C .mib |
| ldf8 f11 = [r33], 8 |
| sub r17 = r15, r22 |
| nop.b 0 |
| ;; |
| |
| C .mfi |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| (p6) sub r15 = r17, r23, 1 |
| C .mfi |
| st8 [r32] = r17, 8 |
| xma.hu f37 = f12, f6, f0 |
| (p7) sub r15 = r17, r23 |
| ;; |
| .LL10: |
| C .mfi |
| getf.sig r21 = f33 |
| nop.f 0 |
| cmp.ltu p6, p7 = r15, r24 |
| C .mib |
| ldf8 f12 = [r33], 8 |
| sub r18 = r15, r24 |
| nop.b 0 |
| ;; |
| |
| C .mfi |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| (p6) sub r15 = r18, r25, 1 |
| C .mfi |
| st8 [r32] = r18, 8 |
| xma.hu f39 = f13, f6, f0 |
| (p7) sub r15 = r18, r25 |
| ;; |
| .LL01: |
| C .mfi |
| getf.sig r23 = f35 |
| nop.f 0 |
| cmp.ltu p6, p7 = r15, r26 |
| C .mib |
| ldf8 f13 = [r33], 8 |
| sub r19 = r15, r26 |
| C Loop back while ar.lc is non-zero; br.cloop decrements it each time. |
| br.cloop.sptk.few .Ltop |
| C *** MAIN LOOP END *** |
| ;; |
| |
| C Wind-down: the same compare / subtract / store steps as the main loop but |
| C with no further loads.  The multiplies for limbs already held in f10..f13 |
| C are still issued in the first half; the second half only drains products. |
| C Labels .Lcj8 down to .Lcj1 are entry points for the feed-in code; the |
| C number in the label is the count of results still to be stored. |
| getf.sig r24 = f36 |
| xma.l f32 = f10, f6, f0 |
| (p6) sub r15 = r19, r27, 1 |
| st8 [r32] = r19, 8 |
| xma.hu f33 = f10, f6, f0 |
| (p7) sub r15 = r19, r27 |
| ;; |
| .Lcj8: getf.sig r25 = f37 |
| cmp.ltu p6, p7 = r15, r20 |
| sub r16 = r15, r20 |
| ;; |
| getf.sig r26 = f38 |
| xma.l f34 = f11, f6, f0 |
| (p6) sub r15 = r16, r21, 1 |
| st8 [r32] = r16, 8 |
| xma.hu f35 = f11, f6, f0 |
| (p7) sub r15 = r16, r21 |
| ;; |
| .Lcj7: getf.sig r27 = f39 |
| cmp.ltu p6, p7 = r15, r22 |
| sub r17 = r15, r22 |
| ;; |
| getf.sig r20 = f32 |
| xma.l f36 = f12, f6, f0 |
| (p6) sub r15 = r17, r23, 1 |
| st8 [r32] = r17, 8 |
| xma.hu f37 = f12, f6, f0 |
| (p7) sub r15 = r17, r23 |
| ;; |
| .Lcj6: getf.sig r21 = f33 |
| cmp.ltu p6, p7 = r15, r24 |
| sub r18 = r15, r24 |
| ;; |
| getf.sig r22 = f34 |
| xma.l f38 = f13, f6, f0 |
| (p6) sub r15 = r18, r25, 1 |
| st8 [r32] = r18, 8 |
| xma.hu f39 = f13, f6, f0 |
| (p7) sub r15 = r18, r25 |
| ;; |
| C From here on all multiplies have been issued. |
| .Lcj5: getf.sig r23 = f35 |
| cmp.ltu p6, p7 = r15, r26 |
| sub r19 = r15, r26 |
| ;; |
| getf.sig r24 = f36 |
| (p6) sub r15 = r19, r27, 1 |
| st8 [r32] = r19, 8 |
| (p7) sub r15 = r19, r27 |
| ;; |
| .Lcj4: getf.sig r25 = f37 |
| cmp.ltu p6, p7 = r15, r20 |
| sub r16 = r15, r20 |
| ;; |
| getf.sig r26 = f38 |
| (p6) sub r15 = r16, r21, 1 |
| st8 [r32] = r16, 8 |
| (p7) sub r15 = r16, r21 |
| ;; |
| .Lcj3: getf.sig r27 = f39 |
| cmp.ltu p6, p7 = r15, r22 |
| sub r17 = r15, r22 |
| ;; |
| (p6) sub r15 = r17, r23, 1 |
| st8 [r32] = r17, 8 |
| (p7) sub r15 = r17, r23 |
| ;; |
| .Lcj2: cmp.ltu p6, p7 = r15, r24 |
| sub r18 = r15, r24 |
| ;; |
| (p6) sub r15 = r18, r25, 1 |
| st8 [r32] = r18, 8 |
| (p7) sub r15 = r18, r25 |
| ;; |
| C Last limb: the final store does not post-increment, and the final running |
| C value is written to r8 (the return value) instead of r15. |
| .Lcj1: cmp.ltu p6, p7 = r15, r26 |
| sub r19 = r15, r26 |
| ;; |
| (p6) sub r8 = r19, r27, 1 |
| st8 [r32] = r19 |
| (p7) sub r8 = r19, r27 |
| C Restore the ar.lc value saved in r2 at entry, then return. |
| mov ar.lc = r2 |
| br.ret.sptk.many b0 |
| EPILOGUE() |
| ASM_END() |