| dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add |
| dnl the result to a second limb vector. |
| |
| dnl Copyright 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, |
| dnl Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C UltraSPARC 1&2: 14 |
| C UltraSPARC 3: 17.5 |
| |
| C Algorithm: We use eight floating-point multiplies per limb product, with the |
| C invariant v operand split into four 16-bit pieces, and the up operand split |
| C into 32-bit pieces. We sum pairs of 48-bit partial products using |
| C floating-point add, then convert the four 49-bit product-sums and transfer |
| C them to the integer unit. |
| |
| C Possible optimizations: |
| C 0. Rewrite to use algorithm of mpn_addmul_2. |
| C 1. Align the stack area where we transfer the four 49-bit product-sums |
| C to a 32-byte boundary. That would minimize the cache collision. |
| C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would |
| C be to align the area to map to the area immediately before up?) |
| C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the |
| C development version of mpn_addmul_2. This would save many integer instructions. |
| C 3. Unrolling. Questionable if it is worth the code expansion, given that |
| C it could only save 1 cycle/limb. |
| C 4. Specialize for particular v values. If its upper 32 bits are zero, we |
| C could save many operations, in the FPU (fmuld), but more so in the IEU |
| C since we'll be summing 48-bit quantities, which might be simpler. |
| C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and |
| C the i00,i16,i32,i48 RAW closer together. The latter apart-scheduling should |
| C not be greater than needed for L2 cache latency, and also not so great |
| C that i16 needs to be copied. |
| C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want |
| C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU |
| C ops.) |
| |
| C Instruction classification (as per UltraSPARC-1/2 functional units): |
| C 8 FM |
| C 10 FA |
| C 12 MEM |
| C 10 ISHIFT + 14 IADDLOG |
| C 1 BRANCH |
| C 55 insns in total (plus one mov insn that should be optimized out) |
| |
| C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we |
| C sustain the peak execution rate of 4 instructions/cycle. |
| |
| C INPUT PARAMETERS |
| C rp i0 |
| C up i1 |
| C n i2 |
| C v i3 |
| |
| C Assembler preamble. REGISTER is a macro from config.m4; it presumably emits |
| C a .register directive marking %g2 and %g3 as scratch -- confirm there. |
| ASM_START() |
| REGISTER(%g2,#scratch) |
| REGISTER(%g3,#scratch) |
| |
| C Symbolic names for the registers used by mpn_addmul_1. |
| C p00..p48: products of u00 with the four 16-bit pieces of v. |
| C r32..r80: products of u32 with the four 16-bit pieces of v. |
| C v00..v48: the four 16-bit pieces of v, converted to double. |
| C u00, u32: the low and high 32-bit halves of the current up limb, as double. |
| C a00..a48: product sums that are converted with fdtox and stored to the stack. |
| define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') |
| define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') |
| define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') |
| define(`u00',`%f32') define(`u32', `%f34') |
| define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') |
| C cy: carry propagated from limb to limb (and the final return value). |
| define(`cy',`%g1') |
| C rlimb: the limb most recently read from rp. |
| define(`rlimb',`%g3') |
| C i00..i48: the four product sums after being reloaded as 64-bit integers. |
| define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') |
| C Masks set up in the prologue: 2^32-1 and 2^16-1 respectively. |
| define(`xffffffff',`%l7') |
| define(`xffff',`%o0') |
| |
| C mpn_addmul_1: multiply the n-limb vector at up by the limb v, add the result |
| C to the n-limb vector at rp, and return the carry-out limb. After the save, |
| C the arguments are in %i0 (rp), %i1 (up), %i2 (n) and %i3 (v). |
| PROLOGUE(mpn_addmul_1) |
| |
| C Initialization. (1) Split v operand into four 16-bit chunks and store them |
| C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs |
| C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. |
| |
| C The scratch area is four 8-byte slots at %sp+2223+{0,8,16,24}. The constant |
| C 2223 is presumably 2047 (SPARC V9 stack bias) plus 176 bytes of reserved |
| C frame space -- confirm against the ABI. |
| save %sp, -256, %sp |
| mov -1, %g4 C all ones, shifted down below to form both masks |
| srlx %g4, 48, xffff C store mask in register `xffff' |
| and %i3, xffff, %g2 C bits 0..15 of v |
| stx %g2, [%sp+2223+0] |
| srlx %i3, 16, %g3 |
| and %g3, xffff, %g3 C bits 16..31 of v |
| stx %g3, [%sp+2223+8] |
| srlx %i3, 32, %g2 |
| and %g2, xffff, %g2 C bits 32..47 of v |
| stx %g2, [%sp+2223+16] |
| srlx %i3, 48, %g3 C bits 48..63 of v (no mask needed) |
| stx %g3, [%sp+2223+24] |
| srlx %g4, 32, xffffffff C store mask in register `xffffffff' |
| |
| C Turn n into a byte count, move rp and up to the end of their vectors and |
| C index them with the negative byte offset in %i2, which counts up to zero. |
| C The rp bases are biased by -16 (loads) and -32 (stores) because %i2 has |
| C already advanced that far when the pipelined code reaches the same limb. |
| sllx %i2, 3, %i2 |
| mov 0, cy C clear cy |
| add %i0, %i2, %i0 |
| add %i1, %i2, %i1 |
| neg %i2 |
| add %i1, 4, %i5 C up base + 4, used for the low-word loads |
| add %i0, -32, %i4 C rp base used by the stx of result limbs |
| add %i0, -16, %i0 C rp base used by the ldx into rlimb |
| |
| C Fetch the four v pieces back as 64-bit integers in fp registers. The word |
| C at slot 0 is the upper half of a value below 2^16, so it reads as zero. |
| ldd [%sp+2223+0], v00 |
| ldd [%sp+2223+8], v16 |
| ldd [%sp+2223+16], v32 |
| ldd [%sp+2223+24], v48 |
| ld [%sp+2223+0],%f2 C zero f2 |
| ld [%sp+2223+0],%f4 C zero f4 |
| ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
| ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
| fxtod v00, v00 C integer -> double, in place |
| fxtod v16, v16 |
| fxtod v32, v32 |
| fxtod v48, v48 |
| |
| C Start real work. (We sneakingly read f3 and f5 above...) |
| C The software pipeline is very deep, requiring 4 feed-in stages. |
| |
| C First feed-in stage: products for the first up limb. There is no previous |
| C limb, so the two lowest products go straight into a00 and a16. |
| fxtod %f2, u00 C f2:f3 = low half of the up limb |
| fxtod %f4, u32 C f4:f5 = high half of the up limb |
| fmuld u00, v00, a00 |
| fmuld u00, v16, a16 |
| fmuld u00, v32, p32 |
| fmuld u32, v00, r32 |
| fmuld u00, v48, p48 |
| addcc %i2, 8, %i2 |
| bnz,pt %icc, .L_two_or_more C offset not yet zero: more limbs to read |
| fmuld u32, v16, r48 C (delay slot, executed on both paths) |
| |
| C n = 1. Nothing more is read from up. Finish the product sums of the only |
| C limb, pass them through the stack slots and prepare hi64, mi64 and x for |
| C the final stage at .L_out_1. |
| .L_one: |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| fdtox a00, a00 |
| faddd p48, r48, a48 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| fdtox a32, a32 |
| fdtox a48, a48 |
| std a00, [%sp+2223+0] |
| std a16, [%sp+2223+8] |
| std a32, [%sp+2223+16] |
| std a48, [%sp+2223+24] |
| add %i2, 8, %i2 |
| |
| C Reload the four sums as integers, then reuse slots 0 and 8 for the converted |
| C r64 and r80, which are added into the returned carry after .L_out_1. |
| fdtox r64, a00 |
| ldx [%i0+%i2], rlimb C read rp[i] |
| fdtox r80, a16 |
| ldx [%sp+2223+0], i00 |
| ldx [%sp+2223+8], i16 |
| ldx [%sp+2223+16], i32 |
| ldx [%sp+2223+24], i48 |
| std a00, [%sp+2223+0] |
| std a16, [%sp+2223+8] |
| add %i2, 8, %i2 |
| |
| C Integer combination: |
| C hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32) |
| C mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48) |
| C x = cy + i00 + LO(rlimb) |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| add i00, %g5, %g5 C i00+ now in g5 |
| ldx [%sp+2223+0], i00 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| sllx i48, 32, %l6 C (i48 << 32) |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] C repeats the store made before the reload |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] C repeats the store made before the reload |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| add %l6, %o2, %o2 C mi64- in %o2 |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| b .L_out_1 |
| add %i2, 8, %i2 C (delay slot) |
| |
| C Second feed-in stage (n >= 2). Read the next up limb, finish and store the |
| C product sums of the previous limb, and start the products of the new limb. |
| C From here on the two highest products of the previous limb (r64, r80) are |
| C added to the two lowest products of the new limb (p00, p16). |
| .L_two_or_more: |
| ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
| fdtox a00, a00 |
| faddd p48, r48, a48 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| fdtox a32, a32 |
| fxtod %f2, u00 |
| fxtod %f4, u32 |
| fdtox a48, a48 |
| std a00, [%sp+2223+0] |
| fmuld u00, v00, p00 |
| std a16, [%sp+2223+8] |
| fmuld u00, v16, p16 |
| std a32, [%sp+2223+16] |
| fmuld u00, v32, p32 |
| std a48, [%sp+2223+24] |
| faddd p00, r64, a00 |
| fmuld u32, v00, r32 |
| faddd p16, r80, a16 |
| fmuld u00, v48, p48 |
| addcc %i2, 8, %i2 |
| bnz,pt %icc, .L_three_or_more C offset not yet zero: more limbs to read |
| fmuld u32, v16, r48 C (delay slot, executed on both paths) |
| |
| C n = 2. No more up limbs to read. Finish the sums of the second limb while |
| C reloading the sums of the first limb as integers, then prepare hi64, mi64 |
| C and x of the first limb for the wind-down at .L_out_2. |
| .L_two: |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| fdtox a00, a00 |
| ldx [%i0+%i2], rlimb C read rp[i] |
| faddd p48, r48, a48 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| ldx [%sp+2223+0], i00 |
| fdtox a32, a32 |
| ldx [%sp+2223+8], i16 |
| ldx [%sp+2223+16], i32 |
| ldx [%sp+2223+24], i48 |
| fdtox a48, a48 |
| std a00, [%sp+2223+0] |
| std a16, [%sp+2223+8] |
| std a32, [%sp+2223+16] |
| std a48, [%sp+2223+24] |
| add %i2, 8, %i2 |
| |
| C Integer combination for the first limb, interleaved with reloading the |
| C sums of the second limb and storing the converted r64 and r80: |
| C hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32) |
| C mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48) |
| C x = cy + i00 + LO(rlimb) |
| fdtox r64, a00 |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| ldx [%i0+%i2], rlimb C read rp[i] |
| add i00, %g5, %g5 C i00+ now in g5 |
| fdtox r80, a16 |
| ldx [%sp+2223+0], i00 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| ldx [%sp+2223+16], i32 |
| sllx i48, 32, %l6 C (i48 << 32) |
| ldx [%sp+2223+24], i48 |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| add %l6, %o2, %o2 C mi64- in %o2 |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| b .L_out_2 |
| add %i2, 8, %i2 C (delay slot) |
| |
| C Third feed-in stage (n >= 3). As the previous stage, and additionally the |
| C first rp limb is read and the oldest stored sums are reloaded as integers |
| C into i00..i48 before the slots are overwritten with the next set of sums. |
| .L_three_or_more: |
| ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
| fdtox a00, a00 |
| ldx [%i0+%i2], rlimb C read rp[i] |
| faddd p48, r48, a48 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| ldx [%sp+2223+0], i00 |
| fdtox a32, a32 |
| ldx [%sp+2223+8], i16 |
| fxtod %f2, u00 |
| ldx [%sp+2223+16], i32 |
| fxtod %f4, u32 |
| ldx [%sp+2223+24], i48 |
| fdtox a48, a48 |
| std a00, [%sp+2223+0] |
| fmuld u00, v00, p00 |
| std a16, [%sp+2223+8] |
| fmuld u00, v16, p16 |
| std a32, [%sp+2223+16] |
| fmuld u00, v32, p32 |
| std a48, [%sp+2223+24] |
| faddd p00, r64, a00 |
| fmuld u32, v00, r32 |
| faddd p16, r80, a16 |
| fmuld u00, v48, p48 |
| addcc %i2, 8, %i2 |
| bnz,pt %icc, .L_four_or_more C offset not yet zero: more limbs to read |
| fmuld u32, v16, r48 C (delay slot, executed on both paths) |
| |
| C n = 3. No more up limbs to read. Finish and store the sums of the last limb |
| C while forming hi64, mi64 and x of the first limb, then join the wind-down |
| C at .L_out_3. |
| C hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32) |
| C mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48) |
| C x = cy + i00 + LO(rlimb) |
| .L_three: |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| fdtox a00, a00 |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| ldx [%i0+%i2], rlimb C read rp[i] |
| faddd p48, r48, a48 |
| add i00, %g5, %g5 C i00+ now in g5 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| ldx [%sp+2223+0], i00 |
| fdtox a32, a32 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| ldx [%sp+2223+16], i32 |
| sllx i48, 32, %l6 C (i48 << 32) |
| ldx [%sp+2223+24], i48 |
| fdtox a48, a48 |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| std a32, [%sp+2223+16] |
| add %l6, %o2, %o2 C mi64- in %o2 |
| std a48, [%sp+2223+24] |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| b .L_out_3 |
| add %i2, 8, %i2 C (delay slot) |
| |
| C Fourth feed-in stage (n >= 4). This is the loop body without its first |
| C integer part: no result limb is completed or stored yet, but hi64, mi64 and |
| C x of the first limb are formed for the next stage. |
| C hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32) |
| C mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48) |
| C x = cy + i00 + LO(rlimb) |
| .L_four_or_more: |
| ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
| fdtox a00, a00 |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| ldx [%i0+%i2], rlimb C read rp[i] |
| faddd p48, r48, a48 |
| add i00, %g5, %g5 C i00+ now in g5 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| ldx [%sp+2223+0], i00 |
| fdtox a32, a32 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| fxtod %f2, u00 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| ldx [%sp+2223+16], i32 |
| fxtod %f4, u32 |
| sllx i48, 32, %l6 C (i48 << 32) |
| ldx [%sp+2223+24], i48 |
| fdtox a48, a48 |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] |
| fmuld u00, v00, p00 |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] |
| fmuld u00, v16, p16 |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| std a32, [%sp+2223+16] |
| fmuld u00, v32, p32 |
| add %l6, %o2, %o2 C mi64- in %o2 |
| std a48, [%sp+2223+24] |
| faddd p00, r64, a00 |
| fmuld u32, v00, r32 |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| faddd p16, r80, a16 |
| fmuld u00, v48, p48 |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| addcc %i2, 8, %i2 |
| bnz,pt %icc, .Loop C offset not yet zero: enter the main loop |
| fmuld u32, v16, r48 C (delay slot, executed on both paths) |
| |
| C n = 4: skip the main loop. The branch is annulled, so no delay slot |
| C instruction is executed. |
| .L_four: |
| b,a .L_out_4 |
| |
| C BEGIN MAIN LOOP |
| C Steady state of the software pipeline. One pass overlaps work on several |
| C limbs: |
| C - the halves of the next up limb are loaded into f3/f5 and converted to |
| C u00/u32; |
| C - the product sums of the previous up limb are completed (a32, a48, r64, |
| C r80), converted and stored to the stack slots, and the products of the |
| C new limb are started; |
| C - earlier sums are reloaded from the slots into i00..i48 and combined with |
| C rlimb into hi64 (%o1), mi64 (%o2) and x (%o4); |
| C - the hi64/mi64/x values left by the previous pass are turned into a result |
| C limb (mi64 << 16) | (x & 0xffff), stored through %i4, and into the new |
| C cy = (mi64 >> 48) + hi64. |
| C %i3, which held v on entry, is reused here as a temporary. |
| C The "C nn" markers split the body into 14 groups of four instructions, |
| C presumably one group per cycle on UltraSPARC-1/2. |
| .align 16 |
| .Loop: |
| C 00 |
| srlx %o4, 16, %o5 C (x >> 16) |
| ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| C 01 |
| add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
| and %o4, xffff, %o5 C (x & 0xffff) |
| ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
| fdtox a00, a00 |
| C 02 |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| ldx [%i0+%i2], rlimb C read rp[i] |
| faddd p48, r48, a48 |
| C 03 |
| srlx %o2, 48, %o7 C (mi64 >> 48) |
| add i00, %g5, %g5 C i00+ now in g5 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| C 04 |
| sllx %o2, 16, %i3 C (mi64 << 16) |
| add %o7, %o1, cy C new cy |
| ldx [%sp+2223+0], i00 |
| fdtox a32, a32 |
| C 05 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| fxtod %f2, u00 |
| C 06 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| ldx [%sp+2223+16], i32 |
| fxtod %f4, u32 |
| C 07 |
| sllx i48, 32, %l6 C (i48 << 32) |
| or %i3, %o5, %o5 |
| ldx [%sp+2223+24], i48 |
| fdtox a48, a48 |
| C 08 |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] |
| fmuld u00, v00, p00 |
| C 09 |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] |
| fmuld u00, v16, p16 |
| C 10 |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| std a32, [%sp+2223+16] |
| fmuld u00, v32, p32 |
| C 11 |
| add %l6, %o2, %o2 C mi64- in %o2 |
| std a48, [%sp+2223+24] |
| faddd p00, r64, a00 |
| fmuld u32, v00, r32 |
| C 12 |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| stx %o5, [%i4+%i2] |
| faddd p16, r80, a16 |
| fmuld u00, v48, p48 |
| C 13 |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| addcc %i2, 8, %i2 |
| bnz,pt %icc, .Loop |
| fmuld u32, v16, r48 |
| C END MAIN LOOP |
| |
| C First wind-down stage, reached from the loop exit or from .L_four. It is |
| C the loop body without the up loads and without starting new products: the |
| C sums of the last up limb are completed and stored, one result limb is |
| C written, and hi64, mi64 and x are formed for the next stage. Falls through |
| C to .L_out_3. |
| .L_out_4: |
| srlx %o4, 16, %o5 C (x >> 16) |
| fmuld u32, v32, r64 C FIXME not urgent |
| faddd p32, r32, a32 |
| add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
| and %o4, xffff, %o5 C (x & 0xffff) |
| fdtox a00, a00 |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| ldx [%i0+%i2], rlimb C read rp[i] |
| faddd p48, r48, a48 |
| srlx %o2, 48, %o7 C (mi64 >> 48) |
| add i00, %g5, %g5 C i00+ now in g5 |
| fmuld u32, v48, r80 C FIXME not urgent |
| fdtox a16, a16 |
| sllx %o2, 16, %i3 C (mi64 << 16) |
| add %o7, %o1, cy C new cy |
| ldx [%sp+2223+0], i00 |
| fdtox a32, a32 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| ldx [%sp+2223+16], i32 |
| sllx i48, 32, %l6 C (i48 << 32) |
| or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff) |
| ldx [%sp+2223+24], i48 |
| fdtox a48, a48 |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| std a32, [%sp+2223+16] |
| add %l6, %o2, %o2 C mi64- in %o2 |
| std a48, [%sp+2223+24] |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| stx %o5, [%i4+%i2] |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| add %i2, 8, %i2 |
| C Second wind-down stage, reached by fall-through or from .L_three. The only |
| C floating-point work left is converting r64 and r80 of the last up limb, |
| C which are stored to slots 0 and 8. The last rp limb is read, one result |
| C limb is written, and hi64, mi64 and x are formed for the next stage. Falls |
| C through to .L_out_2. |
| .L_out_3: |
| srlx %o4, 16, %o5 C (x >> 16) |
| add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
| and %o4, xffff, %o5 C (x & 0xffff) |
| fdtox r64, a00 |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| ldx [%i0+%i2], rlimb C read rp[i] |
| srlx %o2, 48, %o7 C (mi64 >> 48) |
| add i00, %g5, %g5 C i00+ now in g5 |
| fdtox r80, a16 |
| sllx %o2, 16, %i3 C (mi64 << 16) |
| add %o7, %o1, cy C new cy |
| ldx [%sp+2223+0], i00 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| ldx [%sp+2223+16], i32 |
| sllx i48, 32, %l6 C (i48 << 32) |
| or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff) |
| ldx [%sp+2223+24], i48 |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| std a00, [%sp+2223+0] |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| std a16, [%sp+2223+8] |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| add %l6, %o2, %o2 C mi64- in %o2 |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| stx %o5, [%i4+%i2] |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| add %i2, 8, %i2 |
| C Third wind-down stage, reached by fall-through or from .L_two. Integer work |
| C only: one result limb is written, hi64, mi64 and x of the last limb are |
| C formed, and i00/i16 are reloaded from slots 0 and 8, which at this point |
| C hold the converted r64 and r80 of the last up limb. Falls through to |
| C .L_out_1. |
| .L_out_2: |
| srlx %o4, 16, %o5 C (x >> 16) |
| add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
| and %o4, xffff, %o5 C (x & 0xffff) |
| srlx rlimb, 32, %g4 C HI(rlimb) |
| and rlimb, xffffffff, %g5 C LO(rlimb) |
| srlx %o2, 48, %o7 C (mi64 >> 48) |
| add i00, %g5, %g5 C i00+ now in g5 |
| sllx %o2, 16, %i3 C (mi64 << 16) |
| add %o7, %o1, cy C new cy |
| ldx [%sp+2223+0], i00 |
| srlx i16, 48, %l4 C (i16 >> 48) |
| mov i16, %g2 |
| ldx [%sp+2223+8], i16 |
| srlx i48, 16, %l5 C (i48 >> 16) |
| add i32, %g4, %g4 C i32+ now in g4 |
| sllx i48, 32, %l6 C (i48 << 32) |
| or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff) |
| srlx %g4, 32, %o3 C (i32 >> 32) |
| add %l5, %l4, %o1 C hi64- in %o1 |
| sllx %g4, 16, %o2 C (i32 << 16) |
| add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
| sllx %o1, 48, %o3 C (hi64 << 48) |
| add %g2, %o2, %o2 C mi64- in %o2 |
| add %l6, %o2, %o2 C mi64- in %o2 |
| sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
| stx %o5, [%i4+%i2] |
| add cy, %g5, %o4 C x = prev(i00) + cy |
| add %i2, 8, %i2 |
| C Last wind-down stage, reached by fall-through or from .L_one. Completes and |
| C stores the final result limb, then forms the return value. |
| .L_out_1: |
| srlx %o4, 16, %o5 C (x >> 16) |
| add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
| and %o4, xffff, %o5 C (x & 0xffff) |
| srlx %o2, 48, %o7 C (mi64 >> 48) |
| sllx %o2, 16, %i3 C (mi64 << 16) |
| add %o7, %o1, cy C new cy |
| or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff) |
| stx %o5, [%i4+%i2] |
| |
| C Return value: cy + i00 + (i16 << 16), where i00 and i16 were last loaded |
| C from the slots holding the converted r64 and r80 of the last up limb. |
| sllx i00, 0, %g2 C shift by 0, i.e. a plain copy of i00 |
| add %g2, cy, cy |
| sllx i16, 16, %g3 |
| add %g3, cy, cy |
| |
| C The mov sits in the delay slot of return, so its %o0 is the register |
| C the caller sees as the result. |
| return %i7+8 |
| mov cy, %o0 |
| EPILOGUE(mpn_addmul_1) |