| dnl x86-64 mpn_rshift optimized for Pentium 4. |
| |
| dnl Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc. |
| dnl |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or |
| dnl modify it under the terms of the GNU Lesser General Public License as |
| dnl published by the Free Software Foundation; either version 3 of the |
| dnl License, or (at your option) any later version. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, |
| dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
| dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| dnl Lesser General Public License for more details. |
| dnl |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C cycles/limb |
| C K8,K9: 2.5 |
| C K10: ? |
| C P4: 3.29 |
| C P6-15 (Core2): 2.1 (fluctuates, presumably cache related) |
| C P6-28 (Atom): 14.3 |
| |
| C INPUT PARAMETERS |
| define(`rp',`%rdi') |
| define(`up',`%rsi') |
| define(`n',`%rdx') |
| define(`cnt',`%cl') |
| |
| ASM_START() |
| TEXT |
| ALIGN(32) |
| PROLOGUE(mpn_rshift) |
| mov (up), %rax |
| movd %ecx, %mm4 |
| neg %ecx C put lsh count in cl |
| and $63, %ecx |
| movd %ecx, %mm5 |
| |
| lea -8(up,n,8), up |
| lea -8(rp,n,8), rp |
| lea 1(n), %r8d |
| neg n |
| |
| shl %cl, %rax C function return value |
| |
| and $3, %r8d |
| je L(rol) C jump for n = 3, 7, 11, ... |
| |
| dec %r8d |
| jne L(1) |
| C n = 4, 8, 12, ... |
| movq 8(up,n,8), %mm2 |
| psrlq %mm4, %mm2 |
| movq 16(up,n,8), %mm0 |
| psllq %mm5, %mm0 |
| por %mm0, %mm2 |
| movq %mm2, 8(rp,n,8) |
| inc n |
| jmp L(rol) |
| |
| L(1): dec %r8d |
| je L(1x) C jump for n = 1, 5, 9, 13, ... |
| C n = 2, 6, 10, 16, ... |
| movq 8(up,n,8), %mm2 |
| psrlq %mm4, %mm2 |
| movq 16(up,n,8), %mm0 |
| psllq %mm5, %mm0 |
| por %mm0, %mm2 |
| movq %mm2, 8(rp,n,8) |
| inc n |
| L(1x): |
| cmp $-1, n |
| je L(ast) |
| movq 8(up,n,8), %mm2 |
| psrlq %mm4, %mm2 |
| movq 16(up,n,8), %mm3 |
| psrlq %mm4, %mm3 |
| movq 16(up,n,8), %mm0 |
| movq 24(up,n,8), %mm1 |
| psllq %mm5, %mm0 |
| por %mm0, %mm2 |
| psllq %mm5, %mm1 |
| por %mm1, %mm3 |
| movq %mm2, 8(rp,n,8) |
| movq %mm3, 16(rp,n,8) |
| add $2, n |
| |
| L(rol): movq 8(up,n,8), %mm2 |
| psrlq %mm4, %mm2 |
| movq 16(up,n,8), %mm3 |
| psrlq %mm4, %mm3 |
| |
| add $4, n C 4 |
| jb L(end) C 2 |
| ALIGN(32) |
| L(top): |
| C finish stuff from lsh block |
| movq -16(up,n,8), %mm0 |
| movq -8(up,n,8), %mm1 |
| psllq %mm5, %mm0 |
| por %mm0, %mm2 |
| psllq %mm5, %mm1 |
| movq (up,n,8), %mm0 |
| por %mm1, %mm3 |
| movq 8(up,n,8), %mm1 |
| movq %mm2, -24(rp,n,8) |
| movq %mm3, -16(rp,n,8) |
| C start two new rsh |
| psllq %mm5, %mm0 |
| psllq %mm5, %mm1 |
| |
| C finish stuff from rsh block |
| movq -8(up,n,8), %mm2 |
| movq (up,n,8), %mm3 |
| psrlq %mm4, %mm2 |
| por %mm2, %mm0 |
| psrlq %mm4, %mm3 |
| movq 8(up,n,8), %mm2 |
| por %mm3, %mm1 |
| movq 16(up,n,8), %mm3 |
| movq %mm0, -8(rp,n,8) |
| movq %mm1, (rp,n,8) |
| C start two new lsh |
| add $4, n |
| psrlq %mm4, %mm2 |
| psrlq %mm4, %mm3 |
| |
| jae L(top) C 2 |
| L(end): |
| movq -16(up,n,8), %mm0 |
| psllq %mm5, %mm0 |
| por %mm0, %mm2 |
| movq -8(up,n,8), %mm1 |
| psllq %mm5, %mm1 |
| por %mm1, %mm3 |
| movq %mm2, -24(rp,n,8) |
| movq %mm3, -16(rp,n,8) |
| |
| L(ast): movq (up), %mm2 |
| psrlq %mm4, %mm2 |
| movq %mm2, (rp) |
| emms |
| ret |
| EPILOGUE() |