[ARM] Add Neon shift instructions vshl, vshr.

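A short usage sketch of the new assembler entry points (illustrative; the
same calls appear in the new cctest):

  __ vshl(NeonS8, q1, q0, 1);    // per-lane left shift by an immediate
  __ vshr(NeonU16, q1, q0, 9);   // logical (unsigned) right shift
  __ vshr(NeonS32, q1, q0, 17);  // arithmetic (signed) right shift
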
LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2629223005
Cr-Commit-Position: refs/heads/master@{#42610}
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index 6fef17b..52213b0 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -4412,6 +4412,48 @@
   emit(EncodeNeonBinOp(VMAX, dt, dst, src1, src2));
 }
 
+enum NeonShiftOp { VSHL, VSHR };
+
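+// Folds the shift amount and lane size into the imm6 field (bits 21-16).
+// For example (illustrative): with 32-bit lanes and a shift of 17, VSHL
+// encodes imm6 = 32 + 17 = 49, while VSHR encodes imm6 = 2 * 32 - 17 = 47.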
+static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonDataType dt,
+                               QwNeonRegister dst, QwNeonRegister src,
+                               int shift) {
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  int size_in_bits = kBitsPerByte << NeonSz(dt);
+  int op_encoding = 0;
+  int imm6 = 0;
+  if (op == VSHL) {
+    DCHECK(shift >= 0 && size_in_bits > shift);
+    imm6 = size_in_bits + shift;
+    op_encoding = 0x5 * B8;
+  } else {
+    DCHECK_EQ(VSHR, op);
+    DCHECK(shift > 0 && size_in_bits >= shift);
+    imm6 = 2 * size_in_bits - shift;
+    op_encoding = NeonU(dt) * B24;
+  }
+  return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | B6 | m * B5 | B4 |
+         vm | op_encoding;
+}
+
+void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
+                     int shift) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vshl(Qm, bits) SIMD shift left immediate.
+  // Instruction details available in ARM DDI 0406C.b, A8-1046.
+  emit(EncodeNeonShiftOp(VSHL, dt, dst, src, shift));
+}
+
+void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
+                     int shift) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vshr(Qm, bits) SIMD shift right immediate.
+  // Instruction details available in ARM DDI 0406C.b, A8-1052.
+  emit(EncodeNeonShiftOp(VSHR, dt, dst, src, shift));
+}
+
 static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
                                   QwNeonRegister src) {
   int vd, d;
diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
index 6390300..d4f9402 100644
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1387,6 +1387,8 @@
   void vmax(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
   void vmax(NeonDataType dt, QwNeonRegister dst,
             QwNeonRegister src1, QwNeonRegister src2);
+  void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
+  void vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
   // vrecpe and vrsqrte only support floating point lanes.
   void vrecpe(QwNeonRegister dst, QwNeonRegister src);
   void vrsqrte(QwNeonRegister dst, QwNeonRegister src);
diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc
index db32fc9..7d61845 100644
--- a/src/arm/disasm-arm.cc
+++ b/src/arm/disasm-arm.cc
@@ -1973,6 +1973,24 @@
         out_buffer_pos_ +=
             SNPrintF(out_buffer_ + out_buffer_pos_, "vext.8 q%d, q%d, q%d, #%d",
                      Vd, Vn, Vm, imm4);
+      } else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
+        // vshl.i<size> Qd, Qm, shift
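+        // imm6 (bits 21-16) holds size + shift, so the lane size is the
+        // largest power of two not exceeding imm6.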
+        int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+        int shift = instr->Bits(21, 16) - size;
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        out_buffer_pos_ +=
+            SNPrintF(out_buffer_ + out_buffer_pos_, "vshl.i%d q%d, q%d, #%d",
+                     size, Vd, Vm, shift);
+      } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+        // vshr.s<size> Qd, Qm, shift
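+        // For right shifts, imm6 (bits 21-16) holds 2 * size - shift.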
+        int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+        int shift = 2 * size - instr->Bits(21, 16);
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        out_buffer_pos_ +=
+            SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.s%d q%d, q%d, #%d",
+                     size, Vd, Vm, shift);
       } else {
         Unknown(instr);
       }
@@ -2162,15 +2180,24 @@
             Unknown(instr);
           }
         } else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
+          // vrecpe/vrsqrte.f32 Qd, Qm.
           int Vd = instr->VFPDRegValue(kSimd128Precision);
           int Vm = instr->VFPMRegValue(kSimd128Precision);
           const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
-          // vrecpe/vrsqrte.f32 Qd, Qm.
           out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                       "%s.f32 q%d, q%d", op, Vd, Vm);
         } else {
           Unknown(instr);
         }
+      } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+        // vshr.u<size> Qd, Qm, shift
+        int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+        int shift = 2 * size - instr->Bits(21, 16);
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        out_buffer_pos_ +=
+            SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.u%d q%d, q%d, #%d",
+                     size, Vd, Vm, shift);
       } else {
         Unknown(instr);
       }
diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
index e22a8b9..ad7c4da 100644
--- a/src/arm/simulator-arm.cc
+++ b/src/arm/simulator-arm.cc
@@ -573,7 +573,6 @@
   return start_page == end_page;
 }
 
-
 void Simulator::set_last_debugger_input(char* input) {
   DeleteArray(last_debugger_input_);
   last_debugger_input_ = input;
@@ -4355,6 +4354,84 @@
           dst[i] = src2[i - boundary];
         }
         set_q_register(Vd, dst);
+      } else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
+        // vshl.i<size> Qd, Qm, shift
+        int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+        int shift = instr->Bits(21, 16) - size;
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
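+        // size / 16 maps lane sizes 8, 16, 32 to Neon8, Neon16, Neon32.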
+        NeonSize ns = static_cast<NeonSize>(size / 16);
+        switch (ns) {
+          case Neon8: {
+            uint8_t src[16];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 16; i++) {
+              src[i] <<= shift;
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          case Neon16: {
+            uint16_t src[8];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 8; i++) {
+              src[i] <<= shift;
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          case Neon32: {
+            uint32_t src[4];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 4; i++) {
+              src[i] <<= shift;
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          default:
+            UNREACHABLE();
+            break;
+        }
+      } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+        // vshr.s<size> Qd, Qm, shift
+        int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+        int shift = 2 * size - instr->Bits(21, 16);
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        NeonSize ns = static_cast<NeonSize>(size / 16);
+        switch (ns) {
+          case Neon8: {
+            int8_t src[16];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 16; i++) {
+              src[i] = ArithmeticShiftRight(src[i], shift);
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          case Neon16: {
+            int16_t src[8];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 8; i++) {
+              src[i] = ArithmeticShiftRight(src[i], shift);
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          case Neon32: {
+            int32_t src[4];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 4; i++) {
+              src[i] = ArithmeticShiftRight(src[i], shift);
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          default:
+            UNREACHABLE();
+            break;
+        }
       } else {
         UNIMPLEMENTED();
       }
@@ -4993,6 +5070,45 @@
         } else {
           UNIMPLEMENTED();
         }
+      } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+        // vshr.u<size> Qd, Qm, shift
+        int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+        int shift = 2 * size - instr->Bits(21, 16);
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        NeonSize ns = static_cast<NeonSize>(size / 16);
+        switch (ns) {
+          case Neon8: {
+            uint8_t src[16];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 16; i++) {
+              src[i] >>= shift;
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          case Neon16: {
+            uint16_t src[8];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 8; i++) {
+              src[i] >>= shift;
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          case Neon32: {
+            uint32_t src[4];
+            get_q_register(Vm, src);
+            for (int i = 0; i < 4; i++) {
+              src[i] >>= shift;
+            }
+            set_q_register(Vd, src);
+            break;
+          }
+          default:
+            UNREACHABLE();
+            break;
+        }
       } else {
         UNIMPLEMENTED();
       }
diff --git a/src/utils.h b/src/utils.h
index 0ea1de1..6d0d3c8 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -137,15 +137,20 @@
   return nibble + msb4[x];
 }
 
-
-// The C++ standard leaves the semantics of '>>' undefined for
-// negative signed operands. Most implementations do the right thing,
-// though.
-inline int ArithmeticShiftRight(int x, int s) {
-  return x >> s;
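+// Shifts right while replicating the sign bit, regardless of how the
+// platform's native '>>' treats negative operands. For example
+// (illustrative), ArithmeticShiftRight(int8_t{-2}, 1) is -1, whereas a
+// logical shift of the same bit pattern would give 127.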
+template <typename T>
+static T ArithmeticShiftRight(T x, int shift) {
+  DCHECK_LE(0, shift);
+  if (x < 0) {
+    // Right shift of signed values is implementation defined. Simulate a
+    // true arithmetic right shift by adding leading sign bits.
+    using UnsignedT = typename std::make_unsigned<T>::type;
+    UnsignedT mask = ~(static_cast<UnsignedT>(~0) >> shift);
+    return (static_cast<UnsignedT>(x) >> shift) | mask;
+  } else {
+    return x >> shift;
+  }
 }
 
-
 template <typename T>
 int Compare(const T& a, const T& b) {
   if (a == b)
diff --git a/test/cctest/test-assembler-arm.cc b/test/cctest/test-assembler-arm.cc
index 680b123..ac6b620 100644
--- a/test/cctest/test-assembler-arm.cc
+++ b/test/cctest/test-assembler-arm.cc
@@ -1301,6 +1301,8 @@
     uint32_t vadd8[4], vadd16[4], vadd32[4];
     uint32_t vsub8[4], vsub16[4], vsub32[4];
     uint32_t vmul8[4], vmul16[4], vmul32[4];
+    uint32_t vshl8[4], vshl16[4], vshl32[4];
+    uint32_t vshr_s8[4], vshr_u16[4], vshr_s32[4];
     uint32_t vceq[4], vceqf[4], vcgef[4], vcgtf[4];
     uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
     uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];
@@ -1671,6 +1673,32 @@
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul32))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
+    // vshl.
+    __ mov(r4, Operand(0x55));
+    __ vdup(Neon8, q0, r4);
+    __ vshl(NeonS8, q1, q0, 1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vshl(NeonU16, q1, q0, 9);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vshl(NeonS32, q1, q0, 17);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl32))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+    // vshr.s, vshr.u.
+    __ mov(r4, Operand(0x80));
+    __ vdup(Neon8, q0, r4);
+    __ vshr(NeonS8, q1, q0, 1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vshr(NeonU16, q1, q0, 9);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_u16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vshr(NeonS32, q1, q0, 17);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s32))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
     // vceq.
     __ mov(r4, Operand(0x03));
     __ vdup(Neon8, q0, r4);
@@ -1926,6 +1954,12 @@
     CHECK_EQ_SPLAT(vmul8, 0x04040404u);
     CHECK_EQ_SPLAT(vmul16, 0x00040004u);
     CHECK_EQ_SPLAT(vmul32, 0x00000004u);
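+    // 0x55 splatted in every byte, shifted left by 1/9/17 within
+    // 8/16/32-bit lanes.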
+    CHECK_EQ_SPLAT(vshl8, 0xaaaaaaaau);
+    CHECK_EQ_SPLAT(vshl16, 0xaa00aa00u);
+    CHECK_EQ_SPLAT(vshl32, 0xaaaa0000u);
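+    // 0x80 splatted in every byte; signed right shifts replicate the sign
+    // bit, unsigned shifts do not.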
+    CHECK_EQ_SPLAT(vshr_s8, 0xc0c0c0c0u);
+    CHECK_EQ_SPLAT(vshr_u16, 0x00400040u);
+    CHECK_EQ_SPLAT(vshr_s32, 0xffffc040u);
     CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
     // [0, 3, 0, 3, ...] >= [3, 3, 3, 3, ...]
     CHECK_EQ_SPLAT(vcge_s8, 0x00ff00ffu);
diff --git a/test/cctest/test-disasm-arm.cc b/test/cctest/test-disasm-arm.cc
index 3ba6bee..e02dabe 100644
--- a/test/cctest/test-disasm-arm.cc
+++ b/test/cctest/test-disasm-arm.cc
@@ -1063,6 +1063,18 @@
               "f2142970       vmul.i16 q1, q2, q8");
       COMPARE(vmul(Neon32, q15, q0, q8),
               "f260e970       vmul.i32 q15, q0, q8");
+      COMPARE(vshl(NeonS8, q15, q0, 6),
+              "f2cee550       vshl.i8 q15, q0, #6");
+      COMPARE(vshl(NeonU16, q15, q0, 10),
+              "f2dae550       vshl.i16 q15, q0, #10");
+      COMPARE(vshl(NeonS32, q15, q0, 17),
+              "f2f1e550       vshl.i32 q15, q0, #17");
+      COMPARE(vshr(NeonS8, q15, q0, 6),
+              "f2cae050       vshr.s8 q15, q0, #6");
+      COMPARE(vshr(NeonU16, q15, q0, 10),
+              "f3d6e050       vshr.u16 q15, q0, #10");
+      COMPARE(vshr(NeonS32, q15, q0, 17),
+              "f2efe050       vshr.s32 q15, q0, #17");
       COMPARE(vrecpe(q15, q0),
               "f3fbe540       vrecpe.f32 q15, q0");
       COMPARE(vrecps(q15, q0, q8),