[ARM] Add NEON instructions for implementing SIMD.
- Adds vabs, vneg, vmul, vext, vzip, vrev instructions.
- Adds Swizzle function to macro assembler.
- Simplifies if-else logic in the disassembler and simulator for Neon special conditions.
- Refactors the Neon assembler and macro-assembler tests.
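
A sketch of how the new assembler methods can be used (illustrative,
not code from this CL):

  __ vmul(Neon16, q0, q1, q2);  // q0.i16[i] = q1.i16[i] * q2.i16[i]
  __ vext(q0, q1, q2, 4);       // q0 = bytes 4-15 of q1, then bytes 0-3 of q2
  __ vrev64(Neon32, q0, q1);    // swap adjacent 32-bit lanes of q1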

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2579913002
Cr-Commit-Position: refs/heads/master@{#41781}
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index aa6be21..4a76b09 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -4074,6 +4074,50 @@
   emit(EncodeNeonVCVT(U32, dst, F32, src));
 }
 
+// op is instr->Bits(11, 7).
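+// Currently op is 0x6 (vabs) or 0x7 (vneg); is_float drives the F bit.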
+static Instr EncodeNeonUnaryOp(int op, bool is_float, NeonSize size,
+                               const QwNeonRegister dst,
+                               const QwNeonRegister src) {
+  DCHECK_IMPLIES(is_float, size == Neon32);
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  int F = is_float ? 1 : 0;
+  return 0x1E7U * B23 | d * B22 | 0x3 * B20 | size * B18 | B16 | vd * B12 |
+         F * B10 | B8 | op * B7 | B6 | m * B5 | vm;
+}
+
+void Assembler::vabs(const QwNeonRegister dst, const QwNeonRegister src) {
+  // Qd = vabs.f32(Qm) SIMD floating point absolute value.
+  // Instruction details available in ARM DDI 0406C.b, A8-824.
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonUnaryOp(0x6, true, Neon32, dst, src));
+}
+
+void Assembler::vabs(NeonSize size, const QwNeonRegister dst,
+                     const QwNeonRegister src) {
+  // Qd = vabs.s<size>(Qm) SIMD integer absolute value.
+  // Instruction details available in ARM DDI 0406C.b, A8-824.
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonUnaryOp(0x6, false, size, dst, src));
+}
+
+void Assembler::vneg(const QwNeonRegister dst, const QwNeonRegister src) {
+  // Qd = vneg.f32(Qm) SIMD floating point negate.
+  // Instruction details available in ARM DDI 0406C.b, A8-968.
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonUnaryOp(0x7, true, Neon32, dst, src));
+}
+
+void Assembler::vneg(NeonSize size, const QwNeonRegister dst,
+                     const QwNeonRegister src) {
+  // Qd = vneg.s<size>(Qm) SIMD integer negate.
+  // Instruction details available in ARM DDI 0406C.b, A8-968.
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonUnaryOp(0x7, false, size, dst, src));
+}
+
 void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
                      DwVfpRegister src2) {
   // Dd = veor(Dn, Dm) 64 bit integer exclusive OR.
@@ -4166,6 +4210,37 @@
        n * B7 | B6 | m * B5 | vm);
 }
 
+void Assembler::vmul(QwNeonRegister dst, const QwNeonRegister src1,
+                     const QwNeonRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vmul(Qn, Qm) SIMD floating point multiply.
+  // Instruction details available in ARM DDI 0406C.b, A8-958.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vn, n;
+  src1.split_code(&vn, &n);
+  int vm, m;
+  src2.split_code(&vm, &m);
+  emit(0x1E6U * B23 | d * B22 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 | B6 |
+       m * B5 | B4 | vm);
+}
+
+void Assembler::vmul(NeonSize size, QwNeonRegister dst,
+                     const QwNeonRegister src1, const QwNeonRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vmul(Qn, Qm) SIMD integer multiply.
+  // Instruction details available in ARM DDI 0406C.b, A8-960.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vn, n;
+  src1.split_code(&vn, &n);
+  int vm, m;
+  src2.split_code(&vm, &m);
+  int sz = static_cast<int>(size);
+  emit(0x1E4U * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x9 * B8 |
+       n * B7 | B6 | m * B5 | B4 | vm);
+}
+
 void Assembler::vtst(NeonSize size, QwNeonRegister dst,
                      const QwNeonRegister src1, const QwNeonRegister src2) {
   DCHECK(IsEnabled(NEON));
@@ -4185,7 +4260,7 @@
 void Assembler::vceq(NeonSize size, QwNeonRegister dst,
                      const QwNeonRegister src1, const QwNeonRegister src2) {
   DCHECK(IsEnabled(NEON));
-  // Qd = vceq(Qn, Qm) SIMD integer compare equal.
+  // Qd = vceq(Qn, Qm) SIMD bitwise compare equal.
   // Instruction details available in ARM DDI 0406C.b, A8-844.
   int vd, d;
   dst.split_code(&vd, &d);
@@ -4214,6 +4289,70 @@
        n * B7 | B6 | m * B5 | B4 | vm);
 }
 
+void Assembler::vext(QwNeonRegister dst, const QwNeonRegister src1,
+                     const QwNeonRegister src2, int bytes) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vext(Qn, Qm, #bytes) SIMD byte extract.
+  // Instruction details available in ARM DDI 0406C.b, A8-890.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vn, n;
+  src1.split_code(&vn, &n);
+  int vm, m;
+  src2.split_code(&vm, &m);
+  DCHECK_GT(16, bytes);
+  emit(0x1E5U * B23 | d * B22 | 0x3 * B20 | vn * B16 | vd * B12 | bytes * B8 |
+       n * B7 | B6 | m * B5 | vm);
+}
+
+void Assembler::vzip(NeonSize size, QwNeonRegister dst,
+                     const QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  // vzip.<size>(Qd, Qm) SIMD zip (interleave).
+  // Instruction details available in ARM DDI 0406C.b, A8-1102.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  int sz = static_cast<int>(size);
+  emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | sz * B18 | 2 * B16 | vd * B12 |
+       0x3 * B7 | B6 | m * B5 | vm);
+}
+
+static Instr EncodeNeonVREV(NeonSize op_size, NeonSize size,
+                            const QwNeonRegister dst,
+                            const QwNeonRegister src) {
+  // Qd = vrev<op_size>.<size>(Qm) SIMD element reverse.
+  // Instruction details available in ARM DDI 0406C.b, A8-1028.
+  DCHECK_GT(op_size, static_cast<int>(size));
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  int sz = static_cast<int>(size);
+  int op = static_cast<int>(Neon64) - static_cast<int>(op_size);
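+  // op is 0 for vrev64, 1 for vrev32, 2 for vrev16.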
+  return 0x1E7U * B23 | d * B22 | 0x3 * B20 | sz * B18 | vd * B12 | op * B7 |
+         B6 | m * B5 | vm;
+}
+
+void Assembler::vrev16(NeonSize size, const QwNeonRegister dst,
+                       const QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonVREV(Neon16, size, dst, src));
+}
+
+void Assembler::vrev32(NeonSize size, const QwNeonRegister dst,
+                       const QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonVREV(Neon32, size, dst, src));
+}
+
+void Assembler::vrev64(NeonSize size, const QwNeonRegister dst,
+                       const QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonVREV(Neon64, size, dst, src));
+}
+
 // Encode NEON vtbl / vtbx instruction.
 static Instr EncodeNeonVTB(const DwVfpRegister dst, const NeonListOperand& list,
                            const DwVfpRegister index, bool vtbx) {
diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
index 235d80b..7bb92b4 100644
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1362,6 +1362,10 @@
   void vcvt_s32_f32(const QwNeonRegister dst, const QwNeonRegister src);
   void vcvt_u32_f32(const QwNeonRegister dst, const QwNeonRegister src);
 
+  void vabs(const QwNeonRegister dst, const QwNeonRegister src);
+  void vabs(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
+  void vneg(const QwNeonRegister dst, const QwNeonRegister src);
+  void vneg(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
   void veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
   void veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
   void vadd(const QwNeonRegister dst, const QwNeonRegister src1,
@@ -1372,12 +1376,25 @@
             const QwNeonRegister src2);
   void vsub(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
             const QwNeonRegister src2);
+  void vmul(const QwNeonRegister dst, const QwNeonRegister src1,
+            const QwNeonRegister src2);
+  void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
+            const QwNeonRegister src2);
   void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
             const QwNeonRegister src2);
   void vceq(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
             const QwNeonRegister src2);
   void vbsl(const QwNeonRegister dst, const QwNeonRegister src1,
             const QwNeonRegister src2);
+  void vext(const QwNeonRegister dst, const QwNeonRegister src1,
+            const QwNeonRegister src2, int bytes);
+  void vzip(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
+  void vrev16(NeonSize size, const QwNeonRegister dst,
+              const QwNeonRegister src);
+  void vrev32(NeonSize size, const QwNeonRegister dst,
+              const QwNeonRegister src);
+  void vrev64(NeonSize size, const QwNeonRegister dst,
+              const QwNeonRegister src);
   void vtbl(const DwVfpRegister dst, const NeonListOperand& list,
             const DwVfpRegister index);
   void vtbx(const DwVfpRegister dst, const NeonListOperand& list,
diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc
index 7a42386..ef99d53 100644
--- a/src/arm/disasm-arm.cc
+++ b/src/arm/disasm-arm.cc
@@ -1883,6 +1883,15 @@
         // vadd/vsub.f32 Qd, Qm, Qn.
         out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                     "%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
+      } else if (instr->Bits(11, 8) == 0x9 && instr->Bit(6) == 1 &&
+                 instr->Bit(4) == 1) {
+        int size = kBitsPerByte * (1 << instr->Bits(21, 20));
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        // vmul.i<size> Qd, Qn, Qm.
+        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                    "vmul.i%d q%d, q%d, q%d", size, Vd, Vn, Vm);
       } else {
         Unknown(instr);
       }
@@ -1897,6 +1906,15 @@
         int imm3 = instr->Bits(21, 19);
         out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                     "vmovl.s%d q%d, d%d", imm3*8, Vd, Vm);
+      } else if (instr->Bits(21, 20) == 3 && instr->Bit(4) == 0) {
+        // vext.8 Qd, Qn, Qm, #imm4
+        int imm4 = instr->Bits(11, 8);
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        out_buffer_pos_ +=
+            SNPrintF(out_buffer_ + out_buffer_pos_, "vext.8 q%d, q%d, q%d, #%d",
+                     Vd, Vn, Vm, imm4);
       } else {
         Unknown(instr);
       }
@@ -1941,6 +1959,14 @@
           out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                       "veor q%d, q%d, q%d", Vd, Vn, Vm);
         }
+      } else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
+                 instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+        // vmul.f32 Qd, Qn, Qm
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                    "vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
       } else {
         Unknown(instr);
       }
@@ -1955,68 +1981,102 @@
         int imm3 = instr->Bits(21, 19);
         out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                     "vmovl.u%d q%d, d%d", imm3*8, Vd, Vm);
-      } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
-                 instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
-        int Vd = instr->VFPDRegValue(kSimd128Precision);
-        int Vm = instr->VFPMRegValue(kSimd128Precision);
-        out_buffer_pos_ +=
-            SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
-      } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
-                 instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
+      } else if (instr->Opc1Value() == 7 && instr->Bits(21, 20) == 0x3 &&
                  instr->Bit(4) == 0) {
-        int Vd = instr->VFPDRegValue(kSimd128Precision);
-        int Vm = instr->VFPMRegValue(kSimd128Precision);
-        const char* suffix = nullptr;
-        int op = instr->Bits(8, 7);
-        switch (op) {
-          case 0:
-            suffix = "f32.s32";
-            break;
-          case 1:
-            suffix = "f32.u32";
-            break;
-          case 2:
-            suffix = "s32.f32";
-            break;
-          case 3:
-            suffix = "u32.f32";
-            break;
-        }
-        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
-                                    "vcvt.%s q%d, q%d", suffix, Vd, Vm);
-      } else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
-                 (instr->Bit(4) == 0)) {
-        if (instr->Bit(6) == 0) {
-          int Vd = instr->VFPDRegValue(kDoublePrecision);
+        if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
+          if (instr->Bit(6) == 0) {
+            int Vd = instr->VFPDRegValue(kDoublePrecision);
+            int Vm = instr->VFPMRegValue(kDoublePrecision);
+            out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                        "vswp d%d, d%d", Vd, Vm);
+          } else {
+            int Vd = instr->VFPDRegValue(kSimd128Precision);
+            int Vm = instr->VFPMRegValue(kSimd128Precision);
+            out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                        "vswp q%d, q%d", Vd, Vm);
+          }
+        } else if (instr->Bits(11, 7) == 0x18) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
           int Vm = instr->VFPMRegValue(kDoublePrecision);
-          out_buffer_pos_ +=
-              SNPrintF(out_buffer_ + out_buffer_pos_, "vswp d%d, d%d", Vd, Vm);
-        } else {
+          int index = instr->Bit(19);
+          out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                      "vdup q%d, d%d[%d]", Vd, Vm, index);
+        } else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
           int Vd = instr->VFPDRegValue(kSimd128Precision);
           int Vm = instr->VFPMRegValue(kSimd128Precision);
           out_buffer_pos_ +=
-              SNPrintF(out_buffer_ + out_buffer_pos_, "vswp q%d, q%d", Vd, Vm);
+              SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
+        } else if (instr->Bits(19, 16) == 0xB && instr->Bits(11, 9) == 0x3 &&
+                   instr->Bit(6) == 1) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          const char* suffix = nullptr;
+          int op = instr->Bits(8, 7);
+          switch (op) {
+            case 0:
+              suffix = "f32.s32";
+              break;
+            case 1:
+              suffix = "f32.u32";
+              break;
+            case 2:
+              suffix = "s32.f32";
+              break;
+            case 3:
+              suffix = "u32.f32";
+              break;
+          }
+          out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                      "vcvt.%s q%d, q%d", suffix, Vd, Vm);
+        } else if (instr->Bits(11, 10) == 0x2) {
+          int Vd = instr->VFPDRegValue(kDoublePrecision);
+          int Vn = instr->VFPNRegValue(kDoublePrecision);
+          int Vm = instr->VFPMRegValue(kDoublePrecision);
+          int len = instr->Bits(9, 8);
+          NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
+          out_buffer_pos_ +=
+              SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
+                       instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
+          FormatNeonList(Vn, list.type());
+          Print(", ");
+          PrintDRegister(Vm);
+        } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x7) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          int size = kBitsPerByte * (1 << instr->Bits(19, 18));
+          // vzip.<size> Qd, Qm.
+          out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                      "vzip.%d q%d, q%d", size, Vd, Vm);
+        } else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          int size = kBitsPerByte * (1 << instr->Bits(19, 18));
+          int op = kBitsPerByte
+                   << (static_cast<int>(Neon64) - instr->Bits(8, 7));
+          // vrev<op>.<size> Qd, Qm.
+          out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                      "vrev%d.%d q%d, q%d", op, size, Vd, Vm);
+        } else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          int size = kBitsPerByte * (1 << instr->Bits(19, 18));
+          const char* type = instr->Bit(10) != 0 ? "f" : "s";
+          if (instr->Bits(9, 6) == 0xd) {
+            // vabs<type>.<size> Qd, Qm.
+            out_buffer_pos_ +=
+                SNPrintF(out_buffer_ + out_buffer_pos_, "vabs.%s%d q%d, q%d",
+                         type, size, Vd, Vm);
+          } else if (instr->Bits(9, 6) == 0xf) {
+            // vneg<type>.<size> Qd, Qm.
+            out_buffer_pos_ +=
+                SNPrintF(out_buffer_ + out_buffer_pos_, "vneg.%s%d q%d, q%d",
+                         type, size, Vd, Vm);
+          } else {
+            Unknown(instr);
+          }
+        } else {
+          Unknown(instr);
         }
-      } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
-                 instr->Bit(4) == 0x0) {
-        int Vd = instr->VFPDRegValue(kSimd128Precision);
-        int Vm = instr->VFPMRegValue(kDoublePrecision);
-        int index = instr->Bit(19);
-        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
-                                    "vdup q%d, d%d[%d]", Vd, Vm, index);
-      } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
-                 instr->Bit(4) == 0x0) {
-        int Vd = instr->VFPDRegValue(kDoublePrecision);
-        int Vn = instr->VFPNRegValue(kDoublePrecision);
-        int Vm = instr->VFPMRegValue(kDoublePrecision);
-        int len = instr->Bits(9, 8);
-        NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
-        out_buffer_pos_ +=
-            SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
-                     instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
-        FormatNeonList(Vn, list.type());
-        Print(", ");
-        PrintDRegister(Vm);
       } else {
         Unknown(instr);
       }
diff --git a/src/arm/macro-assembler-arm.cc b/src/arm/macro-assembler-arm.cc
index 8363d5e..5f4e492 100644
--- a/src/arm/macro-assembler-arm.cc
+++ b/src/arm/macro-assembler-arm.cc
@@ -1185,6 +1185,64 @@
   VmovExtended(s_code, src_lane.code(), scratch);
 }
 
+void MacroAssembler::Swizzle(QwNeonRegister dst, QwNeonRegister src,
+                             Register scratch, NeonSize size, uint32_t lanes) {
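+  // 'lanes' packs four 4-bit lane indices; the i-th nibble (from the least
+  // significant) selects the source lane for destination lane i, so 0x3210
+  // is the identity permutation.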
+  // TODO(bbudge) Handle Int16x8, Int8x16 vectors.
+  DCHECK_EQ(Neon32, size);
+  DCHECK_IMPLIES(size == Neon32, lanes < 0xFFFFu);
+  if (size == Neon32) {
+    switch (lanes) {
+      // TODO(bbudge) Handle more special cases.
+      case 0x3210:  // Identity.
+        Move(dst, src);
+        return;
+      case 0x1032:  // Swap top and bottom.
+        vext(dst, src, src, 8);
+        return;
+      case 0x2103:  // Rotation.
+        vext(dst, src, src, 12);
+        return;
+      case 0x0321:  // Rotation.
+        vext(dst, src, src, 4);
+        return;
+      case 0x0000:  // Equivalent to vdup.
+      case 0x1111:
+      case 0x2222:
+      case 0x3333: {
+        int lane_code = src.code() * 4 + (lanes & 0xF);
+        if (lane_code >= SwVfpRegister::kMaxNumRegisters) {
+          // TODO(bbudge) use vdup (vdup.32 dst, D<src>[lane]) once implemented.
+          int temp_code = kScratchDoubleReg.code() * 2;
+          VmovExtended(temp_code, lane_code, scratch);
+          lane_code = temp_code;
+        }
+        vdup(dst, SwVfpRegister::from_code(lane_code));
+        return;
+      }
+      case 0x2301:  // Swap lanes 0, 1 and lanes 2, 3.
+        vrev64(Neon32, dst, src);
+        return;
+      default:  // Handle all other cases with vmovs.
+        int src_code = src.code() * 4;
+        int dst_code = dst.code() * 4;
+        bool in_place = src.is(dst);
+        if (in_place) {
+          vmov(kScratchQuadReg, src);
+          src_code = kScratchQuadReg.code() * 4;
+        }
+        for (int i = 0; i < 4; i++) {
+          int lane = (lanes >> (i * 4) & 0xF);
+          VmovExtended(dst_code + i, src_code + lane, scratch);
+        }
+        if (in_place) {
+          // Restore the zero register, which overlaps kScratchQuadReg.
+          veor(kDoubleRegZero, kDoubleRegZero, kDoubleRegZero);
+        }
+        return;
+    }
+  }
+}
+
 void MacroAssembler::LslPair(Register dst_low, Register dst_high,
                              Register src_low, Register src_high,
                              Register scratch, Register shift) {
diff --git a/src/arm/macro-assembler-arm.h b/src/arm/macro-assembler-arm.h
index 5a0a2b6..1bee1ed 100644
--- a/src/arm/macro-assembler-arm.h
+++ b/src/arm/macro-assembler-arm.h
@@ -568,6 +568,8 @@
                    NeonDataType dt, int lane);
   void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
                    SwVfpRegister src_lane, Register scratch, int lane);
+  void Swizzle(QwNeonRegister dst, QwNeonRegister src, Register scratch,
+               NeonSize size, uint32_t lanes);
 
   void LslPair(Register dst_low, Register dst_high, Register src_low,
                Register src_high, Register scratch, Register shift);
diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
index 1265483..8872010 100644
--- a/src/arm/simulator-arm.cc
+++ b/src/arm/simulator-arm.cc
@@ -3335,7 +3335,7 @@
             break;
           }
           case Neon16: {
-            // Perform pairwise ops instead of casting to uint16_t.
+            // Splat the 16-bit value across each 32-bit word.
             rt_value &= 0xFFFFu;
             uint32_t rt_rt = (rt_value << 16) | (rt_value & 0xFFFFu);
             for (int i = 0; i < 4; i++) {
@@ -3838,17 +3838,6 @@
   }
 }
 
-#define HIGH_16(x) ((x) >> 16)
-#define LOW_16(x) ((x)&0xFFFFu)
-#define COMBINE_32(high, low) ((high) << 16 | (low)&0xFFFFu)
-#define PAIRWISE_OP(x, y, OP) \
-  COMBINE_32(OP(HIGH_16((x)), HIGH_16((y))), OP(LOW_16((x)), LOW_16((y))))
-
-#define ADD_16(x, y) ((x) + (y))
-#define SUB_16(x, y) ((x) - (y))
-#define CEQ_16(x, y) ((x) == (y) ? 0xFFFFu : 0)
-#define TST_16(x, y) (((x) & (y)) != 0 ? 0xFFFFu : 0)
-
 void Simulator::DecodeSpecialCondition(Instruction* instr) {
   switch (instr->SpecialValue()) {
     case 4:
@@ -3881,9 +3870,13 @@
               break;
             }
             case Neon16: {
-              for (int i = 0; i < 4; i++) {
-                src1[i] = PAIRWISE_OP(src1[i], src2[i], ADD_16);
+              uint16_t s1[8], s2[8];
+              memcpy(s1, src1, sizeof(s1));
+              memcpy(s2, src2, sizeof(s2));
+              for (int i = 0; i < 8; i++) {
+                s1[i] += s2[i];
               }
+              memcpy(src1, s1, sizeof(src1));
               break;
             }
             case Neon32: {
@@ -3908,9 +3901,13 @@
               break;
             }
             case Neon16: {
-              for (int i = 0; i < 4; i++) {
-                src1[i] = PAIRWISE_OP(src1[i], src2[i], TST_16);
+              uint16_t s1[8], s2[8];
+              memcpy(s1, src1, sizeof(s1));
+              memcpy(s2, src2, sizeof(s2));
+              for (int i = 0; i < 8; i++) {
+                s1[i] = (s1[i] & s2[i]) != 0 ? 0xFFFFu : 0;
               }
+              memcpy(src1, s1, sizeof(src1));
               break;
             }
             case Neon32: {
@@ -3945,6 +3942,46 @@
           }
         }
         set_q_register(Vd, src1);
+      } else if (instr->Bits(11, 8) == 0x9 && instr->Bit(6) == 1 &&
+                 instr->Bit(4) == 1) {
+        // vmul.i<size> Qd, Qn, Qm.
+        NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        uint32_t src1[4], src2[4];
+        get_q_register(Vn, src1);
+        get_q_register(Vm, src2);
+        switch (size) {
+          case Neon8: {
+            uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
+            uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
+            for (int i = 0; i < 16; i++) {
+              s1[i] *= s2[i];
+            }
+            break;
+          }
+          case Neon16: {
+            uint16_t s1[8], s2[8];
+            memcpy(s1, src1, sizeof(s1));
+            memcpy(s2, src2, sizeof(s2));
+            for (int i = 0; i < 8; i++) {
+              s1[i] *= s2[i];
+            }
+            memcpy(src1, s1, sizeof(src1));
+            break;
+          }
+          case Neon32: {
+            for (int i = 0; i < 4; i++) {
+              src1[i] *= src2[i];
+            }
+            break;
+          }
+          default:
+            UNIMPLEMENTED();
+            break;
+        }
+        set_q_register(Vd, src1);
       } else {
         UNIMPLEMENTED();
       }
@@ -3969,6 +4006,27 @@
           e++;
         }
         set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
+      } else if (instr->Bits(21, 20) == 3 && instr->Bit(4) == 0) {
+        // vext.
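+        // Extract 16 bytes starting at byte imm4 of the pair [Qm:Qn]
+        // (Qn supplies the low bytes).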
+        int imm4 = instr->Bits(11, 8);
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        uint32_t src1[4], src2[4], dst[4];
+        get_q_register(Vn, src1);
+        get_q_register(Vm, src2);
+        uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
+        uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
+        uint8_t* d = reinterpret_cast<uint8_t*>(dst);
+        int boundary = 16 - imm4;
+        int i = 0;
+        for (; i < boundary; i++) {
+          d[i] = s1[i + imm4];
+        }
+        for (; i < 16; i++) {
+          d[i] = s2[i - boundary];
+        }
+        set_q_register(Vd, dst);
       } else {
         UNIMPLEMENTED();
       }
@@ -3993,9 +4051,13 @@
             break;
           }
           case Neon16: {
-            for (int i = 0; i < 4; i++) {
-              src1[i] = PAIRWISE_OP(src1[i], src2[i], SUB_16);
+            uint16_t s1[8], s2[8];
+            memcpy(s1, src1, sizeof(s1));
+            memcpy(s2, src2, sizeof(s2));
+            for (int i = 0; i < 8; i++) {
+              s1[i] -= s2[i];
             }
+            memcpy(src1, s1, sizeof(src1));
             break;
           }
           case Neon32: {
@@ -4028,9 +4090,13 @@
             break;
           }
           case Neon16: {
-            for (int i = 0; i < 4; i++) {
-              src1[i] = PAIRWISE_OP(src1[i], src2[i], CEQ_16);
+            uint16_t s1[8], s2[8];
+            memcpy(s1, src1, sizeof(s1));
+            memcpy(s2, src2, sizeof(s2));
+            for (int i = 0; i < 8; i++) {
+              s1[i] = s1[i] == s2[i] ? 0xffffu : 0;
             }
+            memcpy(src1, s1, sizeof(src1));
             break;
           }
           case Neon32: {
@@ -4065,23 +4131,37 @@
           int Vd = instr->VFPDRegValue(kDoublePrecision);
           int Vn = instr->VFPNRegValue(kDoublePrecision);
           int Vm = instr->VFPMRegValue(kDoublePrecision);
-          uint64_t n_data, m_data;
-          get_d_register(Vn, &n_data);
-          get_d_register(Vm, &m_data);
-          n_data ^= m_data;
-          set_d_register(Vd, &n_data);
+          uint64_t src1, src2;
+          get_d_register(Vn, &src1);
+          get_d_register(Vm, &src2);
+          src1 ^= src2;
+          set_d_register(Vd, &src1);
 
         } else {
           // veor Qd, Qn, Qm
           int Vd = instr->VFPDRegValue(kSimd128Precision);
           int Vn = instr->VFPNRegValue(kSimd128Precision);
           int Vm = instr->VFPMRegValue(kSimd128Precision);
-          uint32_t n_data[4], m_data[4];
-          get_q_register(Vn, n_data);
-          get_q_register(Vm, m_data);
-          for (int i = 0; i < 4; i++) n_data[i] ^= m_data[i];
-          set_q_register(Vd, n_data);
+          uint32_t src1[4], src2[4];
+          get_q_register(Vn, src1);
+          get_q_register(Vm, src2);
+          for (int i = 0; i < 4; i++) src1[i] ^= src2[i];
+          set_q_register(Vd, src1);
         }
+      } else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
+                 instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+        // vmul.f32 Qd, Qn, Qm
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        uint32_t src1[4], src2[4];
+        get_q_register(Vn, src1);
+        get_q_register(Vm, src2);
+        for (int i = 0; i < 4; i++) {
+          src1[i] = bit_cast<uint32_t>(bit_cast<float>(src1[i]) *
+                                       bit_cast<float>(src2[i]));
+        }
+        set_q_register(Vd, src1);
       } else {
         UNIMPLEMENTED();
       }
@@ -4106,106 +4186,314 @@
           e++;
         }
         set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
-      } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
-                 instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
-                 instr->Bit(4) == 0) {
-        // vcvt.<Td>.<Tm> Qd, Qm.
-        int Vd = instr->VFPDRegValue(kSimd128Precision);
-        int Vm = instr->VFPMRegValue(kSimd128Precision);
-        uint32_t q_data[4];
-        get_q_register(Vm, q_data);
-        int op = instr->Bits(8, 7);
-        for (int i = 0; i < 4; i++) {
-          switch (op) {
-            case 0:
-              // f32 <- s32, round towards nearest.
-              q_data[i] = bit_cast<uint32_t>(
-                  std::round(static_cast<float>(bit_cast<int32_t>(q_data[i]))));
-              break;
-            case 1:
-              // f32 <- u32, round towards nearest.
-              q_data[i] =
-                  bit_cast<uint32_t>(std::round(static_cast<float>(q_data[i])));
-              break;
-            case 2:
-              // s32 <- f32, round to zero.
-              q_data[i] = static_cast<uint32_t>(
-                  ConvertDoubleToInt(bit_cast<float>(q_data[i]), false, RZ));
-              break;
-            case 3:
-              // u32 <- f32, round to zero.
-              q_data[i] = static_cast<uint32_t>(
-                  ConvertDoubleToInt(bit_cast<float>(q_data[i]), true, RZ));
-              break;
+      } else if (instr->Opc1Value() == 7 && instr->Bit(4) == 0) {
+        if (instr->Bits(19, 16) == 0xB && instr->Bits(11, 9) == 0x3 &&
+            instr->Bit(6) == 1) {
+          // vcvt.<Td>.<Tm> Qd, Qm.
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          uint32_t q_data[4];
+          get_q_register(Vm, q_data);
+          int op = instr->Bits(8, 7);
+          for (int i = 0; i < 4; i++) {
+            switch (op) {
+              case 0:
+                // f32 <- s32, round towards nearest.
+                q_data[i] = bit_cast<uint32_t>(std::round(
+                    static_cast<float>(bit_cast<int32_t>(q_data[i]))));
+                break;
+              case 1:
+                // f32 <- u32, round towards nearest.
+                q_data[i] = bit_cast<uint32_t>(
+                    std::round(static_cast<float>(q_data[i])));
+                break;
+              case 2:
+                // s32 <- f32, round to zero.
+                q_data[i] = static_cast<uint32_t>(
+                    ConvertDoubleToInt(bit_cast<float>(q_data[i]), false, RZ));
+                break;
+              case 3:
+                // u32 <- f32, round to zero.
+                q_data[i] = static_cast<uint32_t>(
+                    ConvertDoubleToInt(bit_cast<float>(q_data[i]), true, RZ));
+                break;
+            }
           }
-        }
-        set_q_register(Vd, q_data);
-      } else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
-                 (instr->Bit(4) == 0)) {
-        if (instr->Bit(6) == 0) {
-          // vswp Dd, Dm.
-          uint64_t dval, mval;
-          int vd = instr->VFPDRegValue(kDoublePrecision);
+          set_q_register(Vd, q_data);
+        } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
+          if (instr->Bit(6) == 0) {
+            // vswp Dd, Dm.
+            uint64_t dval, mval;
+            int vd = instr->VFPDRegValue(kDoublePrecision);
+            int vm = instr->VFPMRegValue(kDoublePrecision);
+            get_d_register(vd, &dval);
+            get_d_register(vm, &mval);
+            set_d_register(vm, &dval);
+            set_d_register(vd, &mval);
+          } else {
+            // vswp Qd, Qm.
+            uint32_t dval[4], mval[4];
+            int vd = instr->VFPDRegValue(kSimd128Precision);
+            int vm = instr->VFPMRegValue(kSimd128Precision);
+            get_q_register(vd, dval);
+            get_q_register(vm, mval);
+            set_q_register(vm, dval);
+            set_q_register(vd, mval);
+          }
+        } else if (instr->Bits(11, 7) == 0x18) {
+          // vdup.32 Qd, Sm.
+          int vd = instr->VFPDRegValue(kSimd128Precision);
           int vm = instr->VFPMRegValue(kDoublePrecision);
-          get_d_register(vd, &dval);
-          get_d_register(vm, &mval);
-          set_d_register(vm, &dval);
-          set_d_register(vd, &mval);
-        } else {
-          // vswp Qd, Qm.
-          uint32_t dval[4], mval[4];
+          int index = instr->Bit(19);
+          uint32_t s_data = get_s_register(vm * 2 + index);
+          uint32_t q_data[4];
+          for (int i = 0; i < 4; i++) q_data[i] = s_data;
+          set_q_register(vd, q_data);
+        } else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
+          // vmvn Qd, Qm.
           int vd = instr->VFPDRegValue(kSimd128Precision);
           int vm = instr->VFPMRegValue(kSimd128Precision);
-          get_q_register(vd, dval);
-          get_q_register(vm, mval);
-          set_q_register(vm, dval);
-          set_q_register(vd, mval);
-        }
-      } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
-                 instr->Bit(4) == 0x0) {
-        // vdup.32 Qd, Sm.
-        int vd = instr->VFPDRegValue(kSimd128Precision);
-        int vm = instr->VFPMRegValue(kDoublePrecision);
-        int index = instr->Bit(19);
-        uint32_t s_data = get_s_register(vm * 2 + index);
-        uint32_t q_data[4];
-        for (int i = 0; i < 4; i++) q_data[i] = s_data;
-        set_q_register(vd, q_data);
-      } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
-                 instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
-        // vmvn Qd, Qm.
-        int vd = instr->VFPDRegValue(kSimd128Precision);
-        int vm = instr->VFPMRegValue(kSimd128Precision);
-        uint32_t q_data[4];
-        get_q_register(vm, q_data);
-        for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i];
-        set_q_register(vd, q_data);
-      } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
-                 instr->Bit(4) == 0x0) {
-        // vtb[l,x] Dd, <list>, Dm.
-        int vd = instr->VFPDRegValue(kDoublePrecision);
-        int vn = instr->VFPNRegValue(kDoublePrecision);
-        int vm = instr->VFPMRegValue(kDoublePrecision);
-        int table_len = (instr->Bits(9, 8) + 1) * kDoubleSize;
-        bool vtbx = instr->Bit(6) != 0;  // vtbl / vtbx
-        uint64_t destination = 0, indices = 0, result = 0;
-        get_d_register(vd, &destination);
-        get_d_register(vm, &indices);
-        for (int i = 0; i < kDoubleSize; i++) {
-          int shift = i * kBitsPerByte;
-          int index = (indices >> shift) & 0xFF;
-          if (index < table_len) {
-            uint64_t table;
-            get_d_register(vn + index / kDoubleSize, &table);
-            result |= ((table >> ((index % kDoubleSize) * kBitsPerByte)) & 0xFF)
-                      << shift;
-          } else if (vtbx) {
-            result |= destination & (0xFFull << shift);
+          uint32_t q_data[4];
+          get_q_register(vm, q_data);
+          for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i];
+          set_q_register(vd, q_data);
+        } else if (instr->Bits(11, 10) == 0x2) {
+          // vtb[l,x] Dd, <list>, Dm.
+          int vd = instr->VFPDRegValue(kDoublePrecision);
+          int vn = instr->VFPNRegValue(kDoublePrecision);
+          int vm = instr->VFPMRegValue(kDoublePrecision);
+          int table_len = (instr->Bits(9, 8) + 1) * kDoubleSize;
+          bool vtbx = instr->Bit(6) != 0;  // vtbl / vtbx
+          uint64_t destination = 0, indices = 0, result = 0;
+          get_d_register(vd, &destination);
+          get_d_register(vm, &indices);
+          for (int i = 0; i < kDoubleSize; i++) {
+            int shift = i * kBitsPerByte;
+            int index = (indices >> shift) & 0xFF;
+            if (index < table_len) {
+              uint64_t table;
+              get_d_register(vn + index / kDoubleSize, &table);
+              result |=
+                  ((table >> ((index % kDoubleSize) * kBitsPerByte)) & 0xFF)
+                  << shift;
+            } else if (vtbx) {
+              result |= destination & (0xFFull << shift);
+            }
           }
+          set_d_register(vd, &result);
+        } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x7) {
+          // vzip.<size> Qd, Qm.
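+          // Qd gets the interleaved low halves, Qm the interleaved high
+          // halves of the two registers.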
+          NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          uint32_t src1[4], src2[4], dst1[4], dst2[4];
+          get_q_register(Vd, src1);
+          get_q_register(Vm, src2);
+          switch (size) {
+            case Neon8: {
+              uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
+              uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
+              uint8_t* d1 = reinterpret_cast<uint8_t*>(dst1);
+              uint8_t* d2 = reinterpret_cast<uint8_t*>(dst2);
+              for (int i = 0; i < 8; i++) {
+                d1[i * 2] = s1[i];
+                d1[i * 2 + 1] = s2[i];
+                d2[i * 2] = s1[i + 8];
+                d2[i * 2 + 1] = s2[i + 8];
+              }
+              break;
+            }
+            case Neon16: {
+              uint16_t s1[8], s2[8], d1[8], d2[8];
+              memcpy(s1, src1, sizeof(s1));
+              memcpy(s2, src2, sizeof(s2));
+              for (int i = 0; i < 8; i += 2) {
+                d1[i] = s1[i / 2];
+                d1[i + 1] = s2[i / 2];
+                d2[i] = s1[i / 2 + 4];
+                d2[i + 1] = s2[i / 2 + 4];
+              }
+              memcpy(dst1, d1, sizeof(dst1));
+              memcpy(dst2, d2, sizeof(dst2));
+              break;
+            }
+            case Neon32: {
+              for (int i = 0; i < 2; i++) {
+                dst1[i * 2] = src1[i];
+                dst1[i * 2 + 1] = src2[i];
+                dst2[i * 2] = src1[i + 2];
+                dst2[i * 2 + 1] = src2[i + 2];
+              }
+              break;
+            }
+            default:
+              UNREACHABLE();
+              break;
+          }
+          set_q_register(Vd, dst1);
+          set_q_register(Vm, dst2);
+        } else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) {
+          // vrev<op>.size Qd, Qm
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
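+          // Bits(8, 7) is 0 for vrev64, 1 for vrev32, 2 for vrev16.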
+          NeonSize op = static_cast<NeonSize>(static_cast<int>(Neon64) -
+                                              instr->Bits(8, 7));
+          uint32_t src[4];
+          get_q_register(Vm, src);
+          switch (op) {
+            case Neon16: {
+              DCHECK_EQ(Neon8, size);
+              uint8_t* s = reinterpret_cast<uint8_t*>(src);
+              for (int i = 0; i < 16; i += 2) {
+                std::swap(s[i], s[i + 1]);
+              }
+              break;
+            }
+            case Neon32: {
+              switch (size) {
+                case Neon16:
+                  for (int i = 0; i < 4; i++) {
+                    src[i] = (src[i] >> 16) | (src[i] << 16);
+                  }
+                  break;
+                case Neon8: {
+                  uint8_t* s = reinterpret_cast<uint8_t*>(src);
+                  for (int i = 0; i < 4; i++) {
+                    std::swap(s[i * 4], s[i * 4 + 3]);
+                    std::swap(s[i * 4 + 1], s[i * 4 + 2]);
+                  }
+                  break;
+                }
+                default:
+                  UNREACHABLE();
+                  break;
+              }
+              break;
+            }
+            case Neon64: {
+              switch (size) {
+                case Neon32: {
+                  std::swap(src[0], src[1]);
+                  std::swap(src[2], src[3]);
+                  break;
+                }
+                case Neon16: {
+                  for (int i = 0; i <= 2; i += 2) {
+                    uint32_t w1 = src[i];
+                    uint32_t w2 = src[i + 1];
+                    src[i] = (w2 >> 16) | (w2 << 16);
+                    src[i + 1] = (w1 >> 16) | (w1 << 16);
+                  }
+                  break;
+                }
+                case Neon8: {
+                  uint8_t* s = reinterpret_cast<uint8_t*>(src);
+                  for (int i = 0; i < 4; i++) {
+                    std::swap(s[i], s[7 - i]);
+                    std::swap(s[i + 8], s[15 - i]);
+                  }
+                  break;
+                }
+                default:
+                  UNREACHABLE();
+                  break;
+              }
+              break;
+            }
+            default:
+              UNREACHABLE();
+              break;
+          }
+          set_q_register(Vd, src);
+        } else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
+          uint32_t src[4];
+          get_q_register(Vm, src);
+          if (instr->Bits(9, 6) == 0xd) {
+            // vabs<type>.<size> Qd, Qm
+            if (instr->Bit(10) != 0) {
+              // floating point (clear sign bits)
+              for (int i = 0; i < 4; i++) {
+                src[i] &= ~0x80000000;
+              }
+            } else {
+              // signed integer
+              switch (size) {
+                case Neon8: {
+                  int8_t* s = reinterpret_cast<int8_t*>(src);
+                  for (int i = 0; i < 16; i++) {
+                    s[i] = std::abs(s[i]);
+                  }
+                  break;
+                }
+                case Neon16: {
+                  int16_t s[8];
+                  memcpy(s, src, sizeof(s));
+                  for (int i = 0; i < 8; i++) {
+                    s[i] = std::abs(s[i]);
+                  }
+                  memcpy(src, s, sizeof(src));
+                  break;
+                }
+                case Neon32: {
+                  int32_t* as_signed = reinterpret_cast<int32_t*>(src);
+                  for (int i = 0; i < 4; i++) {
+                    as_signed[i] = std::abs(as_signed[i]);
+                  }
+                  break;
+                }
+                default:
+                  UNIMPLEMENTED();
+                  break;
+              }
+            }
+          } else if (instr->Bits(9, 6) == 0xf) {
+            // vneg<type>.<size> Qd, Qm (signed integer)
+            if (instr->Bit(10) != 0) {
+              // floating point (toggle sign bits)
+              for (int i = 0; i < 4; i++) {
+                src[i] ^= 0x80000000;
+              }
+            } else {
+              // signed integer
+              switch (size) {
+                case Neon8: {
+                  int8_t* s = reinterpret_cast<int8_t*>(src);
+                  for (int i = 0; i < 16; i++) {
+                    s[i] = -s[i];
+                  }
+                  break;
+                }
+                case Neon16: {
+                  int16_t s[8];
+                  memcpy(s, src, sizeof(s));
+                  for (int i = 0; i < 8; i++) {
+                    s[i] = -s[i];
+                  }
+                  memcpy(src, s, sizeof(src));
+                  break;
+                }
+                case Neon32: {
+                  int32_t* as_signed = reinterpret_cast<int32_t*>(src);
+                  for (int i = 0; i < 4; i++) {
+                    as_signed[i] = -as_signed[i];
+                  }
+                  break;
+                }
+                default:
+                  UNIMPLEMENTED();
+                  break;
+              }
+            }
+          } else {
+            UNIMPLEMENTED();
+          }
+          set_q_register(Vd, src);
+        } else {
+          UNIMPLEMENTED();
         }
-        set_d_register(vd, &result);
-      } else {
-        UNIMPLEMENTED();
       }
       break;
     case 8:
diff --git a/test/cctest/test-assembler-arm.cc b/test/cctest/test-assembler-arm.cc
index 7873714..95141b7 100644
--- a/test/cctest/test-assembler-arm.cc
+++ b/test/cctest/test-assembler-arm.cc
@@ -1221,6 +1221,18 @@
   CHECK_EQ(kArmNanLower32, bit_cast<int64_t>(t.div_result) & 0xffffffffu);
 }
 
+#define CHECK_EQ_SPLAT(field, ex) \
+  CHECK_EQ(ex, t.field[0]);       \
+  CHECK_EQ(ex, t.field[1]);       \
+  CHECK_EQ(ex, t.field[2]);       \
+  CHECK_EQ(ex, t.field[3]);
+
+#define CHECK_EQ_32X4(field, ex0, ex1, ex2, ex3) \
+  CHECK_EQ(ex0, t.field[0]);                     \
+  CHECK_EQ(ex1, t.field[1]);                     \
+  CHECK_EQ(ex2, t.field[2]);                     \
+  CHECK_EQ(ex3, t.field[3]);
+
 #define INT32_TO_FLOAT(val) \
   std::round(static_cast<float>(bit_cast<int32_t>(val)))
 #define UINT32_TO_FLOAT(val) \
@@ -1259,28 +1271,39 @@
     uint32_t dstA5;
     uint32_t dstA6;
     uint32_t dstA7;
+    uint32_t lane_test[4];
     uint64_t vmov_to_scalar1, vmov_to_scalar2;
     uint32_t vmov_from_scalar_s8, vmov_from_scalar_u8;
     uint32_t vmov_from_scalar_s16, vmov_from_scalar_u16;
     uint32_t vmov_from_scalar_32;
-    uint32_t vmov_src[4], vmov_dst[4], vmvn[4];
+    uint32_t vmov[4], vmvn[4];
     int32_t vcvt_s32_f32[4];
     uint32_t vcvt_u32_f32[4];
     float vcvt_f32_s32[4], vcvt_f32_u32[4];
-    uint32_t vdup1[4], vdup2[4], vdup3[4], vdup4[4];
+    uint32_t vdup8[4], vdup16[4], vdup32[4];
+    float vabsf[4], vnegf[4];
+    uint32_t vabs_s8[4], vabs_s16[4], vabs_s32[4];
+    uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4];
     uint32_t veor[4];
+    float vdupf[4], vaddf[4], vsubf[4], vmulf[4];
     uint32_t vadd8[4], vadd16[4], vadd32[4];
     uint32_t vsub8[4], vsub16[4], vsub32[4];
-    uint32_t vtst[4], vceq[4], vbsl[4], vtbl[2], vtbx[2];
-    float vaddf[4], vsubf[4];
+    uint32_t vmul8[4], vmul16[4], vmul32[4];
+    uint32_t vtst[4], vceq[4], vbsl[4];
+    uint32_t vext[4];
+    uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
+        vzip32b[4];
+    uint32_t vrev64_32[4], vrev64_16[4], vrev64_8[4];
+    uint32_t vrev32_16[4], vrev32_8[4];
+    uint32_t vrev16_8[4];
+    uint32_t vtbl[2], vtbx[2];
   } T;
   T t;
 
   // Create a function that accepts &t, and loads, manipulates, and stores
-  // the doubles and floats.
+  // the doubles, floats, and SIMD values.
   Assembler assm(isolate, NULL, 0);
 
-
   if (CpuFeatures::IsSupported(NEON)) {
     CpuFeatureScope scope(&assm, NEON);
 
@@ -1306,7 +1329,7 @@
     __ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
 
     // ARM core register to scalar.
-    __ mov(r4, Operand(0xFFFFFFF8));
+    __ mov(r4, Operand(0xfffffff8));
     __ vmov(d0, 0);
     __ vmov(NeonS8, d0, 1, r4);
     __ vmov(NeonS16, d0, 1, r4);
@@ -1318,8 +1341,8 @@
     __ vstr(d0, r0, offsetof(T, vmov_to_scalar2));
 
     // Scalar to ARM core register.
-    __ mov(r4, Operand(0xFFFFFF00));
-    __ mov(r5, Operand(0xFFFFFFFF));
+    __ mov(r4, Operand(0xffffff00));
+    __ mov(r5, Operand(0xffffffff));
     __ vmov(d0, r4, r5);
     __ vmov(NeonS8, r4, d0, 1);
     __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_s8)));
@@ -1333,15 +1356,15 @@
     __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_32)));
 
     // vmov for q-registers.
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov_src))));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
     __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
     __ vmov(q1, q0);
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov_dst))));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
     // vmvn.
-    __ mov(r4, Operand(0xFF));
-    __ vdup(Neon16, q0, r4);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+    __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
     __ vmvn(q1, q0);
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmvn))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
@@ -1370,23 +1393,64 @@
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_f32_u32))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
-    // int vdup.
+    // vdup (integer).
     __ mov(r4, Operand(0xa));
     __ vdup(Neon8, q0, r4);
     __ vdup(Neon16, q1, r4);
     __ vdup(Neon32, q2, r4);
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup1))));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup8))));
     __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup2))));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup16))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup3))));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup32))));
     __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
-    // float vdup.
+
+    // vdup (float).
     __ vmov(s0, -1.0);
     __ vdup(q0, s0);
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup4))));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdupf))));
     __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
 
+    // vabs (float).
+    __ vmov(s0, -1.0);
+    __ vmov(s1, -0.0);
+    __ vmov(s2, 0.0);
+    __ vmov(s3, 1.0);
+    __ vabs(q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabsf))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vneg (float).
+    __ vneg(q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vnegf))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+    // vabs (integer).
+    __ mov(r4, Operand(0x7f7f7f7f));
+    __ mov(r5, Operand(0x01010101));
+    __ vmov(d0, r4, r5);
+    __ mov(r4, Operand(0xffffffff));
+    __ mov(r5, Operand(0x80808080));
+    __ vmov(d1, r4, r5);
+    __ vabs(Neon8, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabs_s8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vabs(Neon16, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabs_s16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vabs(Neon32, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabs_s32))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vneg (integer).
+    __ vneg(Neon8, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vneg_s8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vneg(Neon16, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vneg_s16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vneg(Neon32, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vneg_s32))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
     // veor.
     __ mov(r4, Operand(0x00aa));
     __ vdup(Neon16, q0, r4);
@@ -1396,7 +1460,30 @@
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, veor))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
-    // vadd(integer).
+    // vadd (float).
+    __ vmov(s4, 1.0);
+    __ vdup(q0, s4);
+    __ vdup(q1, s4);
+    __ vadd(q1, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vsub (float).
+    __ vmov(s4, 2.0);
+    __ vdup(q0, s4);
+    __ vmov(s4, 1.0);
+    __ vdup(q1, s4);
+    __ vsub(q1, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vmul (float).
+    __ vmov(s4, 2.0);
+    __ vdup(q0, s4);
+    __ vdup(q1, s4);
+    __ vmul(q1, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+    // vadd (integer).
     __ mov(r4, Operand(0x81));
     __ vdup(Neon8, q0, r4);
     __ mov(r4, Operand(0x82));
@@ -1419,44 +1506,44 @@
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vadd32))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
-    // vadd(float).
-    __ vmov(s4, 1.0);
-    __ vdup(q0, s4);
-    __ vdup(q1, s4);
-    __ vadd(q1, q1, q0);
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
-    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
-
-    // vsub(integer).
+    // vsub (integer).
     __ mov(r4, Operand(0x01));
     __ vdup(Neon8, q0, r4);
-    __ mov(r4, Operand(0x02));
+    __ mov(r4, Operand(0x03));
     __ vdup(Neon8, q1, r4);
     __ vsub(Neon8, q1, q0, q1);
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub8))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
     __ mov(r4, Operand(0x0001));
     __ vdup(Neon16, q0, r4);
-    __ mov(r4, Operand(0x0002));
+    __ mov(r4, Operand(0x0003));
     __ vdup(Neon16, q1, r4);
     __ vsub(Neon16, q1, q0, q1);
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub16))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
     __ mov(r4, Operand(0x00000001));
     __ vdup(Neon32, q0, r4);
-    __ mov(r4, Operand(0x00000002));
+    __ mov(r4, Operand(0x00000003));
     __ vdup(Neon32, q1, r4);
     __ vsub(Neon32, q1, q0, q1);
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub32))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
-    // vsub(float).
-    __ vmov(s4, 2.0);
-    __ vdup(q0, s4);
-    __ vmov(s4, 1.0);
-    __ vdup(q1, s4);
-    __ vsub(q1, q1, q0);
-    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
+    // vmul (integer).
+    __ mov(r4, Operand(0x02));
+    __ vdup(Neon8, q0, r4);
+    __ vmul(Neon8, q1, q0, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ mov(r4, Operand(0x0002));
+    __ vdup(Neon16, q0, r4);
+    __ vmul(Neon16, q1, q0, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ mov(r4, Operand(0x00000002));
+    __ vdup(Neon32, q0, r4);
+    __ vmul(Neon32, q1, q0, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul32))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
     // vceq.
@@ -1488,6 +1575,62 @@
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vbsl))));
     __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
 
+    // vext.
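+    // vext.8 extracts 16 bytes from the concatenation q0:q1, starting at
+    // byte 3 of q0; with q1 == q0 this amounts to a byte rotation.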
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+    __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ vmov(q1, q0);
+    __ vext(q2, q0, q1, 3);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vext))));
+    __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+    // vzip.
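+    // vzip interleaves the lanes of its two operands: q0 receives the
+    // interleaved low halves, q1 the interleaved high halves.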
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+    __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ vmov(q1, q0);
+    __ vzip(Neon8, q0, q1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip8a))));
+    __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip8b))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+    __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ vmov(q1, q0);
+    __ vzip(Neon16, q0, q1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip16a))));
+    __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip16b))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+    __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ vmov(q1, q0);
+    __ vzip(Neon32, q0, q1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip32a))));
+    __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip32b))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+    // vrev64/32/16.
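+    // vrev<n>.<size> reverses the order of the <size>-wide elements within
+    // each <n>-bit group, e.g. vrev64.32 swaps the two words of a doubleword.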
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+    __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+    __ vrev64(Neon32, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev64_32))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vrev64(Neon16, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev64_16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vrev64(Neon8, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev64_8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vrev32(Neon16, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev32_16))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vrev32(Neon8, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev32_8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    __ vrev16(Neon8, q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev16_8))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
     // vtb[l/x].
     __ mov(r4, Operand(0x06040200));
     __ mov(r5, Operand(0xff050301));
@@ -1535,8 +1678,10 @@
     t.dstA5 = 0;
     t.dstA6 = 0;
     t.dstA7 = 0;
-    t.vmov_src[0] = t.vmov_src[1] = t.vmov_src[2] = t.vmov_src[3] = 1;
-    t.vmov_dst[0] = t.vmov_dst[1] = t.vmov_dst[2] = t.vmov_dst[3] = 0;
+    t.lane_test[0] = 0x03020100;
+    t.lane_test[1] = 0x07060504;
+    t.lane_test[2] = 0x0b0a0908;
+    t.lane_test[3] = 0x0f0e0d0c;
     Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
     USE(dummy);
 
@@ -1556,6 +1701,7 @@
     CHECK_EQ(0x00410042u, t.dstA5);
     CHECK_EQ(0x00830084u, t.dstA6);
     CHECK_EQ(0x00810082u, t.dstA7);
+
     CHECK_EQ(0xfffffff8fff8f800u, t.vmov_to_scalar1);
     CHECK_EQ(0xfff80000f8000000u, t.vmov_to_scalar2);
     CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s8);
@@ -1563,46 +1709,73 @@
     CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s16);
     CHECK_EQ(0xFFFFu, t.vmov_from_scalar_u16);
     CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_32);
-    CHECK_EQ(1u, t.vmov_dst[0]);
-    CHECK_EQ(1u, t.vmov_dst[1]);
-    CHECK_EQ(1u, t.vmov_dst[2]);
-    CHECK_EQ(1u, t.vmov_dst[3]);
-    CHECK_EQ(-1, t.vcvt_s32_f32[0]);
-    CHECK_EQ(-1, t.vcvt_s32_f32[1]);
-    CHECK_EQ(1, t.vcvt_s32_f32[2]);
-    CHECK_EQ(1, t.vcvt_s32_f32[3]);
-    CHECK_EQ(0u, t.vcvt_u32_f32[0]);
-    CHECK_EQ(0u, t.vcvt_u32_f32[1]);
-    CHECK_EQ(1u, t.vcvt_u32_f32[2]);
-    CHECK_EQ(1u, t.vcvt_u32_f32[3]);
 
+    CHECK_EQ_32X4(vmov, 0x03020100u, 0x07060504u, 0x0b0a0908u, 0x0f0e0d0cu);
+    CHECK_EQ_32X4(vmvn, 0xfcfdfeffu, 0xf8f9fafbu, 0xf4f5f6f7u, 0xf0f1f2f3u);
+
+    CHECK_EQ_SPLAT(vdup8, 0x0a0a0a0au);
+    CHECK_EQ_SPLAT(vdup16, 0x000a000au);
+    CHECK_EQ_SPLAT(vdup32, 0x0000000au);
+    CHECK_EQ_SPLAT(vdupf, -1.0);
+
+    // src: [-1, -1, 1, 1]
+    CHECK_EQ_32X4(vcvt_s32_f32, -1, -1, 1, 1);
+    CHECK_EQ_32X4(vcvt_u32_f32, 0u, 0u, 1u, 1u);
     // src: [kMinInt, kMaxInt, kMaxUInt32, kMinInt + 1]
-    CHECK_EQ(INT32_TO_FLOAT(kMinInt), t.vcvt_f32_s32[0]);
-    CHECK_EQ(INT32_TO_FLOAT(kMaxInt), t.vcvt_f32_s32[1]);
-    CHECK_EQ(INT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_s32[2]);
-    CHECK_EQ(INT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_s32[3]);
-    CHECK_EQ(UINT32_TO_FLOAT(kMinInt), t.vcvt_f32_u32[0]);
-    CHECK_EQ(UINT32_TO_FLOAT(kMaxInt), t.vcvt_f32_u32[1]);
-    CHECK_EQ(UINT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_u32[2]);
-    CHECK_EQ(UINT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_u32[3]);
+    CHECK_EQ_32X4(vcvt_f32_s32, INT32_TO_FLOAT(kMinInt),
+                  INT32_TO_FLOAT(kMaxInt), INT32_TO_FLOAT(kMaxUInt32),
+                  INT32_TO_FLOAT(kMinInt + 1));
+    CHECK_EQ_32X4(vcvt_f32_u32, UINT32_TO_FLOAT(kMinInt),
+                  UINT32_TO_FLOAT(kMaxInt), UINT32_TO_FLOAT(kMaxUInt32),
+                  UINT32_TO_FLOAT(kMinInt + 1));
 
-    for (int i = 0; i < 4; i++) CHECK_EQ(0xFF00FF00, t.vmvn[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x0a0a0a0au, t.vdup1[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x000a000au, t.vdup2[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x0000000au, t.vdup3[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0xbf800000u, t.vdup4[i]);  // -1.0f
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.veor[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(2.0, t.vaddf[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x03030303u, t.vadd8[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x00030003u, t.vadd16[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x00000003u, t.vadd32[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(-1.0, t.vsubf[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub8[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub16[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub32[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vceq[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vtst[i]);
-    for (int i = 0; i < 4; i++) CHECK_EQ(0x02010201u, t.vbsl[i]);
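+    // src: [-1.0, -0.0, 0.0, 1.0]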
+    CHECK_EQ_32X4(vabsf, 1.0, 0.0, 0.0, 1.0);
+    CHECK_EQ_32X4(vnegf, 1.0, 0.0, -0.0, -1.0);
+    // src: [0x7f7f7f7f, 0x01010101, 0xffffffff, 0x80808080]
+    CHECK_EQ_32X4(vabs_s8, 0x7f7f7f7fu, 0x01010101u, 0x01010101u, 0x80808080u);
+    CHECK_EQ_32X4(vabs_s16, 0x7f7f7f7fu, 0x01010101u, 0x00010001u, 0x7f807f80u);
+    CHECK_EQ_32X4(vabs_s32, 0x7f7f7f7fu, 0x01010101u, 0x00000001u, 0x7f7f7f80u);
+    CHECK_EQ_32X4(vneg_s8, 0x81818181u, 0xffffffffu, 0x01010101u, 0x80808080u);
+    CHECK_EQ_32X4(vneg_s16, 0x80818081u, 0xfefffeffu, 0x00010001u, 0x7f807f80u);
+    CHECK_EQ_32X4(vneg_s32, 0x80808081u, 0xfefefeffu, 0x00000001u, 0x7f7f7f80u);
+
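+    // Splat checks: every lane of the result must hold the expected value.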
+    CHECK_EQ_SPLAT(veor, 0x00ff00ffu);
+    CHECK_EQ_SPLAT(vaddf, 2.0);
+    CHECK_EQ_SPLAT(vsubf, -1.0);
+    CHECK_EQ_SPLAT(vmulf, 4.0);
+    CHECK_EQ_SPLAT(vadd8, 0x03030303u);
+    CHECK_EQ_SPLAT(vadd16, 0x00030003u);
+    CHECK_EQ_SPLAT(vadd32, 0x00000003u);
+    CHECK_EQ_SPLAT(vsub8, 0xfefefefeu);
+    CHECK_EQ_SPLAT(vsub16, 0xfffefffeu);
+    CHECK_EQ_SPLAT(vsub32, 0xfffffffeu);
+    CHECK_EQ_SPLAT(vmul8, 0x04040404u);
+    CHECK_EQ_SPLAT(vmul16, 0x00040004u);
+    CHECK_EQ_SPLAT(vmul32, 0x00000004u);
+    CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
+    CHECK_EQ_SPLAT(vtst, 0x00ff00ffu);
+    CHECK_EQ_SPLAT(vbsl, 0x02010201u);
+
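+    // src: 0 1 2 3  4 5 6 7  8 9 a b  c d e f (little endian)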
+    CHECK_EQ_32X4(vext, 0x06050403u, 0x0a090807u, 0x0e0d0c0bu, 0x0201000fu);
+
+    CHECK_EQ_32X4(vzip8a, 0x01010000u, 0x03030202u, 0x05050404u, 0x07070606u);
+    CHECK_EQ_32X4(vzip8b, 0x09090808u, 0x0b0b0a0au, 0x0d0d0c0cu, 0x0f0f0e0eu);
+    CHECK_EQ_32X4(vzip16a, 0x01000100u, 0x03020302u, 0x05040504u, 0x07060706u);
+    CHECK_EQ_32X4(vzip16b, 0x09080908u, 0x0b0a0b0au, 0x0d0c0d0cu, 0x0f0e0f0eu);
+    CHECK_EQ_32X4(vzip32a, 0x03020100u, 0x03020100u, 0x07060504u, 0x07060504u);
+    CHECK_EQ_32X4(vzip32b, 0x0b0a0908u, 0x0b0a0908u, 0x0f0e0d0cu, 0x0f0e0d0cu);
+
+    // src: 0 1 2 3  4 5 6 7  8 9 a b  c d e f (little endian)
+    CHECK_EQ_32X4(vrev64_32, 0x07060504u, 0x03020100u, 0x0f0e0d0cu,
+                  0x0b0a0908u);
+    CHECK_EQ_32X4(vrev64_16, 0x05040706u, 0x01000302u, 0x0d0c0f0eu,
+                  0x09080b0au);
+    CHECK_EQ_32X4(vrev64_8, 0x04050607u, 0x00010203u, 0x0c0d0e0fu, 0x08090a0bu);
+    CHECK_EQ_32X4(vrev32_16, 0x01000302u, 0x05040706u, 0x09080b0au,
+                  0x0d0c0f0eu);
+    CHECK_EQ_32X4(vrev32_8, 0x00010203u, 0x04050607u, 0x08090a0bu, 0x0c0d0e0fu);
+    CHECK_EQ_32X4(vrev16_8, 0x02030001u, 0x06070405u, 0x0a0b0809u, 0x0e0f0c0du);
+
     CHECK_EQ(0x05010400u, t.vtbl[0]);
     CHECK_EQ(0x00030602u, t.vtbl[1]);
     CHECK_EQ(0x05010400u, t.vtbx[0]);
@@ -1610,7 +1783,6 @@
   }
 }
 
-
 TEST(16) {
   // Test the pkh, uxtb, uxtab and uxtb16 instructions.
   CcTest::InitializeVM();
diff --git a/test/cctest/test-disasm-arm.cc b/test/cctest/test-disasm-arm.cc
index 75cd843..e594ab4 100644
--- a/test/cctest/test-disasm-arm.cc
+++ b/test/cctest/test-disasm-arm.cc
@@ -1001,6 +1001,22 @@
               "f3fbe742       vcvt.s32.f32 q15, q1");
       COMPARE(vcvt_u32_f32(q8, q9),
               "f3fb07e2       vcvt.u32.f32 q8, q9");
+      COMPARE(vabs(q0, q1),
+              "f3b90742       vabs.f32 q0, q1");
+      COMPARE(vabs(Neon8, q6, q7),
+              "f3b1c34e       vabs.s8 q6, q7");
+      COMPARE(vabs(Neon16, q0, q1),
+              "f3b50342       vabs.s16 q0, q1");
+      COMPARE(vabs(Neon32, q0, q1),
+              "f3b90342       vabs.s32 q0, q1");
+      COMPARE(vneg(q0, q1),
+              "f3b907c2       vneg.f32 q0, q1");
+      COMPARE(vneg(Neon8, q6, q7),
+              "f3b1c3ce       vneg.s8 q6, q7");
+      COMPARE(vneg(Neon16, q0, q1),
+              "f3b503c2       vneg.s16 q0, q1");
+      COMPARE(vneg(Neon32, q0, q1),
+              "f3b903c2       vneg.s32 q0, q1");
       COMPARE(veor(d0, d1, d2),
               "f3010112       veor d0, d1, d2");
       COMPARE(veor(d0, d30, d31),
@@ -1025,6 +1041,14 @@
               "f3142860       vsub.i16 q1, q2, q8");
       COMPARE(vsub(Neon32, q15, q0, q8),
               "f360e860       vsub.i32 q15, q0, q8");
+      COMPARE(vmul(q0, q1, q2),
+              "f3020d54       vmul.f32 q0, q1, q2");
+      COMPARE(vmul(Neon8, q0, q1, q2),
+              "f2020954       vmul.i8 q0, q1, q2");
+      COMPARE(vmul(Neon16, q1, q2, q8),
+              "f2142970       vmul.i16 q1, q2, q8");
+      COMPARE(vmul(Neon32, q15, q0, q8),
+              "f260e970       vmul.i32 q15, q0, q8");
       COMPARE(vtst(Neon8, q0, q1, q2),
               "f2020854       vtst.i8 q0, q1, q2");
       COMPARE(vtst(Neon16, q1, q2, q8),
@@ -1041,6 +1065,12 @@
               "f3120154       vbsl q0, q1, q2");
       COMPARE(vbsl(q15, q0, q8),
               "f350e170       vbsl q15, q0, q8");
+      COMPARE(vext(q15, q0, q8, 3),
+              "f2f0e360       vext.8 q15, q0, q8, #3");
+      COMPARE(vzip(Neon16, q15, q0),
+              "f3f6e1c0       vzip.16 q15, q0");
+      COMPARE(vrev64(Neon8, q15, q0),
+              "f3f0e040       vrev64.8 q15, q0");
       COMPARE(vtbl(d0, NeonListOperand(d1, 1), d2),
               "f3b10802       vtbl.8 d0, {d1}, d2");
       COMPARE(vtbl(d31, NeonListOperand(d0, 2), d4),
diff --git a/test/cctest/test-macro-assembler-arm.cc b/test/cctest/test-macro-assembler-arm.cc
index 6b69296..63919a4 100644
--- a/test/cctest/test-macro-assembler-arm.cc
+++ b/test/cctest/test-macro-assembler-arm.cc
@@ -379,4 +379,115 @@
   }
 }
 
+#define CHECK_EQ_32X4(field, v0, v1, v2, v3) \
+  CHECK_EQ(v0, t.field[0]);                  \
+  CHECK_EQ(v1, t.field[1]);                  \
+  CHECK_EQ(v2, t.field[2]);                  \
+  CHECK_EQ(v3, t.field[3]);
+
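+// Tests the MacroAssembler Swizzle function, which permutes the lanes of a
+// Q register according to a packed hex control word.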
+TEST(Swizzle) {
+  if (!CpuFeatures::IsSupported(NEON)) return;
+
+  // Allocate an executable page of memory.
+  size_t actual_size;
+  byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
+      Assembler::kMinimalBufferSize, &actual_size, true));
+  CHECK(buffer);
+  Isolate* isolate = CcTest::i_isolate();
+  HandleScope handles(isolate);
+  MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
+                           v8::internal::CodeObjectRequired::kYes);
+  MacroAssembler* masm = &assembler;  // Create a pointer for the __ macro.
+
+  typedef struct {
+    int32_t _32x4_3210[4];  // identity
+    int32_t _32x4_1032[4];  // high / low swap
+    int32_t _32x4_0000[4];  // vdups
+    int32_t _32x4_1111[4];
+    int32_t _32x4_2222[4];
+    int32_t _32x4_3333[4];
+    int32_t _32x4_2103[4];           // rotate left
+    int32_t _32x4_0321[4];           // rotate right
+    int32_t _32x4_1132[4];           // irregular
+    int32_t _32x4_1132_in_place[4];  // irregular, in-place
+  } T;
+  T t;
+
+  __ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());
+
+  const Register kScratch = r5;
+
+  // Make the test vector [0, 1, 2, 3].
+  __ veor(q1, q1, q1);  // Zero
+  for (int i = 0; i < 4; i++) {
+    __ mov(r4, Operand(i));
+    __ ReplaceLane(q1, q1, r4, NeonS32, i);
+  }
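+  // Each hex digit of the control word selects the source lane for the
+  // corresponding destination lane (least-significant digit -> lane 0),
+  // so 0x3210 is the identity permutation.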
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x3210);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_3210))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x1032);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_1032))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x0000);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_0000))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x1111);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_1111))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x2222);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_2222))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x3333);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_3333))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x2103);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_2103))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x0321);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_0321))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ Swizzle(q0, q1, kScratch, Neon32, 0x1132);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_1132))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ vmov(q0, q1);
+  __ Swizzle(q0, q0, kScratch, Neon32, 0x1132);
+  __ add(r4, r0,
+         Operand(static_cast<int32_t>(offsetof(T, _32x4_1132_in_place))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());
+
+  CodeDesc desc;
+  masm->GetCode(&desc);
+  Handle<Code> code = isolate->factory()->NewCode(
+      desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
+#ifdef DEBUG
+  OFStream os(stdout);
+  code->Print(os);
+#endif
+  F3 f = FUNCTION_CAST<F3>(code->entry());
+  Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
+  USE(dummy);
+  CHECK_EQ_32X4(_32x4_3210, 0, 1, 2, 3);
+  CHECK_EQ_32X4(_32x4_1032, 2, 3, 0, 1);
+  CHECK_EQ_32X4(_32x4_0000, 0, 0, 0, 0);
+  CHECK_EQ_32X4(_32x4_1111, 1, 1, 1, 1);
+  CHECK_EQ_32X4(_32x4_2222, 2, 2, 2, 2);
+  CHECK_EQ_32X4(_32x4_3333, 3, 3, 3, 3);
+  CHECK_EQ_32X4(_32x4_2103, 3, 0, 1, 2);
+  CHECK_EQ_32X4(_32x4_0321, 1, 2, 3, 0);
+  CHECK_EQ_32X4(_32x4_1132, 2, 3, 1, 1);
+  CHECK_EQ_32X4(_32x4_1132_in_place, 2, 3, 1, 1);
+}
+
 #undef __