[ARM] Add NEON instructions for implementing SIMD.
- Adds vabs, vneg, vmul, vext, vzip, vrev instructions.
- Adds Swizzle function to macro assembler.
- Simplifies if-else logic in disassembler, simulator, for Neon special.
- Some refactoring of Neon assembler, macro-assembler tests.
LOG=N
BUG=v8:4124
Review-Url: https://codereview.chromium.org/2579913002
Cr-Commit-Position: refs/heads/master@{#41781}
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index aa6be21..4a76b09 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -4074,6 +4074,50 @@
emit(EncodeNeonVCVT(U32, dst, F32, src));
}
+// op is instr->Bits(11, 7).
+static Instr EncodeNeonUnaryOp(int op, bool is_float, NeonSize size,
+ const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK_IMPLIES(is_float, size == Neon32);
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ src.split_code(&vm, &m);
+ int F = is_float ? 1 : 0;
+ return 0x1E7U * B23 | d * B22 | 0x3 * B20 | size * B18 | B16 | vd * B12 |
+ F * B10 | B8 | op * B7 | B6 | m * B5 | vm;
+}
+
+void Assembler::vabs(const QwNeonRegister dst, const QwNeonRegister src) {
+ // Qd = vabs.f<size>(Qm) SIMD floating point absolute value.
+ // Instruction details available in ARM DDI 0406C.b, A8.8.824.
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonUnaryOp(0x6, true, Neon32, dst, src));
+}
+
+void Assembler::vabs(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ // Qd = vabs.s<size>(Qm) SIMD integer absolute value.
+ // Instruction details available in ARM DDI 0406C.b, A8.8.824.
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonUnaryOp(0x6, false, size, dst, src));
+}
+
+void Assembler::vneg(const QwNeonRegister dst, const QwNeonRegister src) {
+ // Qd = vneg.f<size>(Qm) SIMD floating point negate.
+ // Instruction details available in ARM DDI 0406C.b, A8.8.968.
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonUnaryOp(0x7, true, Neon32, dst, src));
+}
+
+void Assembler::vneg(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ // Qd = vneg.s<size>(Qm) SIMD integer negate.
+ // Instruction details available in ARM DDI 0406C.b, A8.8.968.
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonUnaryOp(0x7, false, size, dst, src));
+}
+
void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
// Dd = veor(Dn, Dm) 64 bit integer exclusive OR.
@@ -4166,6 +4210,37 @@
n * B7 | B6 | m * B5 | vm);
}
+void Assembler::vmul(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vmul(Qn, Qm) SIMD floating point multiply.
+ // Instruction details available in ARM DDI 0406C.b, A8-958.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ emit(0x1E6U * B23 | d * B22 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 | B6 |
+ m * B5 | B4 | vm);
+}
+
+void Assembler::vmul(NeonSize size, QwNeonRegister dst,
+ const QwNeonRegister src1, const QwNeonRegister src2) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vmul(Qn, Qm) SIMD integer multiply.
+ // Instruction details available in ARM DDI 0406C.b, A8-960.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ emit(0x1E4U * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x9 * B8 |
+ n * B7 | B6 | m * B5 | B4 | vm);
+}
+
void Assembler::vtst(NeonSize size, QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
@@ -4185,7 +4260,7 @@
void Assembler::vceq(NeonSize size, QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
- // Qd = vceq(Qn, Qm) SIMD integer compare equal.
+ // Qd = vceq(Qn, Qm) SIMD bitwise compare equal.
// Instruction details available in ARM DDI 0406C.b, A8-844.
int vd, d;
dst.split_code(&vd, &d);
@@ -4214,6 +4289,70 @@
n * B7 | B6 | m * B5 | B4 | vm);
}
+void Assembler::vext(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2, int bytes) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vext(Qn, Qm) SIMD byte extract.
+ // Instruction details available in ARM DDI 0406C.b, A8-890.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ DCHECK_GT(16, bytes);
+ emit(0x1E5U * B23 | d * B22 | 0x3 * B20 | vn * B16 | vd * B12 | bytes * B8 |
+ n * B7 | B6 | m * B5 | vm);
+}
+
+void Assembler::vzip(NeonSize size, QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vzip.<size>(Qm) SIMD zip (interleave).
+ // Instruction details available in ARM DDI 0406C.b, A8-1102.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ src.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | sz * B18 | 2 * B16 | vd * B12 |
+ 0x3 * B7 | B6 | m * B5 | vm);
+}
+
+static Instr EncodeNeonVREV(NeonSize op_size, NeonSize size,
+ const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ // Qd = vrev<op_size>.<size>(Qn, Qm) SIMD scalar reverse.
+ // Instruction details available in ARM DDI 0406C.b, A8-1028.
+ DCHECK_GT(op_size, static_cast<int>(size));
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ src.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ int op = static_cast<int>(Neon64) - static_cast<int>(op_size);
+ return 0x1E7U * B23 | d * B22 | 0x3 * B20 | sz * B18 | vd * B12 | op * B7 |
+ B6 | m * B5 | vm;
+}
+
+void Assembler::vrev16(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonVREV(Neon16, size, dst, src));
+}
+
+void Assembler::vrev32(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonVREV(Neon32, size, dst, src));
+}
+
+void Assembler::vrev64(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ emit(EncodeNeonVREV(Neon64, size, dst, src));
+}
+
// Encode NEON vtbl / vtbx instruction.
static Instr EncodeNeonVTB(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index, bool vtbx) {
diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
index 235d80b..7bb92b4 100644
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1362,6 +1362,10 @@
void vcvt_s32_f32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_u32_f32(const QwNeonRegister dst, const QwNeonRegister src);
+ void vabs(const QwNeonRegister dst, const QwNeonRegister src);
+ void vabs(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
+ void vneg(const QwNeonRegister dst, const QwNeonRegister src);
+ void vneg(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
void veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
void veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vadd(const QwNeonRegister dst, const QwNeonRegister src1,
@@ -1372,12 +1376,25 @@
const QwNeonRegister src2);
void vsub(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
+ void vmul(const QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2);
+ void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2);
void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vceq(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vbsl(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
+ void vext(const QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2, int bytes);
+ void vzip(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
+ void vrev16(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src);
+ void vrev32(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src);
+ void vrev64(NeonSize size, const QwNeonRegister dst,
+ const QwNeonRegister src);
void vtbl(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index);
void vtbx(const DwVfpRegister dst, const NeonListOperand& list,
diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc
index 7a42386..ef99d53 100644
--- a/src/arm/disasm-arm.cc
+++ b/src/arm/disasm-arm.cc
@@ -1883,6 +1883,15 @@
// vadd/vsub.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
+ } else if (instr->Bits(11, 8) == 0x9 && instr->Bit(6) == 1 &&
+ instr->Bit(4) == 1) {
+ int size = kBitsPerByte * (1 << instr->Bits(21, 20));
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int Vn = instr->VFPNRegValue(kSimd128Precision);
+ // vmul.i<size> Qd, Qm, Qn.
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vmul.i%d q%d, q%d, q%d", size, Vd, Vn, Vm);
} else {
Unknown(instr);
}
@@ -1897,6 +1906,15 @@
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.s%d q%d, d%d", imm3*8, Vd, Vm);
+ } else if (instr->Bits(21, 20) == 3 && instr->Bit(4) == 0) {
+ // vext.8 Qd, Qm, Qn, imm4
+ int imm4 = instr->Bits(11, 8);
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int Vn = instr->VFPNRegValue(kSimd128Precision);
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vext.8 q%d, q%d, q%d, #%d",
+ Vd, Vn, Vm, imm4);
} else {
Unknown(instr);
}
@@ -1941,6 +1959,14 @@
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"veor q%d, q%d, q%d", Vd, Vn, Vm);
}
+ } else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
+ instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+ // vmul.f32 Qd, Qn, Qm
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vn = instr->VFPNRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
} else {
Unknown(instr);
}
@@ -1955,68 +1981,102 @@
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.u%d q%d, d%d", imm3*8, Vd, Vm);
- } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
- instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
- int Vd = instr->VFPDRegValue(kSimd128Precision);
- int Vm = instr->VFPMRegValue(kSimd128Precision);
- out_buffer_pos_ +=
- SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
- } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
- instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
+ } else if (instr->Opc1Value() == 7 && instr->Bits(21, 20) == 0x3 &&
instr->Bit(4) == 0) {
- int Vd = instr->VFPDRegValue(kSimd128Precision);
- int Vm = instr->VFPMRegValue(kSimd128Precision);
- const char* suffix = nullptr;
- int op = instr->Bits(8, 7);
- switch (op) {
- case 0:
- suffix = "f32.s32";
- break;
- case 1:
- suffix = "f32.u32";
- break;
- case 2:
- suffix = "s32.f32";
- break;
- case 3:
- suffix = "u32.f32";
- break;
- }
- out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
- "vcvt.%s q%d, q%d", suffix, Vd, Vm);
- } else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
- (instr->Bit(4) == 0)) {
- if (instr->Bit(6) == 0) {
- int Vd = instr->VFPDRegValue(kDoublePrecision);
+ if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
+ if (instr->Bit(6) == 0) {
+ int Vd = instr->VFPDRegValue(kDoublePrecision);
+ int Vm = instr->VFPMRegValue(kDoublePrecision);
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vswp d%d, d%d", Vd, Vm);
+ } else {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vswp q%d, q%d", Vd, Vm);
+ }
+ } else if (instr->Bits(11, 7) == 0x18) {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
- out_buffer_pos_ +=
- SNPrintF(out_buffer_ + out_buffer_pos_, "vswp d%d, d%d", Vd, Vm);
- } else {
+ int index = instr->Bit(19);
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vdup q%d, d%d[%d]", Vd, Vm, index);
+ } else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
- SNPrintF(out_buffer_ + out_buffer_pos_, "vswp q%d, q%d", Vd, Vm);
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
+ } else if (instr->Bits(19, 16) == 0xB && instr->Bits(11, 9) == 0x3 &&
+ instr->Bit(6) == 1) {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ const char* suffix = nullptr;
+ int op = instr->Bits(8, 7);
+ switch (op) {
+ case 0:
+ suffix = "f32.s32";
+ break;
+ case 1:
+ suffix = "f32.u32";
+ break;
+ case 2:
+ suffix = "s32.f32";
+ break;
+ case 3:
+ suffix = "u32.f32";
+ break;
+ }
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vcvt.%s q%d, q%d", suffix, Vd, Vm);
+ } else if (instr->Bits(11, 10) == 0x2) {
+ int Vd = instr->VFPDRegValue(kDoublePrecision);
+ int Vn = instr->VFPNRegValue(kDoublePrecision);
+ int Vm = instr->VFPMRegValue(kDoublePrecision);
+ int len = instr->Bits(9, 8);
+ NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
+ instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
+ FormatNeonList(Vn, list.type());
+ Print(", ");
+ PrintDRegister(Vm);
+ } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x7) {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int size = kBitsPerByte * (1 << instr->Bits(19, 18));
+ // vzip.<size> Qd, Qm.
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vzip.%d q%d, q%d", size, Vd, Vm);
+ } else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int size = kBitsPerByte * (1 << instr->Bits(19, 18));
+ int op = kBitsPerByte
+ << (static_cast<int>(Neon64) - instr->Bits(8, 7));
+ // vrev<op>.<size> Qd, Qm.
+ out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+ "vrev%d.%d q%d, q%d", op, size, Vd, Vm);
+ } else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int size = kBitsPerByte * (1 << instr->Bits(19, 18));
+ const char* type = instr->Bit(10) != 0 ? "f" : "s";
+ if (instr->Bits(9, 6) == 0xd) {
+ // vabs<type>.<size> Qd, Qm.
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vabs.%s%d q%d, q%d",
+ type, size, Vd, Vm);
+ } else if (instr->Bits(9, 6) == 0xf) {
+ // vneg<type>.<size> Qd, Qm.
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vneg.%s%d q%d, q%d",
+ type, size, Vd, Vm);
+ } else {
+ Unknown(instr);
+ }
+ } else {
+ Unknown(instr);
}
- } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
- instr->Bit(4) == 0x0) {
- int Vd = instr->VFPDRegValue(kSimd128Precision);
- int Vm = instr->VFPMRegValue(kDoublePrecision);
- int index = instr->Bit(19);
- out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
- "vdup q%d, d%d[%d]", Vd, Vm, index);
- } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
- instr->Bit(4) == 0x0) {
- int Vd = instr->VFPDRegValue(kDoublePrecision);
- int Vn = instr->VFPNRegValue(kDoublePrecision);
- int Vm = instr->VFPMRegValue(kDoublePrecision);
- int len = instr->Bits(9, 8);
- NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
- out_buffer_pos_ +=
- SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
- instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
- FormatNeonList(Vn, list.type());
- Print(", ");
- PrintDRegister(Vm);
} else {
Unknown(instr);
}
diff --git a/src/arm/macro-assembler-arm.cc b/src/arm/macro-assembler-arm.cc
index 8363d5e..5f4e492 100644
--- a/src/arm/macro-assembler-arm.cc
+++ b/src/arm/macro-assembler-arm.cc
@@ -1185,6 +1185,64 @@
VmovExtended(s_code, src_lane.code(), scratch);
}
+void MacroAssembler::Swizzle(QwNeonRegister dst, QwNeonRegister src,
+ Register scratch, NeonSize size, uint32_t lanes) {
+ // TODO(bbudge) Handle Int16x8, Int8x16 vectors.
+ DCHECK_EQ(Neon32, size);
+ DCHECK_IMPLIES(size == Neon32, lanes < 0xFFFFu);
+ if (size == Neon32) {
+ switch (lanes) {
+ // TODO(bbudge) Handle more special cases.
+ case 0x3210: // Identity.
+ Move(dst, src);
+ return;
+ case 0x1032: // Swap top and bottom.
+ vext(dst, src, src, 8);
+ return;
+ case 0x2103: // Rotation.
+ vext(dst, src, src, 12);
+ return;
+ case 0x0321: // Rotation.
+ vext(dst, src, src, 4);
+ return;
+ case 0x0000: // Equivalent to vdup.
+ case 0x1111:
+ case 0x2222:
+ case 0x3333: {
+ int lane_code = src.code() * 4 + (lanes & 0xF);
+ if (lane_code >= SwVfpRegister::kMaxNumRegisters) {
+ // TODO(bbudge) use vdup (vdup.32 dst, D<src>[lane]) once implemented.
+ int temp_code = kScratchDoubleReg.code() * 2;
+ VmovExtended(temp_code, lane_code, scratch);
+ lane_code = temp_code;
+ }
+ vdup(dst, SwVfpRegister::from_code(lane_code));
+ return;
+ }
+ case 0x2301: // Swap lanes 0, 1 and lanes 2, 3.
+ vrev64(Neon32, dst, src);
+ return;
+ default: // Handle all other cases with vmovs.
+ int src_code = src.code() * 4;
+ int dst_code = dst.code() * 4;
+ bool in_place = src.is(dst);
+ if (in_place) {
+ vmov(kScratchQuadReg, src);
+ src_code = kScratchQuadReg.code() * 4;
+ }
+ for (int i = 0; i < 4; i++) {
+ int lane = (lanes >> (i * 4) & 0xF);
+ VmovExtended(dst_code + i, src_code + lane, scratch);
+ }
+ if (in_place) {
+ // Restore zero reg.
+ veor(kDoubleRegZero, kDoubleRegZero, kDoubleRegZero);
+ }
+ return;
+ }
+ }
+}
+
void MacroAssembler::LslPair(Register dst_low, Register dst_high,
Register src_low, Register src_high,
Register scratch, Register shift) {
diff --git a/src/arm/macro-assembler-arm.h b/src/arm/macro-assembler-arm.h
index 5a0a2b6..1bee1ed 100644
--- a/src/arm/macro-assembler-arm.h
+++ b/src/arm/macro-assembler-arm.h
@@ -568,6 +568,8 @@
NeonDataType dt, int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch, int lane);
+ void Swizzle(QwNeonRegister dst, QwNeonRegister src, Register scratch,
+ NeonSize size, uint32_t lanes);
void LslPair(Register dst_low, Register dst_high, Register src_low,
Register src_high, Register scratch, Register shift);
diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
index 1265483..8872010 100644
--- a/src/arm/simulator-arm.cc
+++ b/src/arm/simulator-arm.cc
@@ -3335,7 +3335,7 @@
break;
}
case Neon16: {
- // Perform pairwise ops instead of casting to uint16_t.
+ // Perform pairwise op.
rt_value &= 0xFFFFu;
uint32_t rt_rt = (rt_value << 16) | (rt_value & 0xFFFFu);
for (int i = 0; i < 4; i++) {
@@ -3838,17 +3838,6 @@
}
}
-#define HIGH_16(x) ((x) >> 16)
-#define LOW_16(x) ((x)&0xFFFFu)
-#define COMBINE_32(high, low) ((high) << 16 | (low)&0xFFFFu)
-#define PAIRWISE_OP(x, y, OP) \
- COMBINE_32(OP(HIGH_16((x)), HIGH_16((y))), OP(LOW_16((x)), LOW_16((y))))
-
-#define ADD_16(x, y) ((x) + (y))
-#define SUB_16(x, y) ((x) - (y))
-#define CEQ_16(x, y) ((x) == (y) ? 0xFFFFu : 0)
-#define TST_16(x, y) (((x) & (y)) != 0 ? 0xFFFFu : 0)
-
void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
case 4:
@@ -3881,9 +3870,13 @@
break;
}
case Neon16: {
- for (int i = 0; i < 4; i++) {
- src1[i] = PAIRWISE_OP(src1[i], src2[i], ADD_16);
+ uint16_t s1[8], s2[8];
+ memcpy(s1, src1, sizeof(s1));
+ memcpy(s2, src2, sizeof(s2));
+ for (int i = 0; i < 8; i++) {
+ s1[i] += s2[i];
}
+ memcpy(src1, s1, sizeof(src1));
break;
}
case Neon32: {
@@ -3908,9 +3901,13 @@
break;
}
case Neon16: {
- for (int i = 0; i < 4; i++) {
- src1[i] = PAIRWISE_OP(src1[i], src2[i], TST_16);
+ uint16_t s1[8], s2[8];
+ memcpy(s1, src1, sizeof(s1));
+ memcpy(s2, src2, sizeof(s2));
+ for (int i = 0; i < 8; i++) {
+ s1[i] = (s1[i] & s2[i]) != 0 ? 0xFFFFu : 0;
}
+ memcpy(src1, s1, sizeof(src1));
break;
}
case Neon32: {
@@ -3945,6 +3942,46 @@
}
}
set_q_register(Vd, src1);
+ } else if (instr->Bits(11, 8) == 0x9 && instr->Bit(6) == 1 &&
+ instr->Bit(4) == 1) {
+ // vmul.i<size> Qd, Qm, Qn.
+ NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int Vn = instr->VFPNRegValue(kSimd128Precision);
+ uint32_t src1[4], src2[4];
+ get_q_register(Vn, src1);
+ get_q_register(Vm, src2);
+ switch (size) {
+ case Neon8: {
+ uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
+ uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
+ for (int i = 0; i < 16; i++) {
+ s1[i] *= s2[i];
+ }
+ break;
+ }
+ case Neon16: {
+ uint16_t s1[8], s2[8];
+ memcpy(s1, src1, sizeof(s1));
+ memcpy(s2, src2, sizeof(s2));
+ for (int i = 0; i < 8; i++) {
+ s1[i] *= s2[i];
+ }
+ memcpy(src1, s1, sizeof(src1));
+ break;
+ }
+ case Neon32: {
+ for (int i = 0; i < 4; i++) {
+ src1[i] *= src2[i];
+ }
+ break;
+ }
+ default:
+ UNIMPLEMENTED();
+ break;
+ }
+ set_q_register(Vd, src1);
} else {
UNIMPLEMENTED();
}
@@ -3969,6 +4006,27 @@
e++;
}
set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
+ } else if (instr->Bits(21, 20) == 3 && instr->Bit(4) == 0) {
+ // vext.
+ int imm4 = instr->Bits(11, 8);
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int Vn = instr->VFPNRegValue(kSimd128Precision);
+ uint32_t src1[4], src2[4], dst[4];
+ get_q_register(Vn, src1);
+ get_q_register(Vm, src2);
+ uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
+ uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
+ uint8_t* d = reinterpret_cast<uint8_t*>(dst);
+ int boundary = 16 - imm4;
+ int i = 0;
+ for (; i < boundary; i++) {
+ d[i] = s1[i + imm4];
+ }
+ for (; i < 16; i++) {
+ d[i] = s2[i - boundary];
+ }
+ set_q_register(Vd, dst);
} else {
UNIMPLEMENTED();
}
@@ -3993,9 +4051,13 @@
break;
}
case Neon16: {
- for (int i = 0; i < 4; i++) {
- src1[i] = PAIRWISE_OP(src1[i], src2[i], SUB_16);
+ uint16_t s1[8], s2[8];
+ memcpy(s1, src1, sizeof(s1));
+ memcpy(s2, src2, sizeof(s2));
+ for (int i = 0; i < 8; i++) {
+ s1[i] -= s2[i];
}
+ memcpy(src1, s1, sizeof(src1));
break;
}
case Neon32: {
@@ -4028,9 +4090,13 @@
break;
}
case Neon16: {
- for (int i = 0; i < 4; i++) {
- src1[i] = PAIRWISE_OP(src1[i], src2[i], CEQ_16);
+ uint16_t s1[8], s2[8];
+ memcpy(s1, src1, sizeof(s1));
+ memcpy(s2, src2, sizeof(s2));
+ for (int i = 0; i < 8; i++) {
+ s1[i] = s1[i] == s2[i] ? 0xffffu : 0;
}
+ memcpy(src1, s1, sizeof(src1));
break;
}
case Neon32: {
@@ -4065,23 +4131,37 @@
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
- uint64_t n_data, m_data;
- get_d_register(Vn, &n_data);
- get_d_register(Vm, &m_data);
- n_data ^= m_data;
- set_d_register(Vd, &n_data);
+ uint64_t src1, src2;
+ get_d_register(Vn, &src1);
+ get_d_register(Vm, &src2);
+ src1 ^= src2;
+ set_d_register(Vd, &src1);
} else {
// veor Qd, Qn, Qm
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
- uint32_t n_data[4], m_data[4];
- get_q_register(Vn, n_data);
- get_q_register(Vm, m_data);
- for (int i = 0; i < 4; i++) n_data[i] ^= m_data[i];
- set_q_register(Vd, n_data);
+ uint32_t src1[4], src2[4];
+ get_q_register(Vn, src1);
+ get_q_register(Vm, src2);
+ for (int i = 0; i < 4; i++) src1[i] ^= src2[i];
+ set_q_register(Vd, src1);
}
+ } else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
+ instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+ // vmul.f32 Qd, Qn, Qm
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vn = instr->VFPNRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ uint32_t src1[4], src2[4];
+ get_q_register(Vn, src1);
+ get_q_register(Vm, src2);
+ for (int i = 0; i < 4; i++) {
+ src1[i] = bit_cast<uint32_t>(bit_cast<float>(src1[i]) *
+ bit_cast<float>(src2[i]));
+ }
+ set_q_register(Vd, src1);
} else {
UNIMPLEMENTED();
}
@@ -4106,106 +4186,314 @@
e++;
}
set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
- } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
- instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
- instr->Bit(4) == 0) {
- // vcvt.<Td>.<Tm> Qd, Qm.
- int Vd = instr->VFPDRegValue(kSimd128Precision);
- int Vm = instr->VFPMRegValue(kSimd128Precision);
- uint32_t q_data[4];
- get_q_register(Vm, q_data);
- int op = instr->Bits(8, 7);
- for (int i = 0; i < 4; i++) {
- switch (op) {
- case 0:
- // f32 <- s32, round towards nearest.
- q_data[i] = bit_cast<uint32_t>(
- std::round(static_cast<float>(bit_cast<int32_t>(q_data[i]))));
- break;
- case 1:
- // f32 <- u32, round towards nearest.
- q_data[i] =
- bit_cast<uint32_t>(std::round(static_cast<float>(q_data[i])));
- break;
- case 2:
- // s32 <- f32, round to zero.
- q_data[i] = static_cast<uint32_t>(
- ConvertDoubleToInt(bit_cast<float>(q_data[i]), false, RZ));
- break;
- case 3:
- // u32 <- f32, round to zero.
- q_data[i] = static_cast<uint32_t>(
- ConvertDoubleToInt(bit_cast<float>(q_data[i]), true, RZ));
- break;
+ } else if (instr->Opc1Value() == 7 && instr->Bit(4) == 0) {
+ if (instr->Bits(19, 16) == 0xB && instr->Bits(11, 9) == 0x3 &&
+ instr->Bit(6) == 1) {
+ // vcvt.<Td>.<Tm> Qd, Qm.
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ uint32_t q_data[4];
+ get_q_register(Vm, q_data);
+ int op = instr->Bits(8, 7);
+ for (int i = 0; i < 4; i++) {
+ switch (op) {
+ case 0:
+ // f32 <- s32, round towards nearest.
+ q_data[i] = bit_cast<uint32_t>(std::round(
+ static_cast<float>(bit_cast<int32_t>(q_data[i]))));
+ break;
+ case 1:
+ // f32 <- u32, round towards nearest.
+ q_data[i] = bit_cast<uint32_t>(
+ std::round(static_cast<float>(q_data[i])));
+ break;
+ case 2:
+ // s32 <- f32, round to zero.
+ q_data[i] = static_cast<uint32_t>(
+ ConvertDoubleToInt(bit_cast<float>(q_data[i]), false, RZ));
+ break;
+ case 3:
+ // u32 <- f32, round to zero.
+ q_data[i] = static_cast<uint32_t>(
+ ConvertDoubleToInt(bit_cast<float>(q_data[i]), true, RZ));
+ break;
+ }
}
- }
- set_q_register(Vd, q_data);
- } else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
- (instr->Bit(4) == 0)) {
- if (instr->Bit(6) == 0) {
- // vswp Dd, Dm.
- uint64_t dval, mval;
- int vd = instr->VFPDRegValue(kDoublePrecision);
+ set_q_register(Vd, q_data);
+ } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
+ if (instr->Bit(6) == 0) {
+ // vswp Dd, Dm.
+ uint64_t dval, mval;
+ int vd = instr->VFPDRegValue(kDoublePrecision);
+ int vm = instr->VFPMRegValue(kDoublePrecision);
+ get_d_register(vd, &dval);
+ get_d_register(vm, &mval);
+ set_d_register(vm, &dval);
+ set_d_register(vd, &mval);
+ } else {
+ // vswp Qd, Qm.
+ uint32_t dval[4], mval[4];
+ int vd = instr->VFPDRegValue(kSimd128Precision);
+ int vm = instr->VFPMRegValue(kSimd128Precision);
+ get_q_register(vd, dval);
+ get_q_register(vm, mval);
+ set_q_register(vm, dval);
+ set_q_register(vd, mval);
+ }
+ } else if (instr->Bits(11, 7) == 0x18) {
+ // vdup.32 Qd, Sm.
+ int vd = instr->VFPDRegValue(kSimd128Precision);
int vm = instr->VFPMRegValue(kDoublePrecision);
- get_d_register(vd, &dval);
- get_d_register(vm, &mval);
- set_d_register(vm, &dval);
- set_d_register(vd, &mval);
- } else {
- // vswp Qd, Qm.
- uint32_t dval[4], mval[4];
+ int index = instr->Bit(19);
+ uint32_t s_data = get_s_register(vm * 2 + index);
+ uint32_t q_data[4];
+ for (int i = 0; i < 4; i++) q_data[i] = s_data;
+ set_q_register(vd, q_data);
+ } else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
+ // vmvn Qd, Qm.
int vd = instr->VFPDRegValue(kSimd128Precision);
int vm = instr->VFPMRegValue(kSimd128Precision);
- get_q_register(vd, dval);
- get_q_register(vm, mval);
- set_q_register(vm, dval);
- set_q_register(vd, mval);
- }
- } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
- instr->Bit(4) == 0x0) {
- // vdup.32 Qd, Sm.
- int vd = instr->VFPDRegValue(kSimd128Precision);
- int vm = instr->VFPMRegValue(kDoublePrecision);
- int index = instr->Bit(19);
- uint32_t s_data = get_s_register(vm * 2 + index);
- uint32_t q_data[4];
- for (int i = 0; i < 4; i++) q_data[i] = s_data;
- set_q_register(vd, q_data);
- } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
- instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
- // vmvn Qd, Qm.
- int vd = instr->VFPDRegValue(kSimd128Precision);
- int vm = instr->VFPMRegValue(kSimd128Precision);
- uint32_t q_data[4];
- get_q_register(vm, q_data);
- for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i];
- set_q_register(vd, q_data);
- } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
- instr->Bit(4) == 0x0) {
- // vtb[l,x] Dd, <list>, Dm.
- int vd = instr->VFPDRegValue(kDoublePrecision);
- int vn = instr->VFPNRegValue(kDoublePrecision);
- int vm = instr->VFPMRegValue(kDoublePrecision);
- int table_len = (instr->Bits(9, 8) + 1) * kDoubleSize;
- bool vtbx = instr->Bit(6) != 0; // vtbl / vtbx
- uint64_t destination = 0, indices = 0, result = 0;
- get_d_register(vd, &destination);
- get_d_register(vm, &indices);
- for (int i = 0; i < kDoubleSize; i++) {
- int shift = i * kBitsPerByte;
- int index = (indices >> shift) & 0xFF;
- if (index < table_len) {
- uint64_t table;
- get_d_register(vn + index / kDoubleSize, &table);
- result |= ((table >> ((index % kDoubleSize) * kBitsPerByte)) & 0xFF)
- << shift;
- } else if (vtbx) {
- result |= destination & (0xFFull << shift);
+ uint32_t q_data[4];
+ get_q_register(vm, q_data);
+ for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i];
+ set_q_register(vd, q_data);
+ } else if (instr->Bits(11, 10) == 0x2) {
+ // vtb[l,x] Dd, <list>, Dm.
+ int vd = instr->VFPDRegValue(kDoublePrecision);
+ int vn = instr->VFPNRegValue(kDoublePrecision);
+ int vm = instr->VFPMRegValue(kDoublePrecision);
+ int table_len = (instr->Bits(9, 8) + 1) * kDoubleSize;
+ bool vtbx = instr->Bit(6) != 0; // vtbl / vtbx
+ uint64_t destination = 0, indices = 0, result = 0;
+ get_d_register(vd, &destination);
+ get_d_register(vm, &indices);
+ for (int i = 0; i < kDoubleSize; i++) {
+ int shift = i * kBitsPerByte;
+ int index = (indices >> shift) & 0xFF;
+ if (index < table_len) {
+ uint64_t table;
+ get_d_register(vn + index / kDoubleSize, &table);
+ result |=
+ ((table >> ((index % kDoubleSize) * kBitsPerByte)) & 0xFF)
+ << shift;
+ } else if (vtbx) {
+ result |= destination & (0xFFull << shift);
+ }
}
+ set_d_register(vd, &result);
+ } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x7) {
+ // vzip.<size> Qd, Qm.
+ int size = static_cast<NeonSize>(instr->Bits(19, 18));
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ uint32_t src1[4], src2[4], dst1[4], dst2[4];
+ get_q_register(Vd, src1);
+ get_q_register(Vm, src2);
+ switch (size) {
+ case Neon8: {
+ uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
+ uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
+ uint8_t* d1 = reinterpret_cast<uint8_t*>(dst1);
+ uint8_t* d2 = reinterpret_cast<uint8_t*>(dst2);
+ for (int i = 0; i < 8; i++) {
+ d1[i * 2] = s1[i];
+ d1[i * 2 + 1] = s2[i];
+ d2[i * 2] = s1[i + 8];
+ d2[i * 2 + 1] = s2[i + 8];
+ }
+ break;
+ }
+ case Neon16: {
+ uint16_t s1[8], s2[8], d1[8], d2[8];
+ memcpy(s1, src1, sizeof(s1));
+ memcpy(s2, src2, sizeof(s2));
+ for (int i = 0; i < 8; i += 2) {
+ d1[i] = s1[i / 2];
+ d1[i + 1] = s2[i / 2];
+ d2[i] = s1[i / 2 + 4];
+ d2[i + 1] = s2[i / 2 + 4];
+ }
+ memcpy(dst1, d1, sizeof(dst1));
+ memcpy(dst2, d2, sizeof(dst2));
+ break;
+ }
+ case Neon32: {
+ for (int i = 0; i < 2; i++) {
+ dst1[i * 2] = src1[i];
+ dst1[i * 2 + 1] = src2[i];
+ dst2[i * 2] = src1[i + 2];
+ dst2[i * 2 + 1] = src2[i + 2];
+ }
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
+ set_q_register(Vd, dst1);
+ set_q_register(Vm, dst2);
+ } else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) {
+ // vrev<op>.<size> Qd, Qm.
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int size = static_cast<NeonSize>(instr->Bits(19, 18));
+ NeonSize op = static_cast<NeonSize>(static_cast<int>(Neon64) -
+ instr->Bits(8, 7));
+ uint32_t src[4];
+ get_q_register(Vm, src);
+ switch (op) {
+ case Neon16: {
+ DCHECK_EQ(Neon8, size);
+ uint8_t* s = reinterpret_cast<uint8_t*>(src);
+ for (int i = 0; i < 16; i += 2) {
+ std::swap(s[i], s[i + 1]);
+ }
+ break;
+ }
+ case Neon32: {
+ switch (size) {
+ case Neon16:
+ for (int i = 0; i < 4; i++) {
+ src[i] = (src[i] >> 16) | (src[i] << 16);
+ }
+ break;
+ case Neon8: {
+ uint8_t* s = reinterpret_cast<uint8_t*>(src);
+ for (int i = 0; i < 4; i++) {
+ std::swap(s[i * 4], s[i * 4 + 3]);
+ std::swap(s[i * 4 + 1], s[i * 4 + 2]);
+ }
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
+ break;
+ }
+ case Neon64: {
+ switch (size) {
+ case Neon32: {
+ std::swap(src[0], src[1]);
+ std::swap(src[2], src[3]);
+ break;
+ }
+ case Neon16: {
+ for (int i = 0; i <= 2; i += 2) {
+ uint32_t w1 = src[i];
+ uint32_t w2 = src[i + 1];
+ src[i] = (w2 >> 16) | (w2 << 16);
+ src[i + 1] = (w1 >> 16) | (w1 << 16);
+ }
+ break;
+ }
+ case Neon8: {
+ uint8_t* s = reinterpret_cast<uint8_t*>(src);
+ for (int i = 0; i < 4; i++) {
+ std::swap(s[i], s[7 - i]);
+ std::swap(s[i + 8], s[15 - i]);
+ }
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
+ set_q_register(Vd, src);
+ } else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) {
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ int size = static_cast<NeonSize>(instr->Bits(19, 18));
+ uint32_t src[4];
+ get_q_register(Vm, src);
+ if (instr->Bits(9, 6) == 0xd) {
+ // vabs<type>.<size> Qd, Qm
+ if (instr->Bit(10) != 0) {
+ // floating point (clear sign bits)
+ for (int i = 0; i < 4; i++) {
+ src[i] &= ~0x80000000;
+ }
+ } else {
+ // signed integer
+ switch (size) {
+ case Neon8: {
+ int8_t* s = reinterpret_cast<int8_t*>(src);
+ for (int i = 0; i < 16; i++) {
+ s[i] = std::abs(s[i]);
+ }
+ break;
+ }
+ case Neon16: {
+ int16_t s[8];
+ memcpy(s, src, sizeof(s));
+ for (int i = 0; i < 8; i++) {
+ s[i] = std::abs(s[i]);
+ }
+ memcpy(src, s, sizeof(src));
+ break;
+ }
+ case Neon32: {
+ int32_t* as_signed = reinterpret_cast<int32_t*>(src);
+ for (int i = 0; i < 4; i++) {
+ as_signed[i] = std::abs(as_signed[i]);
+ }
+ break;
+ }
+ default:
+ UNIMPLEMENTED();
+ break;
+ }
+ }
+ } else if (instr->Bits(9, 6) == 0xf) {
+ // vneg<type>.<size> Qd, Qm.
+ if (instr->Bit(10) != 0) {
+ // floating point (toggle sign bits)
+ for (int i = 0; i < 4; i++) {
+ src[i] ^= 0x80000000;
+ }
+ } else {
+ // signed integer
+ switch (size) {
+ case Neon8: {
+ int8_t* s = reinterpret_cast<int8_t*>(src);
+ for (int i = 0; i < 16; i++) {
+ s[i] = -s[i];
+ }
+ break;
+ }
+ case Neon16:
+ int16_t s[8];
+ memcpy(s, src, sizeof(s));
+ for (int i = 0; i < 8; i++) {
+ s[i] = -s[i];
+ }
+ memcpy(src, s, sizeof(src));
+ break;
+ case Neon32: {
+ int32_t* as_signed = reinterpret_cast<int32_t*>(src);
+ for (int i = 0; i < 4; i++) {
+ as_signed[i] = -as_signed[i];
+ }
+ break;
+ }
+ default:
+ UNIMPLEMENTED();
+ break;
+ }
+ }
+ } else {
+ UNIMPLEMENTED();
+ }
+ set_q_register(Vd, src);
+ } else {
+ UNIMPLEMENTED();
}
- set_d_register(vd, &result);
- } else {
- UNIMPLEMENTED();
}
break;
case 8:
diff --git a/test/cctest/test-assembler-arm.cc b/test/cctest/test-assembler-arm.cc
index 7873714..95141b7 100644
--- a/test/cctest/test-assembler-arm.cc
+++ b/test/cctest/test-assembler-arm.cc
@@ -1221,6 +1221,18 @@
CHECK_EQ(kArmNanLower32, bit_cast<int64_t>(t.div_result) & 0xffffffffu);
}
+#define CHECK_EQ_SPLAT(field, ex) \
+ CHECK_EQ(ex, t.field[0]); \
+ CHECK_EQ(ex, t.field[1]); \
+ CHECK_EQ(ex, t.field[2]); \
+ CHECK_EQ(ex, t.field[3]);
+
+#define CHECK_EQ_32X4(field, ex0, ex1, ex2, ex3) \
+ CHECK_EQ(ex0, t.field[0]); \
+ CHECK_EQ(ex1, t.field[1]); \
+ CHECK_EQ(ex2, t.field[2]); \
+ CHECK_EQ(ex3, t.field[3]);
+
#define INT32_TO_FLOAT(val) \
std::round(static_cast<float>(bit_cast<int32_t>(val)))
#define UINT32_TO_FLOAT(val) \
@@ -1259,28 +1271,39 @@
uint32_t dstA5;
uint32_t dstA6;
uint32_t dstA7;
+ uint32_t lane_test[4];
uint64_t vmov_to_scalar1, vmov_to_scalar2;
uint32_t vmov_from_scalar_s8, vmov_from_scalar_u8;
uint32_t vmov_from_scalar_s16, vmov_from_scalar_u16;
uint32_t vmov_from_scalar_32;
- uint32_t vmov_src[4], vmov_dst[4], vmvn[4];
+ uint32_t vmov[4], vmvn[4];
int32_t vcvt_s32_f32[4];
uint32_t vcvt_u32_f32[4];
float vcvt_f32_s32[4], vcvt_f32_u32[4];
- uint32_t vdup1[4], vdup2[4], vdup3[4], vdup4[4];
+ uint32_t vdup8[4], vdup16[4], vdup32[4];
+ float vabsf[4], vnegf[4];
+ uint32_t vabs_s8[4], vabs_s16[4], vabs_s32[4];
+ uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4];
uint32_t veor[4];
+ float vdupf[4], vaddf[4], vsubf[4], vmulf[4];
uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4];
- uint32_t vtst[4], vceq[4], vbsl[4], vtbl[2], vtbx[2];
- float vaddf[4], vsubf[4];
+ uint32_t vmul8[4], vmul16[4], vmul32[4];
+ uint32_t vtst[4], vceq[4], vbsl[4];
+ uint32_t vext[4];
+ uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
+ vzip32b[4];
+ uint32_t vrev64_32[4], vrev64_16[4], vrev64_8[4];
+ uint32_t vrev32_16[4], vrev32_8[4];
+ uint32_t vrev16_8[4];
+ uint32_t vtbl[2], vtbx[2];
} T;
T t;
// Create a function that accepts &t, and loads, manipulates, and stores
- // the doubles and floats.
+ // the doubles, floats, and SIMD values.
Assembler assm(isolate, NULL, 0);
-
if (CpuFeatures::IsSupported(NEON)) {
CpuFeatureScope scope(&assm, NEON);
@@ -1306,7 +1329,7 @@
__ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
// ARM core register to scalar.
- __ mov(r4, Operand(0xFFFFFFF8));
+ __ mov(r4, Operand(0xfffffff8));
__ vmov(d0, 0);
__ vmov(NeonS8, d0, 1, r4);
__ vmov(NeonS16, d0, 1, r4);
@@ -1318,8 +1341,8 @@
__ vstr(d0, r0, offsetof(T, vmov_to_scalar2));
// Scalar to ARM core register.
- __ mov(r4, Operand(0xFFFFFF00));
- __ mov(r5, Operand(0xFFFFFFFF));
+ __ mov(r4, Operand(0xffffff00));
+ __ mov(r5, Operand(0xffffffff));
__ vmov(d0, r4, r5);
__ vmov(NeonS8, r4, d0, 1);
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_s8)));
@@ -1333,15 +1356,15 @@
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_32)));
// vmov for q-registers.
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov_src))));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
__ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ vmov(q1, q0);
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov_dst))));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmvn.
- __ mov(r4, Operand(0xFF));
- __ vdup(Neon16, q0, r4);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+ __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ vmvn(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmvn))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
@@ -1370,23 +1393,64 @@
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_f32_u32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
- // int vdup.
+ // vdup (integer).
__ mov(r4, Operand(0xa));
__ vdup(Neon8, q0, r4);
__ vdup(Neon16, q1, r4);
__ vdup(Neon32, q2, r4);
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup1))));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup8))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup2))));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup16))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup3))));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
- // float vdup.
+
+ // vdup (float).
__ vmov(s0, -1.0);
__ vdup(q0, s0);
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup4))));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdupf))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ // vabs (float).
+ __ vmov(s0, -1.0);
+ __ vmov(s1, -0.0);
+ __ vmov(s2, 0.0);
+ __ vmov(s3, 1.0);
+ __ vabs(q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabsf))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ // vneg (float).
+ __ vneg(q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vnegf))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+ // vabs (integer).
+ __ mov(r4, Operand(0x7f7f7f7f));
+ __ mov(r5, Operand(0x01010101));
+ __ vmov(d0, r4, r5);
+ __ mov(r4, Operand(0xffffffff));
+ __ mov(r5, Operand(0x80808080));
+ __ vmov(d1, r4, r5);
+ __ vabs(Neon8, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabs_s8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vabs(Neon16, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabs_s16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vabs(Neon32, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vabs_s32))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ // vneg (integer).
+ __ vneg(Neon8, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vneg_s8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vneg(Neon16, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vneg_s16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vneg(Neon32, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vneg_s32))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
// veor.
__ mov(r4, Operand(0x00aa));
__ vdup(Neon16, q0, r4);
@@ -1396,7 +1460,30 @@
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, veor))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
- // vadd(integer).
+ // vadd (float).
+ __ vmov(s4, 1.0);
+ __ vdup(q0, s4);
+ __ vdup(q1, s4);
+ __ vadd(q1, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ // vsub (float).
+ __ vmov(s4, 2.0);
+ __ vdup(q0, s4);
+ __ vmov(s4, 1.0);
+ __ vdup(q1, s4);
+ __ vsub(q1, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ // vmul (float).
+ __ vmov(s4, 2.0);
+ __ vdup(q0, s4);
+ __ vdup(q1, s4);
+ __ vmul(q1, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+ // vadd (integer).
__ mov(r4, Operand(0x81));
__ vdup(Neon8, q0, r4);
__ mov(r4, Operand(0x82));
@@ -1419,44 +1506,44 @@
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vadd32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
- // vadd(float).
- __ vmov(s4, 1.0);
- __ vdup(q0, s4);
- __ vdup(q1, s4);
- __ vadd(q1, q1, q0);
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
- __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
-
- // vsub(integer).
+ // vsub (integer).
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q0, r4);
- __ mov(r4, Operand(0x02));
+ __ mov(r4, Operand(0x03));
__ vdup(Neon8, q1, r4);
__ vsub(Neon8, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(0x0001));
__ vdup(Neon16, q0, r4);
- __ mov(r4, Operand(0x0002));
+ __ mov(r4, Operand(0x0003));
__ vdup(Neon16, q1, r4);
__ vsub(Neon16, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub16))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(0x00000001));
__ vdup(Neon32, q0, r4);
- __ mov(r4, Operand(0x00000002));
+ __ mov(r4, Operand(0x00000003));
__ vdup(Neon32, q1, r4);
__ vsub(Neon32, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
- // vsub(float).
- __ vmov(s4, 2.0);
- __ vdup(q0, s4);
- __ vmov(s4, 1.0);
- __ vdup(q1, s4);
- __ vsub(q1, q1, q0);
- __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
+ // vmul (integer).
+ __ mov(r4, Operand(0x02));
+ __ vdup(Neon8, q0, r4);
+ __ vmul(Neon8, q1, q0, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ mov(r4, Operand(0x0002));
+ __ vdup(Neon16, q0, r4);
+ __ vmul(Neon16, q1, q0, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ mov(r4, Operand(0x00000002));
+ __ vdup(Neon32, q0, r4);
+ __ vmul(Neon32, q1, q0, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vceq.
@@ -1488,6 +1575,62 @@
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vbsl))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ // vext.
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+ __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ vmov(q1, q0);
+ __ vext(q2, q0, q1, 3);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vext))));
+ __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+ // vzip.
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+ __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ vmov(q1, q0);
+ __ vzip(Neon8, q0, q1);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip8a))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip8b))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+ __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ vmov(q1, q0);
+ __ vzip(Neon16, q0, q1);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip16a))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip16b))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+ __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ vmov(q1, q0);
+ __ vzip(Neon32, q0, q1);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip32a))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vzip32b))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+ // vrev64/32/16
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, lane_test))));
+ __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+ __ vrev64(Neon32, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev64_32))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vrev64(Neon16, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev64_16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vrev64(Neon8, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev64_8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vrev32(Neon16, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev32_16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vrev32(Neon8, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev32_8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vrev16(Neon8, q1, q0);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrev16_8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
// vtb[l/x].
__ mov(r4, Operand(0x06040200));
__ mov(r5, Operand(0xff050301));
@@ -1535,8 +1678,10 @@
t.dstA5 = 0;
t.dstA6 = 0;
t.dstA7 = 0;
- t.vmov_src[0] = t.vmov_src[1] = t.vmov_src[2] = t.vmov_src[3] = 1;
- t.vmov_dst[0] = t.vmov_dst[1] = t.vmov_dst[2] = t.vmov_dst[3] = 0;
+ t.lane_test[0] = 0x03020100;
+ t.lane_test[1] = 0x07060504;
+ t.lane_test[2] = 0x0b0a0908;
+ t.lane_test[3] = 0x0f0e0d0c;
Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
USE(dummy);
@@ -1556,6 +1701,7 @@
CHECK_EQ(0x00410042u, t.dstA5);
CHECK_EQ(0x00830084u, t.dstA6);
CHECK_EQ(0x00810082u, t.dstA7);
+
CHECK_EQ(0xfffffff8fff8f800u, t.vmov_to_scalar1);
CHECK_EQ(0xfff80000f8000000u, t.vmov_to_scalar2);
CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s8);
@@ -1563,46 +1709,73 @@
CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s16);
CHECK_EQ(0xFFFFu, t.vmov_from_scalar_u16);
CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_32);
- CHECK_EQ(1u, t.vmov_dst[0]);
- CHECK_EQ(1u, t.vmov_dst[1]);
- CHECK_EQ(1u, t.vmov_dst[2]);
- CHECK_EQ(1u, t.vmov_dst[3]);
- CHECK_EQ(-1, t.vcvt_s32_f32[0]);
- CHECK_EQ(-1, t.vcvt_s32_f32[1]);
- CHECK_EQ(1, t.vcvt_s32_f32[2]);
- CHECK_EQ(1, t.vcvt_s32_f32[3]);
- CHECK_EQ(0u, t.vcvt_u32_f32[0]);
- CHECK_EQ(0u, t.vcvt_u32_f32[1]);
- CHECK_EQ(1u, t.vcvt_u32_f32[2]);
- CHECK_EQ(1u, t.vcvt_u32_f32[3]);
+ CHECK_EQ_32X4(vmov, 0x03020100u, 0x07060504u, 0x0b0a0908u, 0x0f0e0d0cu);
+ CHECK_EQ_32X4(vmvn, 0xfcfdfeffu, 0xf8f9fafbu, 0xf4f5f6f7u, 0xf0f1f2f3u);
+
+ CHECK_EQ_SPLAT(vdup8, 0x0a0a0a0au);
+ CHECK_EQ_SPLAT(vdup16, 0x000a000au);
+ CHECK_EQ_SPLAT(vdup32, 0x0000000au);
+ CHECK_EQ_SPLAT(vdupf, -1.0);
+
+ // src: [-1, -1, 1, 1]
+ CHECK_EQ_32X4(vcvt_s32_f32, -1, -1, 1, 1);
+ CHECK_EQ_32X4(vcvt_u32_f32, 0u, 0u, 1u, 1u);
// src: [kMinInt, kMaxInt, kMaxUInt32, kMinInt + 1]
- CHECK_EQ(INT32_TO_FLOAT(kMinInt), t.vcvt_f32_s32[0]);
- CHECK_EQ(INT32_TO_FLOAT(kMaxInt), t.vcvt_f32_s32[1]);
- CHECK_EQ(INT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_s32[2]);
- CHECK_EQ(INT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_s32[3]);
- CHECK_EQ(UINT32_TO_FLOAT(kMinInt), t.vcvt_f32_u32[0]);
- CHECK_EQ(UINT32_TO_FLOAT(kMaxInt), t.vcvt_f32_u32[1]);
- CHECK_EQ(UINT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_u32[2]);
- CHECK_EQ(UINT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_u32[3]);
+ CHECK_EQ_32X4(vcvt_f32_s32, INT32_TO_FLOAT(kMinInt),
+ INT32_TO_FLOAT(kMaxInt), INT32_TO_FLOAT(kMaxUInt32),
+ INT32_TO_FLOAT(kMinInt + 1));
+ CHECK_EQ_32X4(vcvt_f32_u32, UINT32_TO_FLOAT(kMinInt),
+ UINT32_TO_FLOAT(kMaxInt), UINT32_TO_FLOAT(kMaxUInt32),
+ UINT32_TO_FLOAT(kMinInt + 1));
- for (int i = 0; i < 4; i++) CHECK_EQ(0xFF00FF00, t.vmvn[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x0a0a0a0au, t.vdup1[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x000a000au, t.vdup2[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x0000000au, t.vdup3[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0xbf800000u, t.vdup4[i]); // -1.0f
- for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.veor[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(2.0, t.vaddf[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x03030303u, t.vadd8[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x00030003u, t.vadd16[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x00000003u, t.vadd32[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(-1.0, t.vsubf[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub8[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub16[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub32[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vceq[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vtst[i]);
- for (int i = 0; i < 4; i++) CHECK_EQ(0x02010201u, t.vbsl[i]);
+ CHECK_EQ_32X4(vabsf, 1.0, 0.0, 0.0, 1.0);
+ CHECK_EQ_32X4(vnegf, 1.0, 0.0, -0.0, -1.0);
+ // src: [0x7f7f7f7f, 0x01010101, 0xffffffff, 0x80808080]
+ CHECK_EQ_32X4(vabs_s8, 0x7f7f7f7fu, 0x01010101u, 0x01010101u, 0x80808080u);
+ CHECK_EQ_32X4(vabs_s16, 0x7f7f7f7fu, 0x01010101u, 0x00010001u, 0x7f807f80u);
+ CHECK_EQ_32X4(vabs_s32, 0x7f7f7f7fu, 0x01010101u, 0x00000001u, 0x7f7f7f80u);
+ CHECK_EQ_32X4(vneg_s8, 0x81818181u, 0xffffffffu, 0x01010101u, 0x80808080u);
+ CHECK_EQ_32X4(vneg_s16, 0x80818081u, 0xfefffeffu, 0x00010001u, 0x7f807f80u);
+ CHECK_EQ_32X4(vneg_s32, 0x80808081u, 0xfefefeffu, 0x00000001u, 0x7f7f7f80u);
+
+ CHECK_EQ_SPLAT(veor, 0x00ff00ffu);
+ CHECK_EQ_SPLAT(vaddf, 2.0);
+ CHECK_EQ_SPLAT(vsubf, -1.0);
+ CHECK_EQ_SPLAT(vmulf, 4.0);
+ CHECK_EQ_SPLAT(vadd8, 0x03030303u);
+ CHECK_EQ_SPLAT(vadd16, 0x00030003u);
+ CHECK_EQ_SPLAT(vadd32, 0x00000003u);
+ CHECK_EQ_SPLAT(vsub8, 0xfefefefeu);
+ CHECK_EQ_SPLAT(vsub16, 0xfffefffeu);
+ CHECK_EQ_SPLAT(vsub32, 0xfffffffeu);
+ CHECK_EQ_SPLAT(vmul8, 0x04040404u);
+ CHECK_EQ_SPLAT(vmul16, 0x00040004u);
+ CHECK_EQ_SPLAT(vmul32, 0x00000004u);
+ CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
+ CHECK_EQ_SPLAT(vtst, 0x00ff00ffu);
+ CHECK_EQ_SPLAT(vbsl, 0x02010201u);
+
+ CHECK_EQ_32X4(vext, 0x06050403u, 0x0a090807u, 0x0e0d0c0bu, 0x0201000fu);
+
+ CHECK_EQ_32X4(vzip8a, 0x01010000u, 0x03030202u, 0x05050404u, 0x07070606u);
+ CHECK_EQ_32X4(vzip8b, 0x09090808u, 0x0b0b0a0au, 0x0d0d0c0cu, 0x0f0f0e0eu);
+ CHECK_EQ_32X4(vzip16a, 0x01000100u, 0x03020302u, 0x05040504u, 0x07060706u);
+ CHECK_EQ_32X4(vzip16b, 0x09080908u, 0x0b0a0b0au, 0x0d0c0d0cu, 0x0f0e0f0eu);
+ CHECK_EQ_32X4(vzip32a, 0x03020100u, 0x03020100u, 0x07060504u, 0x07060504u);
+ CHECK_EQ_32X4(vzip32b, 0x0b0a0908u, 0x0b0a0908u, 0x0f0e0d0cu, 0x0f0e0d0cu);
+
+ // src: 0 1 2 3 4 5 6 7 8 9 a b c d e f (little endian)
+ CHECK_EQ_32X4(vrev64_32, 0x07060504u, 0x03020100u, 0x0f0e0d0cu,
+ 0x0b0a0908u);
+ CHECK_EQ_32X4(vrev64_16, 0x05040706u, 0x01000302u, 0x0d0c0f0eu,
+ 0x09080b0au);
+ CHECK_EQ_32X4(vrev64_8, 0x04050607u, 0x00010203u, 0x0c0d0e0fu, 0x08090a0bu);
+ CHECK_EQ_32X4(vrev32_16, 0x01000302u, 0x05040706u, 0x09080b0au,
+ 0x0d0c0f0eu);
+ CHECK_EQ_32X4(vrev32_8, 0x00010203u, 0x04050607u, 0x08090a0bu, 0x0c0d0e0fu);
+ CHECK_EQ_32X4(vrev16_8, 0x02030001u, 0x06070405u, 0x0a0b0809u, 0x0e0f0c0du);
+
CHECK_EQ(0x05010400u, t.vtbl[0]);
CHECK_EQ(0x00030602u, t.vtbl[1]);
CHECK_EQ(0x05010400u, t.vtbx[0]);
@@ -1610,7 +1783,6 @@
}
}
-
TEST(16) {
// Test the pkh, uxtb, uxtab and uxtb16 instructions.
CcTest::InitializeVM();
diff --git a/test/cctest/test-disasm-arm.cc b/test/cctest/test-disasm-arm.cc
index 75cd843..e594ab4 100644
--- a/test/cctest/test-disasm-arm.cc
+++ b/test/cctest/test-disasm-arm.cc
@@ -1001,6 +1001,22 @@
"f3fbe742 vcvt.s32.f32 q15, q1");
COMPARE(vcvt_u32_f32(q8, q9),
"f3fb07e2 vcvt.u32.f32 q8, q9");
+ COMPARE(vabs(q0, q1),
+ "f3b90742 vabs.f32 q0, q1");
+ COMPARE(vabs(Neon8, q6, q7),
+ "f3b1c34e vabs.s8 q6, q7");
+ COMPARE(vabs(Neon16, q0, q1),
+ "f3b50342 vabs.s16 q0, q1");
+ COMPARE(vabs(Neon32, q0, q1),
+ "f3b90342 vabs.s32 q0, q1");
+ COMPARE(vneg(q0, q1),
+ "f3b907c2 vneg.f32 q0, q1");
+ COMPARE(vneg(Neon8, q6, q7),
+ "f3b1c3ce vneg.s8 q6, q7");
+ COMPARE(vneg(Neon16, q0, q1),
+ "f3b503c2 vneg.s16 q0, q1");
+ COMPARE(vneg(Neon32, q0, q1),
+ "f3b903c2 vneg.s32 q0, q1");
COMPARE(veor(d0, d1, d2),
"f3010112 veor d0, d1, d2");
COMPARE(veor(d0, d30, d31),
@@ -1025,6 +1041,14 @@
"f3142860 vsub.i16 q1, q2, q8");
COMPARE(vsub(Neon32, q15, q0, q8),
"f360e860 vsub.i32 q15, q0, q8");
+ COMPARE(vmul(q0, q1, q2),
+ "f3020d54 vmul.f32 q0, q1, q2");
+ COMPARE(vmul(Neon8, q0, q1, q2),
+ "f2020954 vmul.i8 q0, q1, q2");
+ COMPARE(vmul(Neon16, q1, q2, q8),
+ "f2142970 vmul.i16 q1, q2, q8");
+ COMPARE(vmul(Neon32, q15, q0, q8),
+ "f260e970 vmul.i32 q15, q0, q8");
COMPARE(vtst(Neon8, q0, q1, q2),
"f2020854 vtst.i8 q0, q1, q2");
COMPARE(vtst(Neon16, q1, q2, q8),
@@ -1041,6 +1065,12 @@
"f3120154 vbsl q0, q1, q2");
COMPARE(vbsl(q15, q0, q8),
"f350e170 vbsl q15, q0, q8");
+ COMPARE(vext(q15, q0, q8, 3),
+ "f2f0e360 vext.8 q15, q0, q8, #3");
+ COMPARE(vzip(Neon16, q15, q0),
+ "f3f6e1c0 vzip.16 q15, q0");
+ COMPARE(vrev64(Neon8, q15, q0),
+ "f3f0e040 vrev64.8 q15, q0");
COMPARE(vtbl(d0, NeonListOperand(d1, 1), d2),
"f3b10802 vtbl.8 d0, {d1}, d2");
COMPARE(vtbl(d31, NeonListOperand(d0, 2), d4),
diff --git a/test/cctest/test-macro-assembler-arm.cc b/test/cctest/test-macro-assembler-arm.cc
index 6b69296..63919a4 100644
--- a/test/cctest/test-macro-assembler-arm.cc
+++ b/test/cctest/test-macro-assembler-arm.cc
@@ -379,4 +379,115 @@
}
}
+#define CHECK_EQ_32X4(field, v0, v1, v2, v3) \
+ CHECK_EQ(v0, t.field[0]); \
+ CHECK_EQ(v1, t.field[1]); \
+ CHECK_EQ(v2, t.field[2]); \
+ CHECK_EQ(v3, t.field[3]);
+
+TEST(Swizzle) {
+ if (!CpuFeatures::IsSupported(NEON)) return;
+
+ // Allocate an executable page of memory.
+ size_t actual_size;
+ byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
+ Assembler::kMinimalBufferSize, &actual_size, true));
+ CHECK(buffer);
+ Isolate* isolate = CcTest::i_isolate();
+ HandleScope handles(isolate);
+ MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
+ v8::internal::CodeObjectRequired::kYes);
+ MacroAssembler* masm = &assembler; // Create a pointer for the __ macro.
+
+ typedef struct {
+ int32_t _32x4_3210[4]; // identity
+ int32_t _32x4_1032[4]; // high / low swap
+ int32_t _32x4_0000[4]; // vdup's
+ int32_t _32x4_1111[4];
+ int32_t _32x4_2222[4];
+ int32_t _32x4_3333[4];
+ int32_t _32x4_2103[4]; // rotate left
+ int32_t _32x4_0321[4]; // rotate right
+ int32_t _32x4_1132[4]; // irregular
+ int32_t _32x4_1132_in_place[4]; // irregular, in-place
+ } T;
+ T t;
+
+ __ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());
+
+ const Register kScratch = r5;
+
+ // Make test vector [0, 1, 2, 3]
+ __ veor(q1, q1, q1); // Zero
+ for (int i = 0; i < 4; i++) {
+ __ mov(r4, Operand(i));
+ __ ReplaceLane(q1, q1, r4, NeonS32, i);
+ }
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x3210);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_3210))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x1032);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_1032))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x0000);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_0000))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x1111);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_1111))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x2222);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_2222))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x3333);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_3333))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x2103);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_2103))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x0321);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_0321))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ Swizzle(q0, q1, kScratch, Neon32, 0x1132);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, _32x4_1132))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ vmov(q0, q1);
+ __ Swizzle(q0, q0, kScratch, Neon32, 0x1132);
+ __ add(r4, r0,
+ Operand(static_cast<int32_t>(offsetof(T, _32x4_1132_in_place))));
+ __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+ __ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());
+
+ CodeDesc desc;
+ masm->GetCode(&desc);
+ Handle<Code> code = isolate->factory()->NewCode(
+ desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
+#ifdef DEBUG
+ OFStream os(stdout);
+ code->Print(os);
+#endif
+ F3 f = FUNCTION_CAST<F3>(code->entry());
+ Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
+ USE(dummy);
+ CHECK_EQ_32X4(_32x4_3210, 0, 1, 2, 3);
+ CHECK_EQ_32X4(_32x4_1032, 2, 3, 0, 1);
+ CHECK_EQ_32X4(_32x4_0000, 0, 0, 0, 0);
+ CHECK_EQ_32X4(_32x4_1111, 1, 1, 1, 1);
+ CHECK_EQ_32X4(_32x4_2222, 2, 2, 2, 2);
+ CHECK_EQ_32X4(_32x4_3333, 3, 3, 3, 3);
+ CHECK_EQ_32X4(_32x4_2103, 3, 0, 1, 2);
+ CHECK_EQ_32X4(_32x4_0321, 1, 2, 3, 0);
+ CHECK_EQ_32X4(_32x4_1132, 2, 3, 1, 1);
+ CHECK_EQ_32X4(_32x4_1132_in_place, 2, 3, 1, 1);
+}
+
#undef __