[ARM] Add Neon shift instructions vshl, vshr.
LOG=N
BUG=v8:4124
Review-Url: https://codereview.chromium.org/2629223005
Cr-Commit-Position: refs/heads/master@{#42610}
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index 6fef17b..52213b0 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -4412,6 +4412,48 @@
emit(EncodeNeonBinOp(VMAX, dt, dst, src1, src2));
}
+enum NeonShiftOp { VSHL, VSHR };
+
+static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonDataType dt,
+ QwNeonRegister dst, QwNeonRegister src,
+ int shift) {
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ src.split_code(&vm, &m);
+ int size_in_bits = kBitsPerByte << NeonSz(dt);
+ int op_encoding = 0;
+ int imm6 = 0;
+ if (op == VSHL) {
+ DCHECK(shift >= 0 && size_in_bits > shift);
+ imm6 = size_in_bits + shift;
+ op_encoding = 0x5 * B8;
+ } else {
+ DCHECK_EQ(VSHR, op);
+ DCHECK(shift > 0 && size_in_bits >= shift);
+ imm6 = 2 * size_in_bits - shift;
+ op_encoding = NeonU(dt) * B24;
+ }
+ return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | B6 | m * B5 | B4 |
+ vm | op_encoding;
+}
+
+void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
+ int shift) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vshl(Qm, bits) SIMD shift left immediate.
+ // Instruction details available in ARM DDI 0406C.b, A8-1046.
+ emit(EncodeNeonShiftOp(VSHL, dt, dst, src, shift));
+}
+
+void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
+ int shift) {
+ DCHECK(IsEnabled(NEON));
+  // Qd = vshr(Qm, bits) SIMD shift right immediate.
+ // Instruction details available in ARM DDI 0406C.b, A8-1052.
+ emit(EncodeNeonShiftOp(VSHR, dt, dst, src, shift));
+}
+
static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
QwNeonRegister src) {
int vd, d;
diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
index 6390300..d4f9402 100644
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1387,6 +1387,8 @@
void vmax(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vmax(NeonDataType dt, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2);
+ void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
+ void vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
// vrecpe and vrsqrte only support floating point lanes.
void vrecpe(QwNeonRegister dst, QwNeonRegister src);
void vrsqrte(QwNeonRegister dst, QwNeonRegister src);
diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc
index db32fc9..7d61845 100644
--- a/src/arm/disasm-arm.cc
+++ b/src/arm/disasm-arm.cc
@@ -1973,6 +1973,24 @@
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vext.8 q%d, q%d, q%d, #%d",
Vd, Vn, Vm, imm4);
+ } else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
+ // vshl.i<size> Qd, Qm, shift
+ int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+ int shift = instr->Bits(21, 16) - size;
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vshl.i%d q%d, q%d, #%d",
+ size, Vd, Vm, shift);
+ } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+ // vshr.s<size> Qd, Qm, shift
+ int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+ int shift = 2 * size - instr->Bits(21, 16);
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.s%d q%d, q%d, #%d",
+ size, Vd, Vm, shift);
} else {
Unknown(instr);
}
@@ -2162,15 +2180,24 @@
Unknown(instr);
}
} else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
+ // vrecpe/vrsqrte.f32 Qd, Qm.
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
- // vrecpe/vrsqrte.f32 Qd, Qm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d", op, Vd, Vm);
} else {
Unknown(instr);
}
+ } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+ // vshr.u<size> Qd, Qm, shift
+ int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+ int shift = 2 * size - instr->Bits(21, 16);
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ out_buffer_pos_ +=
+ SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.u%d q%d, q%d, #%d",
+ size, Vd, Vm, shift);
} else {
Unknown(instr);
}
diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
index e22a8b9..ad7c4da 100644
--- a/src/arm/simulator-arm.cc
+++ b/src/arm/simulator-arm.cc
@@ -573,7 +573,6 @@
return start_page == end_page;
}
-
void Simulator::set_last_debugger_input(char* input) {
DeleteArray(last_debugger_input_);
last_debugger_input_ = input;
@@ -4355,6 +4354,84 @@
dst[i] = src2[i - boundary];
}
set_q_register(Vd, dst);
+ } else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
+ // vshl.i<size> Qd, Qm, shift
+ int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+ int shift = instr->Bits(21, 16) - size;
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ NeonSize ns = static_cast<NeonSize>(size / 16);
+ switch (ns) {
+ case Neon8: {
+ uint8_t src[16];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 16; i++) {
+ src[i] <<= shift;
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ case Neon16: {
+ uint16_t src[8];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 8; i++) {
+ src[i] <<= shift;
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ case Neon32: {
+ uint32_t src[4];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 4; i++) {
+ src[i] <<= shift;
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
+ } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+ // vshr.s<size> Qd, Qm, shift
+ int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+ int shift = 2 * size - instr->Bits(21, 16);
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ NeonSize ns = static_cast<NeonSize>(size / 16);
+ switch (ns) {
+ case Neon8: {
+ int8_t src[16];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 16; i++) {
+ src[i] = ArithmeticShiftRight(src[i], shift);
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ case Neon16: {
+ int16_t src[8];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 8; i++) {
+ src[i] = ArithmeticShiftRight(src[i], shift);
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ case Neon32: {
+ int32_t src[4];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 4; i++) {
+ src[i] = ArithmeticShiftRight(src[i], shift);
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
} else {
UNIMPLEMENTED();
}
@@ -4993,6 +5070,45 @@
} else {
UNIMPLEMENTED();
}
+ } else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
+ // vshr.u<size> Qd, Qm, shift
+ int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
+ int shift = 2 * size - instr->Bits(21, 16);
+ int Vd = instr->VFPDRegValue(kSimd128Precision);
+ int Vm = instr->VFPMRegValue(kSimd128Precision);
+ NeonSize ns = static_cast<NeonSize>(size / 16);
+ switch (ns) {
+ case Neon8: {
+ uint8_t src[16];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 16; i++) {
+ src[i] >>= shift;
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ case Neon16: {
+ uint16_t src[8];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 8; i++) {
+ src[i] >>= shift;
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ case Neon32: {
+ uint32_t src[4];
+ get_q_register(Vm, src);
+ for (int i = 0; i < 4; i++) {
+ src[i] >>= shift;
+ }
+ set_q_register(Vd, src);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
} else {
UNIMPLEMENTED();
}
diff --git a/src/utils.h b/src/utils.h
index 0ea1de1..6d0d3c8 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -137,15 +137,20 @@
return nibble + msb4[x];
}
-
-// The C++ standard leaves the semantics of '>>' undefined for
-// negative signed operands. Most implementations do the right thing,
-// though.
-inline int ArithmeticShiftRight(int x, int s) {
- return x >> s;
+template <typename T>
+static T ArithmeticShiftRight(T x, int shift) {
+ DCHECK_LE(0, shift);
+ if (x < 0) {
+ // Right shift of signed values is implementation defined. Simulate a
+ // true arithmetic right shift by adding leading sign bits.
+ using UnsignedT = typename std::make_unsigned<T>::type;
+ UnsignedT mask = ~(static_cast<UnsignedT>(~0) >> shift);
+ return (static_cast<UnsignedT>(x) >> shift) | mask;
+ } else {
+ return x >> shift;
+ }
}
-
template <typename T>
int Compare(const T& a, const T& b) {
if (a == b)
diff --git a/test/cctest/test-assembler-arm.cc b/test/cctest/test-assembler-arm.cc
index 680b123..ac6b620 100644
--- a/test/cctest/test-assembler-arm.cc
+++ b/test/cctest/test-assembler-arm.cc
@@ -1301,6 +1301,8 @@
uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4];
uint32_t vmul8[4], vmul16[4], vmul32[4];
+  uint32_t vshl8[4], vshl16[4], vshl32[4];
+  uint32_t vshr_s8[4], vshr_u16[4], vshr_s32[4];
uint32_t vceq[4], vceqf[4], vcgef[4], vcgtf[4];
uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];
@@ -1671,6 +1673,32 @@
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ // vshl.
+ __ mov(r4, Operand(0x55));
+ __ vdup(Neon8, q0, r4);
+ __ vshl(NeonS8, q1, q0, 1);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vshl(NeonU16, q1, q0, 9);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vshl(NeonS32, q1, q0, 17);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl32))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+ // vshr.s, vshr.u.
+ __ mov(r4, Operand(0x80));
+ __ vdup(Neon8, q0, r4);
+ __ vshr(NeonS8, q1, q0, 1);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s8))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vshr(NeonU16, q1, q0, 9);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_u16))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+ __ vshr(NeonS32, q1, q0, 17);
+ __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s32))));
+ __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
// vceq.
__ mov(r4, Operand(0x03));
__ vdup(Neon8, q0, r4);
@@ -1926,6 +1954,12 @@
CHECK_EQ_SPLAT(vmul8, 0x04040404u);
CHECK_EQ_SPLAT(vmul16, 0x00040004u);
CHECK_EQ_SPLAT(vmul32, 0x00000004u);
+ CHECK_EQ_SPLAT(vshl8, 0xaaaaaaaau);
+ CHECK_EQ_SPLAT(vshl16, 0xaa00aa00u);
+ CHECK_EQ_SPLAT(vshl32, 0xaaaa0000u);
+ CHECK_EQ_SPLAT(vshr_s8, 0xc0c0c0c0u);
+ CHECK_EQ_SPLAT(vshr_u16, 0x00400040u);
+ CHECK_EQ_SPLAT(vshr_s32, 0xffffc040u);
CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
// [0, 3, 0, 3, ...] >= [3, 3, 3, 3, ...]
CHECK_EQ_SPLAT(vcge_s8, 0x00ff00ffu);
diff --git a/test/cctest/test-disasm-arm.cc b/test/cctest/test-disasm-arm.cc
index 3ba6bee..e02dabe 100644
--- a/test/cctest/test-disasm-arm.cc
+++ b/test/cctest/test-disasm-arm.cc
@@ -1063,6 +1063,18 @@
"f2142970 vmul.i16 q1, q2, q8");
COMPARE(vmul(Neon32, q15, q0, q8),
"f260e970 vmul.i32 q15, q0, q8");
+ COMPARE(vshl(NeonS8, q15, q0, 6),
+ "f2cee550 vshl.i8 q15, q0, #6");
+ COMPARE(vshl(NeonU16, q15, q0, 10),
+ "f2dae550 vshl.i16 q15, q0, #10");
+ COMPARE(vshl(NeonS32, q15, q0, 17),
+ "f2f1e550 vshl.i32 q15, q0, #17");
+ COMPARE(vshr(NeonS8, q15, q0, 6),
+ "f2cae050 vshr.s8 q15, q0, #6");
+ COMPARE(vshr(NeonU16, q15, q0, 10),
+ "f3d6e050 vshr.u16 q15, q0, #10");
+ COMPARE(vshr(NeonS32, q15, q0, 17),
+ "f2efe050 vshr.s32 q15, q0, #17");
COMPARE(vrecpe(q15, q0),
"f3fbe540 vrecpe.f32 q15, q0");
COMPARE(vrecps(q15, q0, q8),