| /* |
| * Copyright (C) 2023-2026 Apple Inc. All rights reserved. |
| * Copyright (C) 2025-2026 the V8 project authors. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #pragma once |
| |
| #include <JavaScriptCore/CPU.h> |
| #include <JavaScriptCore/SIMDInfo.h> |
| |
| WTF_ALLOW_UNSAFE_BUFFER_USAGE_BEGIN |
| |
| namespace JSC { |
| |
| // Canonical shuffle patterns recognized for ARM64 specialized instruction selection. |
| enum class CanonicalShuffle : uint8_t { |
| Unknown, |
| Identity, |
| // 64-bit element patterns |
| S64x2UnzipEven, // UZP1.2D |
| S64x2UnzipOdd, // UZP2.2D |
| S64x2Reverse, // EXT #8 (unary): swap two 64-bit halves |
| // 32-bit element patterns |
| S32x4UnzipEven, // UZP1.4S |
| S32x4UnzipOdd, // UZP2.4S |
| S32x4ZipLower, // ZIP1.4S |
| S32x4ZipHigher, // ZIP2.4S |
| S32x4TransposeEven, // TRN1.4S |
| S32x4TransposeOdd, // TRN2.4S |
| S32x2Reverse, // REV64.4S |
| // 16-bit element patterns |
| S16x8UnzipEven, // UZP1.8H |
| S16x8UnzipOdd, // UZP2.8H |
| S16x8ZipLower, // ZIP1.8H |
| S16x8ZipHigher, // ZIP2.8H |
| S16x8TransposeEven, // TRN1.8H |
| S16x8TransposeOdd, // TRN2.8H |
| S16x2Reverse, // REV32.8H |
| S16x4Reverse, // REV64.8H |
| // 8-bit element patterns |
| S8x16UnzipEven, // UZP1.16B |
| S8x16UnzipOdd, // UZP2.16B |
| S8x16ZipLower, // ZIP1.16B |
| S8x16ZipHigher, // ZIP2.16B |
| S8x16TransposeEven, // TRN1.16B |
| S8x16TransposeOdd, // TRN2.16B |
| S8x2Reverse, // REV16.16B |
| S8x4Reverse, // REV32.16B |
| S8x8Reverse, // REV64.16B |
| }; |
| |
| class SIMDShuffle { |
| public: |
| static std::optional<unsigned> isOnlyOneSideMask(v128_t pattern) |
| { |
| unsigned first = pattern.u8x16[0]; |
| if (first < 16) { |
| for (unsigned i = 1; i < 16; ++i) { |
| if (pattern.u8x16[i] >= 16) |
| return std::nullopt; |
| } |
| return 0; |
| } |
| |
| if (first >= 32) |
| return std::nullopt; |
| |
| for (unsigned i = 1; i < 16; ++i) { |
| if (pattern.u8x16[i] < 16) |
| return std::nullopt; |
| if (pattern.u8x16[i] >= 32) |
| return std::nullopt; |
| } |
| return 1; |
| } |
| |
| static std::optional<uint8_t> isI8x16SameElement(v128_t pattern) |
| { |
| constexpr unsigned numberOfElements = 16 / sizeof(uint8_t); |
| if (std::all_of(pattern.u8x16, pattern.u8x16 + numberOfElements, [&](auto value) { return value == pattern.u8x16[0]; })) |
| return pattern.u8x16[0]; |
| return std::nullopt; |
| } |
| |
| static std::optional<uint8_t> isI8x16DupElement(v128_t pattern) |
| { |
| constexpr unsigned numberOfElements = 16 / sizeof(uint8_t); |
| if (std::all_of(pattern.u8x16, pattern.u8x16 + numberOfElements, [&](auto value) { return value == pattern.u8x16[0]; })) { |
| uint8_t lane = pattern.u8x16[0] / sizeof(uint8_t); |
| if (lane < numberOfElements) |
| return lane; |
| } |
| return std::nullopt; |
| } |
| |
| static std::optional<uint8_t> isI16x8DupElement(v128_t pattern) |
| { |
| if (!isI16x8Shuffle(pattern)) |
| return std::nullopt; |
| constexpr unsigned numberOfElements = 16 / sizeof(uint16_t); |
| if (std::all_of(pattern.u16x8, pattern.u16x8 + numberOfElements, [&](auto value) { return value == pattern.u16x8[0]; })) { |
| uint8_t lane = pattern.u8x16[0] / sizeof(uint16_t); |
| if (lane < numberOfElements) |
| return lane; |
| } |
| return std::nullopt; |
| } |
| |
| static std::optional<uint8_t> isI32x4DupElement(v128_t pattern) |
| { |
| if (!isI32x4Shuffle(pattern)) |
| return std::nullopt; |
| constexpr unsigned numberOfElements = 16 / sizeof(uint32_t); |
| if (std::all_of(pattern.u32x4, pattern.u32x4 + numberOfElements, [&](auto value) { return value == pattern.u32x4[0]; })) { |
| uint8_t lane = pattern.u8x16[0] / sizeof(uint32_t); |
| if (lane < numberOfElements) |
| return lane; |
| } |
| return std::nullopt; |
| } |
| |
| static std::optional<uint8_t> isI64x2DupElement(v128_t pattern) |
| { |
| if (!isI64x2Shuffle(pattern)) |
| return std::nullopt; |
| constexpr unsigned numberOfElements = 16 / sizeof(uint64_t); |
| if (std::all_of(pattern.u64x2, pattern.u64x2 + numberOfElements, [&](auto value) { return value == pattern.u64x2[0]; })) { |
| uint8_t lane = pattern.u8x16[0] / sizeof(uint64_t); |
| if (lane < numberOfElements) |
| return lane; |
| } |
| return std::nullopt; |
| } |
| |
| static bool isI16x8Shuffle(v128_t pattern) |
| { |
| return isLargerElementShuffle(pattern, 2); |
| } |
| |
| static bool isI32x4Shuffle(v128_t pattern) |
| { |
| return isLargerElementShuffle(pattern, 4); |
| } |
| |
| static bool isI64x2Shuffle(v128_t pattern) |
| { |
| return isLargerElementShuffle(pattern, 8); |
| } |
| |
| static bool isIdentity(v128_t pattern) |
| { |
| return isLargerElementShuffle(pattern, 16); |
| } |
| |
| static bool isAllOutOfBoundsForUnaryShuffle(v128_t pattern) |
| { |
| for (unsigned i = 0; i < 16; ++i) { |
| if constexpr (isX86()) { |
| // https://www.felixcloutier.com/x86/pshufb |
| // On x64, OOB index means that highest bit is set. |
| // The acutal index is extracted by masking with 0b1111. |
| // So, for example, 0x11 index (17) will be converted to 0x1 access (not OOB). |
| if (!(pattern.u8x16[i] & 0x80)) |
| return false; |
| } else if constexpr (isARM64()) { |
| // https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/TBL--vector- |
| // On ARM64, OOB index means out of 0..15 range for unary TBL. |
| if (pattern.u8x16[i] < 16) |
| return false; |
| } else |
| return false; |
| } |
| return true; |
| } |
| |
| static bool isAllOutOfBoundsForBinaryShuffle(v128_t pattern) |
| { |
| ASSERT(isARM64()); // Binary Shuffle is only supported by ARM64. |
| for (unsigned i = 0; i < 16; ++i) { |
| if (pattern.u8x16[i] < 32) |
| return false; |
| } |
| return true; |
| } |
| |
| // Detect unary EXT (byte rotation) pattern. |
| // Returns offset if pattern[i] == (offset + i) % 16 for all i, |
| // with offset in [1, 15]. Offset 0 is identity (handled separately). |
| static std::optional<uint8_t> isUnaryEXT(v128_t pattern) |
| { |
| uint8_t first = pattern.u8x16[0]; |
| if (first == 0 || first >= 16) |
| return std::nullopt; |
| for (unsigned i = 1; i < 16; ++i) { |
| if (pattern.u8x16[i] != ((first + i) % 16)) |
| return std::nullopt; |
| } |
| return first; |
| } |
| |
| // Detect EXT (byte extraction / concatenation) pattern for binary shuffle. |
| // Returns the byte offset if the pattern is {offset, offset+1, ..., 31, 0, 1, ...} |
| // i.e., pattern[i] == (offset + i) % 32 for some offset in [0, 15]. |
| // ARM64 EXT instruction: EXT Vd.16B, Vn.16B, Vm.16B, #imm |
| // extracts bytes from the concatenation of Vn:Vm. |
| // Returns {offset, needsSwap}. |
| struct EXTInfo { |
| uint8_t offset; |
| bool needsSwap; |
| }; |
| static std::optional<EXTInfo> isEXTWithSwap(v128_t pattern) |
| { |
| uint8_t first = pattern.u8x16[0]; |
| if (first >= 32) |
| return std::nullopt; |
| for (unsigned i = 1; i < 16; ++i) { |
| if (pattern.u8x16[i] != ((first + i) % 32)) |
| return std::nullopt; |
| } |
| if (first < 16) |
| return EXTInfo { first, false }; |
| return EXTInfo { static_cast<uint8_t>(first - 16), true }; |
| } |
| |
| // Try to match a canonical binary shuffle pattern for ARM64 specialized instructions. |
| static CanonicalShuffle tryMatchCanonicalBinary(v128_t pattern) |
| { |
| return tryMatchCanonicalBinaryImpl(pattern, [](v128_t v) constexpr { return v; }); |
| } |
| |
| // Try to match a unary shuffle pattern as a binary canonical shuffle. |
| // Binary canonical pattern gets converted to unary canonical pattern with the invariant that both inputs are the same. |
| // TRN2.16B {1,17, 3,19, 5,21, 7,23, 9,25, 11,27, 13,29, 15,31} gets converted to |
| // {1,1, 3,3, 5,5, 7,7, 9,9, 11,11, 13,13, 15,15}. |
| static CanonicalShuffle tryMatchUnaryAsBinaryCanonical(v128_t pattern) |
| { |
| return tryMatchCanonicalBinaryImpl(pattern, |
| [](v128_t v) constexpr { |
| for (unsigned i = 0; i < 16; ++i) { |
| if (v.u8x16[i] >= 16) |
| v.u8x16[i] = v.u8x16[i] - 16; |
| } |
| return v; |
| }); |
| } |
| |
| // Try to match a canonical unary shuffle pattern. |
| static CanonicalShuffle tryMatchCanonicalUnary(v128_t pattern) |
| { |
| // REV64.4S: reverse 32-bit pairs within 64-bit lanes |
| if (bitEquals(pattern, v128_t::fromU8x16(4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11))) |
| return CanonicalShuffle::S32x2Reverse; |
| // REV64.8H: reverse 16-bit elements within 64-bit lanes |
| if (bitEquals(pattern, v128_t::fromU8x16(6,7, 4,5, 2,3, 0,1, 14,15, 12,13, 10,11, 8,9))) |
| return CanonicalShuffle::S16x4Reverse; |
| // REV32.8H: reverse 16-bit pairs within 32-bit groups |
| if (bitEquals(pattern, v128_t::fromU8x16(2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13))) |
| return CanonicalShuffle::S16x2Reverse; |
| // REV64.16B: reverse bytes within 64-bit lanes |
| if (bitEquals(pattern, v128_t::fromU8x16(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8))) |
| return CanonicalShuffle::S8x8Reverse; |
| // REV32.16B: reverse bytes within 32-bit groups |
| if (bitEquals(pattern, v128_t::fromU8x16(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12))) |
| return CanonicalShuffle::S8x4Reverse; |
| // REV16.16B: reverse bytes within 16-bit groups |
| if (bitEquals(pattern, v128_t::fromU8x16(1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14))) |
| return CanonicalShuffle::S8x2Reverse; |
| // S64x2 reverse: swap two 64-bit halves = {8..15, 0..7} |
| if (bitEquals(pattern, v128_t::fromU8x16(8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7))) |
| return CanonicalShuffle::S64x2Reverse; |
| |
| return CanonicalShuffle::Unknown; |
| } |
| |
| // Compose two unary shuffles: outer(inner(x)) → combined(x). |
| // Both patterns have indices in 0..15 (or OOB >= 16). |
| static v128_t composeUnaryShuffle(v128_t outerPattern, v128_t innerPattern) |
| { |
| v128_t result; |
| for (unsigned i = 0; i < 16; ++i) { |
| uint8_t outerIdx = outerPattern.u8x16[i]; |
| if (outerIdx >= 16) { |
| result.u8x16[i] = 0xFF; // OOB |
| continue; |
| } |
| result.u8x16[i] = innerPattern.u8x16[outerIdx]; |
| } |
| return result; |
| } |
| |
| // Compose an outer shuffle with an inner shuffle. |
| // Given outer = shuffle(inner_result, other, outerPattern) or |
| // outer = shuffle(other, inner_result, outerPattern) |
| // where inner_result = shuffle(innerSrc, innerPattern) [unary] |
| // Produces a combined pattern that reads from (innerSrc, other) or (other, innerSrc). |
| // |
| // innerIsChild0: whether the inner shuffle's result is child(0) of the outer. |
| // Returns nullopt if composition is not possible (e.g., indices go out of range). |
| static v128_t composeShuffle(v128_t outerPattern, v128_t innerPattern, bool innerIsChild0) |
| { |
| v128_t result; |
| for (unsigned i = 0; i < 16; ++i) { |
| uint8_t outerIdx = outerPattern.u8x16[i]; |
| if (outerIdx >= 32) { |
| // Out of bounds in outer — result is 0 on ARM64 TBL. |
| result.u8x16[i] = 0xFF; // OOB |
| continue; |
| } |
| |
| if (innerIsChild0) { |
| if (outerIdx < 16) { |
| // Reads from inner's output. Chase through inner. |
| uint8_t innerIdx = innerPattern.u8x16[outerIdx]; |
| if (innerIdx >= 16) { |
| result.u8x16[i] = 0xFF; // OOB in inner → zero |
| continue; |
| } |
| // innerIdx is 0..15, referring to inner's input. |
| // In the composed shuffle, inner's input is child0, other is child1. |
| result.u8x16[i] = innerIdx; |
| } else { |
| // Reads from the other child (child1 of outer). |
| // In composed result, other is child1, offset 16..31. |
| result.u8x16[i] = outerIdx; // stays as 16..31 |
| } |
| } else { |
| // inner is child1 of outer |
| if (outerIdx >= 16) { |
| // Reads from inner's output (which is child1 of outer). |
| uint8_t innerIdx = innerPattern.u8x16[outerIdx - 16]; |
| if (innerIdx >= 16) { |
| result.u8x16[i] = 0xFF; |
| continue; |
| } |
| // inner's input becomes child1 in composed result. |
| result.u8x16[i] = innerIdx + 16; |
| } else { |
| // Reads from the other child (child0 of outer). |
| result.u8x16[i] = outerIdx; // stays as 0..15 |
| } |
| } |
| } |
| return result; |
| } |
| |
| private: |
| static CanonicalShuffle tryMatchCanonicalBinaryImpl(v128_t pattern, const Invocable<v128_t(v128_t)> auto& canonicalize) |
| { |
| // 64-bit element patterns |
| // UZP1.2D: {a[lo64], b[lo64]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23)))) |
| return CanonicalShuffle::S64x2UnzipEven; |
| // UZP2.2D: {a[hi64], b[hi64]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31)))) |
| return CanonicalShuffle::S64x2UnzipOdd; |
| |
| // 32-bit element patterns |
| // UZP1.4S: {a[0], a[2], b[0], b[2]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27)))) |
| return CanonicalShuffle::S32x4UnzipEven; |
| // UZP2.4S: {a[1], a[3], b[1], b[3]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31)))) |
| return CanonicalShuffle::S32x4UnzipOdd; |
| // ZIP1.4S: {a[0], b[0], a[1], b[1]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23)))) |
| return CanonicalShuffle::S32x4ZipLower; |
| // ZIP2.4S: {a[2], b[2], a[3], b[3]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(8,9,10,11, 24,25,26,27, 12,13,14,15, 28,29,30,31)))) |
| return CanonicalShuffle::S32x4ZipHigher; |
| // TRN1.4S: {a[0], b[0], a[2], b[2]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27)))) |
| return CanonicalShuffle::S32x4TransposeEven; |
| // TRN2.4S: {a[1], b[1], a[3], b[3]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(4,5,6,7, 20,21,22,23, 12,13,14,15, 28,29,30,31)))) |
| return CanonicalShuffle::S32x4TransposeOdd; |
| |
| // 16-bit element patterns |
| // UZP1.8H: {a[0],a[2],a[4],a[6], b[0],b[2],b[4],b[6]} |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29)))) |
| return CanonicalShuffle::S16x8UnzipEven; |
| // UZP2.8H |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31)))) |
| return CanonicalShuffle::S16x8UnzipOdd; |
| // ZIP1.8H |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1, 16,17, 2,3, 18,19, 4,5, 20,21, 6,7, 22,23)))) |
| return CanonicalShuffle::S16x8ZipLower; |
| // ZIP2.8H |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(8,9, 24,25, 10,11, 26,27, 12,13, 28,29, 14,15, 30,31)))) |
| return CanonicalShuffle::S16x8ZipHigher; |
| // TRN1.8H |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,1, 16,17, 4,5, 20,21, 8,9, 24,25, 12,13, 28,29)))) |
| return CanonicalShuffle::S16x8TransposeEven; |
| // TRN2.8H |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(2,3, 18,19, 6,7, 22,23, 10,11, 26,27, 14,15, 30,31)))) |
| return CanonicalShuffle::S16x8TransposeOdd; |
| |
| // 8-bit element patterns |
| // UZP1.16B |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30)))) |
| return CanonicalShuffle::S8x16UnzipEven; |
| // UZP2.16B |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(1,3,5,7,9,11,13,15, 17,19,21,23,25,27,29,31)))) |
| return CanonicalShuffle::S8x16UnzipOdd; |
| // ZIP1.16B |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23)))) |
| return CanonicalShuffle::S8x16ZipLower; |
| // ZIP2.16B |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(8,24, 9,25, 10,26, 11,27, 12,28, 13,29, 14,30, 15,31)))) |
| return CanonicalShuffle::S8x16ZipHigher; |
| // TRN1.16B |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(0,16, 2,18, 4,20, 6,22, 8,24, 10,26, 12,28, 14,30)))) |
| return CanonicalShuffle::S8x16TransposeEven; |
| // TRN2.16B |
| if (bitEquals(pattern, canonicalize(v128_t::fromU8x16(1,17, 3,19, 5,21, 7,23, 9,25, 11,27, 13,29, 15,31)))) |
| return CanonicalShuffle::S8x16TransposeOdd; |
| |
| return CanonicalShuffle::Unknown; |
| } |
| |
| static bool isLargerElementShuffle(v128_t pattern, unsigned size) |
| { |
| unsigned numberOfElements = 16 / size; |
| for (unsigned i = 0; i < numberOfElements; ++i) { |
| unsigned firstIndex = i * size; |
| unsigned first = pattern.u8x16[firstIndex]; |
| if (first % size != 0) |
| return false; |
| for (unsigned j = 1; j < size; ++j) { |
| unsigned index = j + firstIndex; |
| if (pattern.u8x16[index] != (first + j)) |
| return false; |
| } |
| } |
| return true; |
| } |
| }; |
| |
| } // namespace JSC |
| |
| WTF_ALLOW_UNSAFE_BUFFER_USAGE_END |