| // Copyright 2022 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/objects/simd.h" |
| |
| #include "src/base/cpu.h" |
| #include "src/codegen/cpu-features.h" |
| #include "src/objects/compressed-slots.h" |
| #include "src/objects/fixed-array-inl.h" |
| #include "src/objects/heap-number-inl.h" |
| #include "src/objects/smi-inl.h" |
| |
| #ifdef _MSC_VER |
// MSVC doesn't define __SSE3__. However, it does define __AVX__, and AVX
// implies SSE3.
| #ifdef __AVX__ |
| #ifndef __SSE3__ |
| #define __SSE3__ |
| #endif |
| #endif |
| #endif |
| |
| #ifdef __SSE3__ |
| #include <immintrin.h> |
| #endif |
| |
| #ifdef V8_HOST_ARCH_ARM64 |
| // We use Neon only on 64-bit ARM (because on 32-bit, some instructions and some |
| // types are not available). Note that ARM64 is guaranteed to have Neon. |
| #define NEON64 |
| #include <arm_neon.h> |
| #endif |
| |
| namespace v8 { |
| namespace internal { |
| |
| namespace { |
| |
| enum class SimdKinds { kSSE, kNeon, kAVX2, kNone }; |
| |
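// Returns the best vectorization flavor available: AVX2 needs a runtime CPU
// check, while SSE3 and Neon availability is known at compile time.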
| inline SimdKinds get_vectorization_kind() { |
| #ifdef __SSE3__ |
| #if defined(V8_TARGET_ARCH_IA32) || defined(V8_TARGET_ARCH_X64) |
| bool has_avx2 = CpuFeatures::IsSupported(AVX2); |
| #else |
| bool has_avx2 = false; |
| #endif |
| if (has_avx2) { |
| return SimdKinds::kAVX2; |
| } else { |
| // No need for a runtime check since we do not support x86/x64 CPUs without |
| // SSE3. |
| return SimdKinds::kSSE; |
| } |
| #elif defined(NEON64) |
| // No need for a runtime check since all Arm64 CPUs have Neon. |
| return SimdKinds::kNeon; |
| #else |
| return SimdKinds::kNone; |
| #endif |
| } |
| |
// Searches for |search_element| in |array| using a simple non-vectorized linear
// search. This is used as a fall-back when SIMD is not available, and to
// process the tail of arrays that SIMD cannot process.
| template <typename T> |
| inline uintptr_t slow_search(T* array, uintptr_t array_len, uintptr_t index, |
| T search_element) { |
| for (; index < array_len; index++) { |
| if (array[index] == search_element) { |
| return index; |
| } |
| } |
| return -1; |
| } |
| |
| #ifdef NEON64 |
// The extract_first_nonzero_index_* functions below return the index of the
// first non-zero lane in |v|. |v| is a Neon vector that can be either 32x4
// (the return is then 0, 1, 2 or 3) or 64x2 (the return is then 0 or 1). This
// is more or less equivalent to doing a movemask followed by a tzcnt on Intel.
| // |
| // The input |v| should be a vector of -1 or 0 (for instance {0, 0}, |
| // {0, -1, 0, -1}, {0, -1, 0, 0}), where -1 represents a match (and 0 a |
// non-match), that was obtained by doing a vceqq. This function extracts the
// index of the first non-zero item of the vector. To do so, we "and" the vector
// with {4, 3, 2, 1} (each number is "4 - the index of the item it's in"), which
// produces a vector of "4 - index" values (or 0 for non-matches). Then, we take
// the maximum of this vector; 4 minus that maximum is the index of the 1st
// match. An example:
| // |
| // v = {-1, 0, 0, -1} |
| // mask = {4, 3, 2, 1} |
| // v & mask = {4, 0, 0, 1} |
| // max(v & mask) = 4 |
| // index of the first match = 4-max = 4-4 = 0 |
| // |
| |
// With MSVC, uint32x4_t and uint64x2_t are typedefs for a union whose first
// member is uint64_t[2], not uint32_t[4].
// The C++ standard dictates that a union can only be initialized through its
// first member, which forces us to write the initializer as uint64_t[2].
| #if defined(_MSC_VER) && !defined(__clang__) |
| #define PACK32x4(w, x, y, z) \ |
| { ((w) + (uint64_t(x) << 32)), ((y) + (uint64_t(z) << 32)) } |
| #else |
| #define PACK32x4(w, x, y, z) \ |
| { (w), (x), (y), (z) } |
| #endif // MSVC workaround |
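// For example, PACK32x4(4, 3, 2, 1) expands to {4 + (3 << 32), 2 + (1 << 32)}
// (two uint64_t lanes) with MSVC and to {4, 3, 2, 1} (four uint32_t lanes)
// elsewhere; both initialize the same 128-bit vector.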
| |
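// Example for the uint32x4_t variant below: for v = {0, -1, 0, 0} (a match in
// lane 1), mask & v = {0, 3, 0, 0}, vmaxvq_u32 returns 3, and the function
// returns 4 - 3 = 1.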
| V8_ALLOW_UNUSED inline int extract_first_nonzero_index_uint32x4_t( |
| uint32x4_t v) { |
| uint32x4_t mask = PACK32x4(4, 3, 2, 1); |
| mask = vandq_u32(mask, v); |
| return 4 - vmaxvq_u32(mask); |
| } |
| |
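// Example for the uint64x2_t variant below: for v = {0, -1} (a match in
// lane 1), v reinterpreted as uint32x4_t is {0, 0, -1, -1},
// mask & v = {0, 0, 1, 0}, the maximum is 1, and the function returns
// 2 - 1 = 1.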
| inline int extract_first_nonzero_index_uint64x2_t(uint64x2_t v) { |
| uint32x4_t mask = |
| PACK32x4(2, 0, 1, 0); // Could also be {2,2,1,1} or {0,2,0,1} |
| mask = vandq_u32(mask, vreinterpretq_u32_u64(v)); |
| return 2 - vmaxvq_u32(mask); |
| } |
| |
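// Reduces a uint64x2_t comparison result to a scalar that is non-zero iff at
// least one lane is non-zero. Used as the |movemask| parameter of
// VECTORIZED_LOOP_Neon for the uint64 and double cases below.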
| inline int32_t reinterpret_vmaxvq_u64(uint64x2_t v) { |
| return vmaxvq_u32(vreinterpretq_u32_u64(v)); |
| } |
| #endif |
| |
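// VECTORIZED_LOOP_Neon and VECTORIZED_LOOP_x86 expand to the core SIMD search
// loop: |set1| broadcasts |search_element| into a vector register, and each
// iteration compares |elems_in_vector| consecutive array elements with |cmp|.
// |movemask| reduces the comparison result to a scalar that is non-zero iff
// some lane matched; the matching lane is then located with
// extract_first_nonzero_index_* (Neon) or |extract| (x86). On exit, |index|
// points to the first element that was not processed, so callers can finish
// with slow_search.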
| #define VECTORIZED_LOOP_Neon(type_load, type_eq, set1, cmp, movemask) \ |
| { \ |
| constexpr int elems_in_vector = sizeof(type_load) / sizeof(T); \ |
| type_load search_element_vec = set1(search_element); \ |
| \ |
| for (; index + elems_in_vector <= array_len; index += elems_in_vector) { \ |
| type_load vector = *reinterpret_cast<type_load*>(&array[index]); \ |
| type_eq eq = cmp(vector, search_element_vec); \ |
| if (movemask(eq)) { \ |
| return index + extract_first_nonzero_index_##type_eq(eq); \ |
| } \ |
| } \ |
| } |
| |
| #define VECTORIZED_LOOP_x86(type_load, type_eq, set1, cmp, movemask, extract) \ |
| { \ |
| constexpr int elems_in_vector = sizeof(type_load) / sizeof(T); \ |
| type_load search_element_vec = set1(search_element); \ |
| \ |
| for (; index + elems_in_vector <= array_len; index += elems_in_vector) { \ |
| type_load vector = *reinterpret_cast<type_load*>(&array[index]); \ |
| type_eq eq = cmp(vector, search_element_vec); \ |
| int eq_mask = movemask(eq); \ |
| if (eq_mask) { \ |
| return index + extract(eq_mask); \ |
| } \ |
| } \ |
| } |
| |
| // Uses SIMD to vectorize the search loop. This function should only be called |
| // for large-ish arrays. Note that nothing will break if |array_len| is less |
| // than vectorization_threshold: things will just be slower than necessary. |
| template <typename T> |
| inline uintptr_t fast_search_noavx(T* array, uintptr_t array_len, |
| uintptr_t index, T search_element) { |
| static constexpr bool is_uint32 = |
| sizeof(T) == sizeof(uint32_t) && std::is_integral<T>::value; |
| static constexpr bool is_uint64 = |
| sizeof(T) == sizeof(uint64_t) && std::is_integral<T>::value; |
| static constexpr bool is_double = |
| sizeof(T) == sizeof(double) && std::is_floating_point<T>::value; |
| |
| static_assert(is_uint32 || is_uint64 || is_double); |
| |
| #if !(defined(__SSE3__) || defined(NEON64)) |
| // No SIMD available. |
| return slow_search(array, array_len, index, search_element); |
| #endif |
| |
| #ifdef __SSE3__ |
| const int target_align = 16; |
| #elif defined(NEON64) |
| const int target_align = 16; |
| #else |
| const int target_align = 4; |
| UNREACHABLE(); |
| #endif |
| |
  // Scalar loop until |&array[index]| reaches the desired alignment, so that
  // the vectorized loop below operates on aligned memory.
| for (; |
| index < array_len && |
| (reinterpret_cast<std::uintptr_t>(&(array[index])) % target_align) != 0; |
| index++) { |
| if (array[index] == search_element) { |
| return index; |
| } |
| } |
| |
  // Insert one of the vectorized loops.
| #ifdef __SSE3__ |
| if constexpr (is_uint32) { |
| #define MOVEMASK(x) _mm_movemask_ps(_mm_castsi128_ps(x)) |
| #define EXTRACT(x) base::bits::CountTrailingZeros32(x) |
| VECTORIZED_LOOP_x86(__m128i, __m128i, _mm_set1_epi32, _mm_cmpeq_epi32, |
| MOVEMASK, EXTRACT) |
| #undef MOVEMASK |
| #undef EXTRACT |
| } else if constexpr (is_uint64) { |
| #define SET1(x) _mm_castsi128_ps(_mm_set1_epi64x(x)) |
| #define CMP(a, b) _mm_cmpeq_pd(_mm_castps_pd(a), _mm_castps_pd(b)) |
| #define EXTRACT(x) base::bits::CountTrailingZeros32(x) |
| VECTORIZED_LOOP_x86(__m128, __m128d, SET1, CMP, _mm_movemask_pd, EXTRACT) |
| #undef SET1 |
| #undef CMP |
| #undef EXTRACT |
| } else if constexpr (is_double) { |
| #define EXTRACT(x) base::bits::CountTrailingZeros32(x) |
| VECTORIZED_LOOP_x86(__m128d, __m128d, _mm_set1_pd, _mm_cmpeq_pd, |
| _mm_movemask_pd, EXTRACT) |
| #undef EXTRACT |
| } |
| #elif defined(NEON64) |
| if constexpr (is_uint32) { |
| VECTORIZED_LOOP_Neon(uint32x4_t, uint32x4_t, vdupq_n_u32, vceqq_u32, |
| vmaxvq_u32) |
| } else if constexpr (is_uint64) { |
| VECTORIZED_LOOP_Neon(uint64x2_t, uint64x2_t, vdupq_n_u64, vceqq_u64, |
| reinterpret_vmaxvq_u64) |
| } else if constexpr (is_double) { |
| VECTORIZED_LOOP_Neon(float64x2_t, uint64x2_t, vdupq_n_f64, vceqq_f64, |
| reinterpret_vmaxvq_u64) |
| } |
| #else |
| UNREACHABLE(); |
| #endif |
| |
| // The vectorized loop stops when there are not enough items left in the array |
| // to fill a vector register. The slow_search function will take care of |
| // iterating through the few remaining items. |
| return slow_search(array, array_len, index, search_element); |
| } |
| |
| #if defined(_MSC_VER) && defined(__clang__) |
| // Generating AVX2 code with Clang on Windows without the /arch:AVX2 flag does |
| // not seem possible at the moment. |
| #define IS_CLANG_WIN 1 |
| #endif |
| |
// Since we don't compile with -mavx or -mavx2 (or /arch:AVX2 on MSVC), Clang
// and MSVC do not define __AVX__ or __AVX2__. Thus, if __SSE3__ is defined, we
// generate the AVX2 code here, and, at runtime, we decide whether to call it
// depending on whether the CPU supports AVX2.
| #if defined(__SSE3__) && !defined(_M_IX86) && !defined(IS_CLANG_WIN) |
| #ifdef _MSC_VER |
| #define TARGET_AVX2 |
| #else |
| #define TARGET_AVX2 __attribute__((target("avx2"))) |
| #endif |
| template <typename T> |
| TARGET_AVX2 inline uintptr_t fast_search_avx(T* array, uintptr_t array_len, |
| uintptr_t index, |
| T search_element) { |
| static constexpr bool is_uint32 = |
| sizeof(T) == sizeof(uint32_t) && std::is_integral<T>::value; |
| static constexpr bool is_uint64 = |
| sizeof(T) == sizeof(uint64_t) && std::is_integral<T>::value; |
| static constexpr bool is_double = |
| sizeof(T) == sizeof(double) && std::is_floating_point<T>::value; |
| |
| static_assert(is_uint32 || is_uint64 || is_double); |
| |
| const int target_align = 32; |
  // Scalar loop until |&array[index]| reaches the desired 32-byte alignment,
  // so that the vectorized AVX2 loop below operates on aligned memory.
| for (; |
| index < array_len && |
| (reinterpret_cast<std::uintptr_t>(&(array[index])) % target_align) != 0; |
| index++) { |
| if (array[index] == search_element) { |
| return index; |
| } |
| } |
| |
  // Generate the vectorized loop.
| if constexpr (is_uint32) { |
| #define MOVEMASK(x) _mm256_movemask_ps(_mm256_castsi256_ps(x)) |
| #define EXTRACT(x) base::bits::CountTrailingZeros32(x) |
| VECTORIZED_LOOP_x86(__m256i, __m256i, _mm256_set1_epi32, _mm256_cmpeq_epi32, |
| MOVEMASK, EXTRACT) |
| #undef MOVEMASK |
| #undef EXTRACT |
| } else if constexpr (is_uint64) { |
| #define MOVEMASK(x) _mm256_movemask_pd(_mm256_castsi256_pd(x)) |
| #define EXTRACT(x) base::bits::CountTrailingZeros32(x) |
| VECTORIZED_LOOP_x86(__m256i, __m256i, _mm256_set1_epi64x, |
| _mm256_cmpeq_epi64, MOVEMASK, EXTRACT) |
| #undef MOVEMASK |
| #undef EXTRACT |
| } else if constexpr (is_double) { |
| #define CMP(a, b) _mm256_cmp_pd(a, b, _CMP_EQ_OQ) |
| #define EXTRACT(x) base::bits::CountTrailingZeros32(x) |
| VECTORIZED_LOOP_x86(__m256d, __m256d, _mm256_set1_pd, CMP, |
| _mm256_movemask_pd, EXTRACT) |
| #undef CMP |
| #undef EXTRACT |
| } |
| |
| // The vectorized loop stops when there are not enough items left in the array |
| // to fill a vector register. The slow_search function will take care of |
| // iterating through the few remaining items. |
| return slow_search(array, array_len, index, search_element); |
| } |
| |
| #undef TARGET_AVX2 |
| #elif defined(IS_CLANG_WIN) |
| template <typename T> |
| inline uintptr_t fast_search_avx(T* array, uintptr_t array_len, uintptr_t index, |
| T search_element) { |
  // Fall back to the SSE version.
| return fast_search_noavx(array, array_len, index, search_element); |
| } |
| #else |
| template <typename T> |
| uintptr_t fast_search_avx(T* array, uintptr_t array_len, uintptr_t index, |
| T search_element) { |
| UNREACHABLE(); |
| } |
| #endif // ifdef __SSE3__ |
| |
| #undef IS_CLANG_WIN |
| #undef VECTORIZED_LOOP_Neon |
| #undef VECTORIZED_LOOP_x86 |
| |
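// Dispatches at runtime between the AVX2 implementation and the
// SSE/Neon/scalar one, based on get_vectorization_kind().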
| template <typename T> |
| inline uintptr_t search(T* array, uintptr_t array_len, uintptr_t index, |
| T search_element) { |
| if (get_vectorization_kind() == SimdKinds::kAVX2) { |
| return fast_search_avx(array, array_len, index, search_element); |
| } else { |
| return fast_search_noavx(array, array_len, index, search_element); |
| } |
| } |
| |
| enum class ArrayIndexOfIncludesKind { DOUBLE, OBJECTORSMI }; |
| |
| // ArrayIndexOfIncludes only handles cases that can be efficiently |
| // vectorized: |
| // |
| // * Searching for a Smi in a Smi array |
| // |
| // * Searching for a Smi or Double in a Double array |
| // |
| // * Searching for an object in an object array. |
| // |
| // Other cases should be dealt with either with the CSA builtin or with the |
| // inlined optimized code. |
| template <ArrayIndexOfIncludesKind kind> |
| Address ArrayIndexOfIncludes(Address array_start, uintptr_t array_len, |
| uintptr_t from_index, Address search_element) { |
| if (array_len == 0) { |
| return Smi::FromInt(-1).ptr(); |
| } |
| |
| if constexpr (kind == ArrayIndexOfIncludesKind::DOUBLE) { |
| Tagged<FixedDoubleArray> fixed_array = |
| FixedDoubleArray::cast(Tagged<Object>(array_start)); |
| double* array = static_cast<double*>( |
| fixed_array->RawField(FixedDoubleArray::OffsetOfElementAt(0)) |
| .ToVoidPtr()); |
| |
| double search_num; |
| if (IsSmi(Tagged<Object>(search_element))) { |
| search_num = Tagged<Object>(search_element).ToSmi().value(); |
| } else { |
| DCHECK(IsHeapNumber(Tagged<Object>(search_element))); |
| search_num = HeapNumber::cast(Tagged<Object>(search_element))->value(); |
| } |
| |
| DCHECK(!std::isnan(search_num)); |
| |
| if (reinterpret_cast<uintptr_t>(array) % sizeof(double) != 0) { |
| // Slow scalar search for unaligned double array. |
| for (; from_index < array_len; from_index++) { |
| if (fixed_array->is_the_hole(static_cast<int>(from_index))) { |
| // |search_num| cannot be NaN, so there is no need to check against |
| // holes. |
| continue; |
| } |
| if (fixed_array->get_scalar(static_cast<int>(from_index)) == |
| search_num) { |
| return from_index; |
| } |
| } |
| return Smi::FromInt(-1).ptr(); |
| } |
| |
| return search<double>(array, array_len, from_index, search_num); |
| } |
| |
| if constexpr (kind == ArrayIndexOfIncludesKind::OBJECTORSMI) { |
| Tagged<FixedArray> fixed_array = |
| FixedArray::cast(Tagged<Object>(array_start)); |
| Tagged_t* array = static_cast<Tagged_t*>( |
| fixed_array->RawFieldOfFirstElement().ToVoidPtr()); |
| |
| DCHECK(!IsHeapNumber(Tagged<Object>(search_element))); |
| DCHECK(!IsBigInt(Tagged<Object>(search_element))); |
| DCHECK(!IsString(Tagged<Object>(search_element))); |
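    // With HeapNumber, BigInt and String excluded, equality is identity, so
    // comparing the raw (possibly compressed) Tagged_t values is sufficient.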
| |
| return search<Tagged_t>(array, array_len, from_index, |
| static_cast<Tagged_t>(search_element)); |
| } |
| } |
| |
| } // namespace |
| |
| uintptr_t ArrayIndexOfIncludesSmiOrObject(Address array_start, |
| uintptr_t array_len, |
| uintptr_t from_index, |
| Address search_element) { |
| return ArrayIndexOfIncludes<ArrayIndexOfIncludesKind::OBJECTORSMI>( |
| array_start, array_len, from_index, search_element); |
| } |
| |
| uintptr_t ArrayIndexOfIncludesDouble(Address array_start, uintptr_t array_len, |
| uintptr_t from_index, |
| Address search_element) { |
| return ArrayIndexOfIncludes<ArrayIndexOfIncludesKind::DOUBLE>( |
| array_start, array_len, from_index, search_element); |
| } |
| |
| #ifdef NEON64 |
| #undef NEON64 |
| #endif |
| |
| } // namespace internal |
| } // namespace v8 |