Merge pull request #4394 from AZero13/zstd

Remove redundant setting of allJobsCompleted to 1
diff --git a/.github/workflows/dev-short-tests.yml b/.github/workflows/dev-short-tests.yml
index 53f640d..62667f3 100644
--- a/.github/workflows/dev-short-tests.yml
+++ b/.github/workflows/dev-short-tests.yml
@@ -314,8 +314,7 @@
           { name: "VS 2022 Win32 Debug", platform: Win32, configuration: Debug, toolset: v143, runner: "windows-2022", arch: "" },
           { name: "VS 2022 x64 Release", platform: x64, configuration: Release, toolset: v143, runner: "windows-2022", arch: ""},
           { name: "VS 2022 Win32 Release", platform: Win32, configuration: Release, toolset: v143, runner: "windows-2022", arch: ""},
-          { name: "VS 2019 x64 Release", platform: Win32, configuration: Release, toolset: v142, runner: "windows-2019", arch: ""},
-          { name: "VS 2019 Win32 Release", platform: x64, configuration: Release, toolset: v142, runner: "windows-2019", arch: ""},
+          { name: "VS 2025 x64 Debug", platform: x64, configuration: Debug, toolset: v143, runner: "windows-2025", arch: ""},
           { name: "VS 2022 x64 Release AVX2", platform: x64, configuration: Release, toolset: v143, runner: "windows-2022", arch: "AdvancedVectorExtensions2" },
         ]
     runs-on: ${{matrix.runner}}
@@ -435,8 +434,8 @@
         make clean
         LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j check
         LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j -C tests test-cli-tests
-        CFLAGS="-march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j check
-        CFLAGS="-march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j -C tests test-cli-tests
+        CFLAGS="-O3 -march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j check
+        CFLAGS="-O3 -march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j -C tests test-cli-tests
 # This test is only compatible with standard libraries that support BTI (Branch Target Identification).
 # Unfortunately, the standard library provided on Ubuntu 24.04 does not have this feature enabled.
 #        make clean
@@ -461,6 +460,9 @@
       if: ${{ matrix.name == 'RISC-V' }}
       run: |
         LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make clean check
+        CFLAGS="-march=rv64gcv -O3 -DMEM_FORCE_MEMORY_ACCESS=0" LDFLAGS="-static" CC=$XCC QEMU_SYS="$XEMU -cpu rv64,v=true,vlen=128" make clean check
+        CFLAGS="-march=rv64gcv -O3 -DMEM_FORCE_MEMORY_ACCESS=0" LDFLAGS="-static" CC=$XCC QEMU_SYS="$XEMU -cpu rv64,v=true,vlen=256" make clean check
+        CFLAGS="-march=rv64gcv -O3 -DMEM_FORCE_MEMORY_ACCESS=0" LDFLAGS="-static" CC=$XCC QEMU_SYS="$XEMU -cpu rv64,v=true,vlen=512" make clean check
     - name: M68K
       if: ${{ matrix.name == 'M68K' }}
       run: |
@@ -542,7 +544,7 @@
     steps:
     - run: git config --global core.autocrlf input
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # tag=v4.2.2
-    - uses: cygwin/cygwin-install-action@f61179d72284ceddc397ed07ddb444d82bf9e559 # tag=v5
+    - uses: cygwin/cygwin-install-action@f2009323764960f80959895c7bc3bb30210afe4d # tag=v6
       with:
         platform: x86_64
         packages: >-
diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 6131ad0..410068d 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -224,9 +224,17 @@
 #  if defined(__ARM_FEATURE_SVE2)
 #    define ZSTD_ARCH_ARM_SVE2
 #  endif
-# if defined(__riscv) && defined(__riscv_vector)
-#   define ZSTD_ARCH_RISCV_RVV
-# endif
+#if defined(__riscv) && defined(__riscv_vector)
+    #if defined(__clang__)  /* must be tested first: Clang also defines __GNUC__ */
+        #if __clang_major__ > 18 || (__clang_major__ == 18 && __clang_minor__ >= 1)
+            #define ZSTD_ARCH_RISCV_RVV
+        #endif
+    #elif defined(__GNUC__)
+        #if (__GNUC__ > 14 || (__GNUC__ == 14 && __GNUC_MINOR__ >= 1))
+            #define ZSTD_ARCH_RISCV_RVV
+        #endif
+    #endif
+#endif
 #
 #  if defined(ZSTD_ARCH_X86_AVX2)
 #    include <immintrin.h>
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index c164768..791b648 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -168,7 +168,7 @@
 *  Shared functions to include for inlining
 *********************************************/
 static void ZSTD_copy8(void* dst, const void* src) {
-#if defined(ZSTD_ARCH_ARM_NEON)
+#if defined(ZSTD_ARCH_ARM_NEON) && !defined(__aarch64__)
     vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
 #else
     ZSTD_memcpy(dst, src, 8);
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 9b7aaf9..aea7b98 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -56,6 +56,14 @@
 #  define ZSTD_HASHLOG3_MAX 17
 #endif
 
+
+/*-*************************************
+*  Forward declarations
+***************************************/
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
+    size_t nbSequences);
+
+
 /*-*************************************
 *  Helper functions
 ***************************************/
@@ -7118,7 +7126,7 @@
 }
 
 
-#if defined(__AVX2__)
+#if defined(ZSTD_ARCH_X86_AVX2)
 
 #include <immintrin.h>  /* AVX2 intrinsics */
 
@@ -7138,7 +7146,7 @@
  * @returns > 0 if there is one long length (> 65535),
  * indicating the position, and type.
  */
-static size_t convertSequences_noRepcodes(
+size_t convertSequences_noRepcodes(
     SeqDef* dstSeqs,
     const ZSTD_Sequence* inSeqs,
     size_t nbSequences)
@@ -7287,7 +7295,7 @@
 #elif defined ZSTD_ARCH_RISCV_RVV
 #include <riscv_vector.h>
 /*
- * Convert `vl` sequences per iteration, using AVX2 intrinsics:
+ * Convert `vl` sequences per iteration, using RVV intrinsics:
  *   - offset -> offBase = offset + 2
  *   - litLength -> (U16) litLength
  *   - matchLength -> (U16)(matchLength - 3)
@@ -7298,9 +7306,10 @@
  * @returns > 0 if there is one long length (> 65535),
  * indicating the position, and type.
  */
-static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
     size_t longLen = 0;
-
+    size_t vl = 0;
+    typedef uint32_t __attribute__((may_alias)) aliased_u32;
     /* RVV depends on the specific definition of target structures */
     ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
     ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
@@ -7310,62 +7319,68 @@
     ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
     ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
     ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
-    size_t vl = 0;
+    
     for (size_t i = 0; i < nbSequences; i += vl) {
 
-        vl = __riscv_vsetvl_e32m2(nbSequences-i);
-        // Loading structure member variables
-        vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
-            (const int32_t*)&inSeqs[i], 
-            vl
-        );
-        vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
-        vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
-        vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
-        // offset + ZSTD_REP_NUM
-        vuint32m2_t v_offBase = __riscv_vadd_vx_u32m2(v_offset, ZSTD_REP_NUM, vl); 
-        // Check for integer overflow
-        // Cast to a 16-bit variable
-        vbool16_t lit_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_lit, 65535, vl);
-        vuint16m1_t v_lit_clamped = __riscv_vncvt_x_x_w_u16m1(v_lit, vl);
+        vl = __riscv_vsetvl_e32m2(nbSequences-i);       
+        {
+            // Loading structure member variables
+            vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
+                (const aliased_u32*)((const void*)&inSeqs[i]), 
+                vl
+            );
+            vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
+            vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
+            vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
+            // offset + ZSTD_REP_NUM
+            vuint32m2_t v_offBase = __riscv_vadd_vx_u32m2(v_offset, ZSTD_REP_NUM, vl); 
+            // Check for integer overflow
+            // Cast to a 16-bit variable
+            vbool16_t lit_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_lit, 65535, vl);
+            vuint16m1_t v_lit_clamped = __riscv_vncvt_x_x_w_u16m1(v_lit, vl);
 
-        vbool16_t ml_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_match, 65535+MINMATCH, vl);
-        vuint16m1_t v_ml_clamped = __riscv_vncvt_x_x_w_u16m1(__riscv_vsub_vx_u32m2(v_match, MINMATCH, vl), vl);
+            vbool16_t ml_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_match, 65535+MINMATCH, vl);
+            vuint16m1_t v_ml_clamped = __riscv_vncvt_x_x_w_u16m1(__riscv_vsub_vx_u32m2(v_match, MINMATCH, vl), vl);
 
-        // Pack two 16-bit fields into a 32-bit value (little-endian)
-        // The lower 16 bits contain litLength, and the upper 16 bits contain mlBase
-        vuint32m2_t v_lit_ml_combined = __riscv_vsll_vx_u32m2(
-            __riscv_vwcvtu_x_x_v_u32m2(v_ml_clamped, vl), // Convert matchLength to 32-bit
-            16, 
-            vl
-        );
-        v_lit_ml_combined = __riscv_vor_vv_u32m2(
-            v_lit_ml_combined,
-            __riscv_vwcvtu_x_x_v_u32m2(v_lit_clamped, vl),
-            vl
-        );
-        // Create a vector of SeqDef structures
-        // Store the offBase, litLength, and mlBase in a vector of SeqDef
-        vuint32m2x2_t store_data = __riscv_vcreate_v_u32m2x2(
-            v_offBase,          
-            v_lit_ml_combined   
-        );
-        __riscv_vsseg2e32_v_u32m2x2(
-            (uint32_t*)&dstSeqs[i], 
-            store_data,             
-            vl                      
-        );
-        // Find the first index where an overflow occurs
-        int first_ml = __riscv_vfirst_m_b16(ml_overflow, vl);
-        int first_lit = __riscv_vfirst_m_b16(lit_overflow, vl);
+            // Pack two 16-bit fields into a 32-bit value (little-endian)
+            // The lower 16 bits contain litLength, and the upper 16 bits contain mlBase
+            vuint32m2_t v_lit_ml_combined = __riscv_vsll_vx_u32m2(
+                __riscv_vwcvtu_x_x_v_u32m2(v_ml_clamped, vl), // Convert matchLength to 32-bit
+                16, 
+                vl
+            );
+            v_lit_ml_combined = __riscv_vor_vv_u32m2(
+                v_lit_ml_combined,
+                __riscv_vwcvtu_x_x_v_u32m2(v_lit_clamped, vl),
+                vl
+            );
+            {
+                // Create a vector of SeqDef structures
+                // Store the offBase, litLength, and mlBase in a vector of SeqDef
+                vuint32m2x2_t store_data = __riscv_vcreate_v_u32m2x2(
+                    v_offBase,          
+                    v_lit_ml_combined   
+                );
+                __riscv_vsseg2e32_v_u32m2x2(
+                    (aliased_u32*)((void*)&dstSeqs[i]), 
+                    store_data,             
+                    vl                      
+                );
+            }
+            {
+                // Find the first index where an overflow occurs
+                int first_ml = __riscv_vfirst_m_b16(ml_overflow, vl);
+                int first_lit = __riscv_vfirst_m_b16(lit_overflow, vl);
 
-        if (UNLIKELY(first_ml != -1)) {
-            assert(longLen == 0);
-            longLen = i + first_ml + 1;
-        }
-        if (UNLIKELY(first_lit != -1)) {
-            assert(longLen == 0);
-            longLen = i + first_lit + 1 + nbSequences;
+                if (UNLIKELY(first_ml != -1)) {
+                    assert(longLen == 0);
+                    longLen = i + first_ml + 1;
+                }
+                if (UNLIKELY(first_lit != -1)) {
+                    assert(longLen == 0);
+                    longLen = i + first_lit + 1 + nbSequences;
+                }
+            }
         }
     }
     return longLen;
@@ -7375,9 +7390,131 @@
  * but since this implementation is targeting modern systems (>= Sapphire Rapid),
  * it's not useful to develop and maintain code for older pre-AVX2 platforms */
 
-#else /* no AVX2 */
+#elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
 
-static size_t convertSequences_noRepcodes(
+size_t convertSequences_noRepcodes(
+    SeqDef* dstSeqs,
+    const ZSTD_Sequence* inSeqs,
+    size_t nbSequences)
+{
+    size_t longLen = 0;
+    size_t n = 0;
+
+    /* Neon permutation depends on the specific definition of target structures. */
+    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
+    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
+
+    if (nbSequences > 3) {
+        static const ZSTD_ALIGNED(16) U32 constAddition[4] = {
+            ZSTD_REP_NUM, 0, -MINMATCH, 0
+        };
+        static const ZSTD_ALIGNED(16) U8 constMask[16] = {
+            0, 1, 2, 3, 4, 5, 8, 9, 16, 17, 18, 19, 20, 21, 24, 25
+        };
+        static const ZSTD_ALIGNED(16) U16 constCounter[8] = {
+            1, 1, 1, 1, 2, 2, 2, 2
+        };
+
+        const uint32x4_t vaddition = vld1q_u32(constAddition);
+        const uint8x16_t vmask = vld1q_u8(constMask);
+        uint16x8_t vcounter = vld1q_u16(constCounter);
+        uint16x8_t vindex01 = vdupq_n_u16(0);
+        uint16x8_t vindex23 = vdupq_n_u16(0);
+
+        do {
+            /* Load 4 ZSTD_Sequence (64 bytes). */
+            const uint32x4_t vin0 = vld1q_u32(&inSeqs[n + 0].offset);
+            const uint32x4_t vin1 = vld1q_u32(&inSeqs[n + 1].offset);
+            const uint32x4_t vin2 = vld1q_u32(&inSeqs[n + 2].offset);
+            const uint32x4_t vin3 = vld1q_u32(&inSeqs[n + 3].offset);
+
+            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each vector. */
+            const uint8x16x2_t vadd01 = { {
+                vreinterpretq_u8_u32(vaddq_u32(vin0, vaddition)),
+                vreinterpretq_u8_u32(vaddq_u32(vin1, vaddition)),
+            } };
+            const uint8x16x2_t vadd23 = { {
+                vreinterpretq_u8_u32(vaddq_u32(vin2, vaddition)),
+                vreinterpretq_u8_u32(vaddq_u32(vin3, vaddition)),
+            } };
+
+            /* Shuffle and pack bytes so each vector contains 2 SeqDef structures. */
+            const uint8x16_t vout01 = vqtbl2q_u8(vadd01, vmask);
+            const uint8x16_t vout23 = vqtbl2q_u8(vadd23, vmask);
+
+            /* Pack the upper 16-bits of 32-bit lanes for overflow check. */
+            uint16x8_t voverflow01 = vuzp2q_u16(vreinterpretq_u16_u8(vadd01.val[0]),
+                                                vreinterpretq_u16_u8(vadd01.val[1]));
+            uint16x8_t voverflow23 = vuzp2q_u16(vreinterpretq_u16_u8(vadd23.val[0]),
+                                                vreinterpretq_u16_u8(vadd23.val[1]));
+
+            /* Store 4 SeqDef structures. */
+            vst1q_u32(&dstSeqs[n + 0].offBase, vreinterpretq_u32_u8(vout01));
+            vst1q_u32(&dstSeqs[n + 2].offBase, vreinterpretq_u32_u8(vout23));
+
+            /* Create masks in case of overflow. */
+            voverflow01 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow01));
+            voverflow23 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow23));
+
+            /* Update overflow indices. */
+            vindex01 = vbslq_u16(voverflow01, vcounter, vindex01);
+            vindex23 = vbslq_u16(voverflow23, vcounter, vindex23);
+
+            /* Update counter for overflow check. */
+            vcounter = vaddq_u16(vcounter, vdupq_n_u16(4));
+
+            n += 4;
+        } while(n < nbSequences - 3);
+
+        /* Fix up indices in the second vector: the loop reuses one counter for
+           both index vectors, so non-zero entries of `vindex23` are 2 too low.
+           `nonzero` is all-ones (-1) in those lanes; subtracting it twice adds 2. */
+        {   uint16x8_t nonzero = vtstq_u16(vindex23, vindex23);
+            vindex23 = vsubq_u16(vindex23, nonzero);
+            vindex23 = vsubq_u16(vindex23, nonzero);
+        }
+
+        /* Merge indices in the vectors, maximums are needed. */
+        vindex01 = vmaxq_u16(vindex01, vindex23);
+        vindex01 = vmaxq_u16(vindex01, vextq_u16(vindex01, vindex01, 4));
+
+        /* Compute `longLen`, maximums of matchLength and litLength
+           with a preference on litLength. */
+        {   U64 maxLitMatchIndices = vgetq_lane_u64(vreinterpretq_u64_u16(vindex01), 0);
+            size_t maxLitIndex = (maxLitMatchIndices >> 16) & 0xFFFF;
+            size_t maxMatchIndex = (maxLitMatchIndices >> 32) & 0xFFFF;
+            longLen = maxLitIndex > maxMatchIndex ? maxLitIndex + nbSequences
+                                                  : maxMatchIndex;
+        }
+    }
+
+    /* Handle remaining elements. */
+    for (; n < nbSequences; n++) {
+        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
+        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+        /* Check for long length > 65535. */
+        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
+            assert(longLen == 0);
+            longLen = n + 1;
+        }
+        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+            assert(longLen == 0);
+            longLen = n + nbSequences + 1;
+        }
+    }
+    return longLen;
+}
+
+#else /* No vectorization. */
+
+size_t convertSequences_noRepcodes(
     SeqDef* dstSeqs,
     const ZSTD_Sequence* inSeqs,
     size_t nbSequences)
@@ -7388,7 +7525,7 @@
         dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
         dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
         dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
-        /* check for long length > 65535 */
+        /* Check for long length > 65535. */
         if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) {
             assert(longLen == 0);
             longLen = n + 1;
@@ -7547,18 +7684,17 @@
     size_t i = 0;
     int found_terminator = 0; 
     size_t vl_max = __riscv_vsetvlmax_e32m1();
+    typedef uint32_t __attribute__((may_alias)) aliased_u32;
     vuint32m1_t v_lit_sum = __riscv_vmv_v_x_u32m1(0, vl_max);
     vuint32m1_t v_match_sum = __riscv_vmv_v_x_u32m1(0, vl_max);
 
     for (; i  < nbSeqs; ) {
         size_t vl = __riscv_vsetvl_e32m2(nbSeqs - i); 
 
-        ptrdiff_t stride = sizeof(ZSTD_Sequence); // 16
         vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
-            (const int32_t*)&seqs[i], 
+            (const aliased_u32*)((const void*)&seqs[i]), 
             vl
         );
-        vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
         vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
         vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
 
@@ -7604,29 +7740,104 @@
 
 #else
 
+/*
+ * The function assumes `litMatchLength` is a packed 64-bit value loaded from
+ * memory in which `litLength` is immediately followed by `matchLength`. Which
+ * half holds the match length therefore depends on the system's endianness:
+ * - On little-endian systems, the match length is the upper 32 bits, so it is
+ * zero exactly when the entire 64-bit value is at most 0xFFFFFFFF.
+ * - On big-endian systems, the match length is the lower 32 bits.
+ *
+ * @returns 1 if the match length is zero, 0 otherwise.
+ */
+FORCE_INLINE_TEMPLATE int matchLengthHalfIsZero(U64 litMatchLength)
+{
+    if (MEM_isLittleEndian()) {
+        return litMatchLength <= 0xFFFFFFFFULL;
+    } else {
+        return (U32)litMatchLength == 0;
+    }
+}
+
 BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
 {
-    size_t totalMatchSize = 0;
-    size_t litSize = 0;
-    size_t n;
+    /* Use multiple accumulators for efficient use of wide out-of-order machines. */
+    U64 litMatchSize0 = 0;
+    U64 litMatchSize1 = 0;
+    U64 litMatchSize2 = 0;
+    U64 litMatchSize3 = 0;
+    size_t n = 0;
+
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) + 4 == offsetof(ZSTD_Sequence, matchLength));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) + 4 == offsetof(ZSTD_Sequence, rep));
     assert(seqs);
-    for (n=0; n<nbSeqs; n++) {
-        totalMatchSize += seqs[n].matchLength;
-        litSize += seqs[n].litLength;
-        if (seqs[n].matchLength == 0) {
+
+    if (nbSeqs > 3) {
+        /* Process the input in 4 independent streams to reach high throughput. */
+        do {
+            /* Load `litLength` and `matchLength` as a packed `U64`. It is safe
+             * to use 64-bit unsigned arithmetic here because the sum of `litLength`
+             * and `matchLength` cannot exceed the block size, so the 32-bit
+             * subparts will never overflow. */
+            U64 litMatchLength = MEM_read64(&seqs[n].litLength);
+            litMatchSize0 += litMatchLength;
+            if (matchLengthHalfIsZero(litMatchLength)) {
+                assert(seqs[n].offset == 0);
+                goto _out;
+            }
+
+            litMatchLength = MEM_read64(&seqs[n + 1].litLength);
+            litMatchSize1 += litMatchLength;
+            if (matchLengthHalfIsZero(litMatchLength)) {
+                n += 1;
+                assert(seqs[n].offset == 0);
+                goto _out;
+            }
+
+            litMatchLength = MEM_read64(&seqs[n + 2].litLength);
+            litMatchSize2 += litMatchLength;
+            if (matchLengthHalfIsZero(litMatchLength)) {
+                n += 2;
+                assert(seqs[n].offset == 0);
+                goto _out;
+            }
+
+            litMatchLength = MEM_read64(&seqs[n + 3].litLength);
+            litMatchSize3 += litMatchLength;
+            if (matchLengthHalfIsZero(litMatchLength)) {
+                n += 3;
+                assert(seqs[n].offset == 0);
+                goto _out;
+            }
+
+            n += 4;
+        } while(n < nbSeqs - 3);
+    }
+
+    for (; n < nbSeqs; n++) {
+        U64 litMatchLength = MEM_read64(&seqs[n].litLength);
+        litMatchSize0 += litMatchLength;
+        if (matchLengthHalfIsZero(litMatchLength)) {
             assert(seqs[n].offset == 0);
-            break;
+            goto _out;
         }
     }
-    if (n==nbSeqs) {
-        BlockSummary bs;
+    /* At this point n == nbSeqs, so no end terminator. */
+    {   BlockSummary bs;
         bs.nbSequences = ERROR(externalSequences_invalid);
         return bs;
     }
+_out:
+    litMatchSize0 += litMatchSize1 + litMatchSize2 + litMatchSize3;
     {   BlockSummary bs;
-        bs.nbSequences = n+1;
-        bs.blockSize = litSize + totalMatchSize;
-        bs.litSize = litSize;
+        bs.nbSequences = n + 1;
+        if (MEM_isLittleEndian()) {
+            bs.litSize = (U32)litMatchSize0;
+            bs.blockSize = bs.litSize + (litMatchSize0 >> 32);
+        } else {
+            bs.litSize = litMatchSize0 >> 32;
+            bs.blockSize = bs.litSize + (U32)litMatchSize0;
+        }
         return bs;
     }
 }
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 6174a25..b2ccd92 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -1236,6 +1236,10 @@
 ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
 {
     seq_t seq;
+#if defined(__aarch64__)
+    size_t prevOffset0 = seqState->prevOffset[0];
+    size_t prevOffset1 = seqState->prevOffset[1];
+    size_t prevOffset2 = seqState->prevOffset[2];
     /*
      * ZSTD_seqSymbol is a 64 bits wide structure.
      * It can be loaded in one operation
@@ -1244,7 +1248,7 @@
      * operations that cause performance drop. This can be avoided by using this
      * ZSTD_memcpy hack.
      */
-#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+#  if defined(__GNUC__) && !defined(__clang__)
     ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
     ZSTD_seqSymbol* const llDInfo = &llDInfoS;
     ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
@@ -1252,11 +1256,11 @@
     ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
     ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
     ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
-#else
+#  else
     const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
     const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
     const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
-#endif
+#  endif
     seq.matchLength = mlDInfo->baseValue;
     seq.litLength = llDInfo->baseValue;
     {   U32 const ofBase = ofDInfo->baseValue;
@@ -1275,10 +1279,116 @@
         assert(llBits <= MaxLLBits);
         assert(mlBits <= MaxMLBits);
         assert(ofBits <= MaxOff);
-        /*
-         * As gcc has better branch and block analyzers, sometimes it is only
-         * valuable to mark likeliness for clang, it gives around 3-4% of
-         * performance.
+        /* As GCC has better branch and block analyzers, sometimes it is only
+         * valuable to mark likeliness for Clang.
+         */
+
+        /* sequence */
+        {   size_t offset;
+            if (ofBits > 1) {
+                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
+                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                    /* Always read extra bits, this keeps the logic simple,
+                     * avoids branches, and avoids accidentally reading 0 bits.
+                     */
+                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
+                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                    BIT_reloadDStream(&seqState->DStream);
+                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+                } else {
+                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+                }
+                prevOffset2 = prevOffset1;
+                prevOffset1 = prevOffset0;
+                prevOffset0 = offset;
+            } else {
+                U32 const ll0 = (llDInfo->baseValue == 0);
+                if (LIKELY((ofBits == 0))) {
+                    if (ll0) {
+                        offset = prevOffset1;
+                        prevOffset1 = prevOffset0;
+                        prevOffset0 = offset;
+                    } else {
+                        offset = prevOffset0;
+                    }
+                } else {
+                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+                    {   size_t temp = (offset == 1)   ? prevOffset1
+                                      : (offset == 3) ? prevOffset0 - 1
+                                      : (offset >= 2) ? prevOffset2
+                                      : prevOffset0;
+                        /* 0 is not valid: input corrupted => force offset to -1 =>
+                         * corruption detected at execSequence.
+                         */
+                        temp -= !temp;
+                        prevOffset2 = (offset == 1) ? prevOffset2 : prevOffset1;
+                        prevOffset1 = prevOffset0;
+                        prevOffset0 = offset = temp;
+            }   }   }
+            seq.offset = offset;
+        }
+
+        if (mlBits > 0) {
+            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+            if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+                BIT_reloadDStream(&seqState->DStream);
+            if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_reloadDStream(&seqState->DStream);
+        }
+
+        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+        if (llBits > 0)
+            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+        if (MEM_32bits())
+            BIT_reloadDStream(&seqState->DStream);
+
+        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+        if (!isLastSeq) {
+            /* Don't update FSE state for last sequence. */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
+            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
+            BIT_reloadDStream(&seqState->DStream);
+        }
+    }
+    seqState->prevOffset[0] = prevOffset0;
+    seqState->prevOffset[1] = prevOffset1;
+    seqState->prevOffset[2] = prevOffset2;
+#else   /* !defined(__aarch64__) */
+    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+    seq.matchLength = mlDInfo->baseValue;
+    seq.litLength = llDInfo->baseValue;
+    {   U32 const ofBase = ofDInfo->baseValue;
+        BYTE const llBits = llDInfo->nbAdditionalBits;
+        BYTE const mlBits = mlDInfo->nbAdditionalBits;
+        BYTE const ofBits = ofDInfo->nbAdditionalBits;
+        BYTE const totalBits = llBits+mlBits+ofBits;
+
+        U16 const llNext = llDInfo->nextState;
+        U16 const mlNext = mlDInfo->nextState;
+        U16 const ofNext = ofDInfo->nextState;
+        U32 const llnbBits = llDInfo->nbBits;
+        U32 const mlnbBits = mlDInfo->nbBits;
+        U32 const ofnbBits = ofDInfo->nbBits;
+
+        assert(llBits <= MaxLLBits);
+        assert(mlBits <= MaxMLBits);
+        assert(ofBits <= MaxOff);
+        /* As GCC has better branch and block analyzers, sometimes it is only
+         * valuable to mark likeliness for Clang.
          */
 
         /* sequence */
@@ -1340,7 +1450,7 @@
                     (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
 
         if (!isLastSeq) {
-            /* don't update FSE state for last Sequence */
+            /* Don't update FSE state for last sequence. */
             ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
             ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
             if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
@@ -1348,6 +1458,7 @@
             BIT_reloadDStream(&seqState->DStream);
         }
     }
+#endif  /* defined(__aarch64__) */
 
     return seq;
 }
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index da380ac..0bc160e 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -45,6 +45,7 @@
 #include "zstd_internal.h" /* ZSTD_WORKSPACETOOLARGE_MAXDURATION, ZSTD_WORKSPACETOOLARGE_FACTOR, KB, MB */
 #include "threading.h"    /* ZSTD_pthread_create, ZSTD_pthread_join */
 #include "compress/hist.h" /* HIST_count_wksp */
+#include "compress/zstd_compress_internal.h" /* ZSTD_get1BlockSummary */
 
 
 /*-************************************
@@ -769,6 +770,210 @@
     DISPLAYLEVEL(3, "OK \n");
 }
 
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
+    size_t nbSequences);   /* prototype only: function under test; no body in this hunk, presumably defined in the library (confirm) */
+
+/* Scalar reference conversion; returns 0, or a nonzero marker locating a too-long length. */
+static size_t convertSequences_noRepcodes_ref(
+    SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences)
+{
+    size_t longLen = 0;   /* stays 0 while no over-long length has been seen */
+    size_t n = 0;
+    while (n < nbSequences) {
+        SeqDef* const dst = &dstSeqs[n];
+        dst->offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+        dst->litLength = (U16)inSeqs[n].litLength;   /* stored as 16 bits */
+        dst->mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+        if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) {
+            assert(longLen == 0);
+            longLen = n + 1;   /* over-long match length : index + 1 */
+        }
+        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+            assert(longLen == 0);
+            longLen = n + nbSequences + 1;   /* over-long literal length : index + nbSequences + 1 */
+        }
+        n++;
+    }
+    return longLen;
+}
+
+static unsigned test_convertSequences_noRepcodes(unsigned seed, unsigned testNb)
+{   /* Cross-checks convertSequences_noRepcodes() against the _ref version; returns the next unused test number. */
+    ZSTD_Sequence nsrc[12];        /* input sequences shared by every sub-test */
+    SeqDef ndst[12], rdst[12];     /* outputs: function under test / reference */
+    size_t ref, ret, i, j;
+
+    seed += 0xDEADBEEF;
+    for (i = 0; i < COUNTOF(nsrc); ++i) {   /* pseudo-random fill derived only from seed and i */
+        seed = 48271 * ((unsigned)i + seed);
+        nsrc[i].offset = (seed & 0xFFFF) | 1;   /* Offset shall not be zero. */
+        seed = 48271 * ((unsigned)i + seed);
+        nsrc[i].litLength = seed & 0xFFFF;   /* always <= 65535 */
+        seed = 48271 * ((unsigned)i + seed);
+        nsrc[i].matchLength = (seed & 0xFFFFFF) % (65536 + MINMATCH);   /* always <= 65535 + MINMATCH */
+        seed = 48271 * ((unsigned)i + seed);
+        nsrc[i].rep = seed & 0xFF;
+    }
+
+    /* Boundary values: lengths exactly at the limits, and matchLength 0 so that (matchLength - MINMATCH) goes below zero. */
+    nsrc[5].matchLength = 65535 + MINMATCH;
+    nsrc[6].litLength = 65535;
+    nsrc[6].matchLength = 0;
+    nsrc[7].litLength = 0;
+    nsrc[7].matchLength = MINMATCH;
+
+    for (i = 0; i <= COUNTOF(nsrc); ++i) {   /* every prefix length, 0 through COUNTOF(nsrc) inclusive */
+        DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs : ",
+                     testNb++, (unsigned)i);
+        memset(ndst, 0, sizeof(ndst));   /* zero both outputs so the whole-array memcmp below also covers unwritten entries */
+        memset(rdst, 0, sizeof(rdst));
+        ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
+        ret = convertSequences_noRepcodes(ndst, nsrc, i);
+        CHECK_EQ(ret, ref);
+        CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+        DISPLAYLEVEL(3, "OK \n");
+    }
+
+    nsrc[7].matchLength = 65536 + MINMATCH;   /* first value above the reference's limit (> 65535+MINMATCH) */
+    for (i = 8; i <= COUNTOF(nsrc); ++i) {    /* i >= 8, so nsrc[7] is always part of the input */
+        DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+                     "matchLength overflow : ",
+                     testNb++, (unsigned)i);
+        memset(ndst, 0, sizeof(ndst));
+        memset(rdst, 0, sizeof(rdst));
+        ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
+        ret = convertSequences_noRepcodes(ndst, nsrc, i);
+        CHECK_EQ(ret, ref);
+        CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+        DISPLAYLEVEL(3, "OK \n");
+
+        assert(COUNTOF(nsrc) > 8);
+        for (j = 4; j < 8; ++j) {   /* same data started at nsrc[j]: the over-long entry lands at index 7-j */
+            DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+                         "matchLength overflow #%u : ",
+                         testNb++, (unsigned)i, (unsigned)(i - j));
+            memset(ndst, 0, sizeof(ndst));
+            memset(rdst, 0, sizeof(rdst));
+            ref = convertSequences_noRepcodes_ref(rdst, nsrc + j, i - j);
+            ret = convertSequences_noRepcodes(ndst, nsrc + j, i - j);
+            CHECK_EQ(ret, ref);
+            CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+            DISPLAYLEVEL(3, "OK \n");
+        }
+    }
+    nsrc[7].matchLength = 1;   /* drop the matchLength overflow before the litLength pass */
+
+    nsrc[7].litLength = 65536;   /* first value above the reference's limit (> 65535) */
+    for (i = 8; i <= COUNTOF(nsrc); ++i) {   /* same structure as the matchLength pass above */
+        DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+                     "litLength overflow: ",
+                     testNb++, (unsigned)i);
+        memset(ndst, 0, sizeof(ndst));
+        memset(rdst, 0, sizeof(rdst));
+        ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
+        ret = convertSequences_noRepcodes(ndst, nsrc, i);
+        CHECK_EQ(ret, ref);
+        CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+        DISPLAYLEVEL(3, "OK \n");
+
+        assert(COUNTOF(nsrc) > 8);
+        for (j = 4; j < 8; ++j) {   /* shifted start: the over-long entry lands at index 7-j */
+            DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+                         "litLength overflow #%u: ",
+                         testNb++, (unsigned)i, (unsigned)(i - j));
+            memset(ndst, 0, sizeof(ndst));
+            memset(rdst, 0, sizeof(rdst));
+            ref = convertSequences_noRepcodes_ref(rdst, nsrc + j, i - j);
+            ret = convertSequences_noRepcodes(ndst, nsrc + j, i - j);
+            CHECK_EQ(ret, ref);
+            CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+            DISPLAYLEVEL(3, "OK \n");
+        }
+    }
+
+    return testNb;   /* incremented once per sub-test displayed above */
+}
+
+static unsigned test_get1BlockSummary(unsigned testNb)
+{   /* Checks ZSTD_get1BlockSummary() on fixed inputs; returns the next unused test number. */
+    static const ZSTD_Sequence nseqs[] = {   /* presumably {offset, litLength, matchLength, rep}; field order is declared out of view */
+        { 10, 2, 4, 1 },
+        { 20, 3, 5, 2 },
+        { 30, 6, 8, 3 },
+        { 40, 7, 9, 4 },
+        { 50, 10, 12, 5 },
+        { 60, 11, 13, 6 },
+        { 0,  14, 0, 7 },   /* index 6: presumably the block terminator (1st and 3rd fields are 0) */
+        { 70, 15, 17, 8 },
+        { 80, 16, 18, 9 },
+        { 90, 19, 21, 1 },
+        { 99, 20, 22, 2 },
+    };
+    static const BlockSummary blocks[] = {   /* blocks[i]: expected result when the input starts at nseqs[i] */
+        { 7, 104, 53 },
+        { 6, 98, 51 },
+        { 5, 90, 48 },
+        { 4, 76, 42 },
+        { 3, 60, 35 },
+        { 2, 38, 25 },
+        { 1, 14, 14 },
+    };
+    size_t i;
+
+    DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with empty array : ", testNb++);
+    {
+        BlockSummary bs = ZSTD_get1BlockSummary(nseqs, 0);   /* count 0: nseqs content is irrelevant */
+        CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));   /* error code is expected in nbSequences */
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with 1 literal only : ", testNb++);
+    {
+        static const ZSTD_Sequence seqs[] = { { 0, 5, 0, 0 } };   /* single entry with zero 1st and 3rd fields */
+        BlockSummary bs = ZSTD_get1BlockSummary(seqs, 1);
+        CHECK_EQ(bs.nbSequences, 1);
+        CHECK_EQ(bs.litSize, 5);
+        CHECK_EQ(bs.blockSize, 5);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with no terminator : ", testNb++);
+    {
+        static const ZSTD_Sequence seqs[] = { { 10, 2, 4, 0 }, { 20, 3, 5, 0 } };   /* no entry with zero 1st and 3rd fields */
+        BlockSummary bs = ZSTD_get1BlockSummary(seqs, 2);
+        CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with rep ignored : ", testNb++);
+    {
+        static const ZSTD_Sequence seqs[] = {   /* 4th field is nonzero on every entry */
+            { 10, 2, 4, 2 },
+            { 10, 3, 5, 2 },
+            { 0, 7, 0, 3 },
+        };
+        BlockSummary bs = ZSTD_get1BlockSummary(seqs, 3);
+        CHECK_EQ(bs.nbSequences, 3);
+        CHECK_EQ(bs.litSize, 2 + 3 + 7);                  /* sum of the 2nd fields */
+        CHECK_EQ(bs.blockSize, (4 + 5) + (2 + 3 + 7));    /* 3rd fields plus 2nd fields; 4th fields do not contribute */
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    assert(COUNTOF(nseqs) > COUNTOF(blocks));   /* keeps nseqs + i and COUNTOF(nseqs) - i in range below */
+    for (i = 0; i < COUNTOF(blocks); ++i) {     /* slide the start through nseqs[0..6]; entries after index 6 are never counted */
+        BlockSummary bs;
+        DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with %u inputs : ",
+                     testNb++, (unsigned)(COUNTOF(nseqs) - i));
+        bs = ZSTD_get1BlockSummary(nseqs + i, COUNTOF(nseqs) - i);
+        CHECK_EQ(bs.nbSequences, blocks[i].nbSequences);
+        CHECK_EQ(bs.litSize, blocks[i].litSize);
+        CHECK_EQ(bs.blockSize, blocks[i].blockSize);
+        DISPLAYLEVEL(3, "OK \n");
+    }
+
+    return testNb;   /* incremented once per sub-test displayed above */
+}
+
 /* ============================================================= */
 
 static int basicUnitTests(U32 const seed, double compressibility)
@@ -4004,6 +4209,10 @@
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    testNb = test_convertSequences_noRepcodes(seed, testNb);
+
+    testNb = test_get1BlockSummary(testNb);
+
     DISPLAYLEVEL(3, "test%3i : ZSTD_compressSequencesAndLiterals : ", testNb++);
     {
         const size_t srcSize = 497000;