Merge pull request #4394 from AZero13/zstd
Remove redundant setting of allJobsCompleted to 1
diff --git a/.github/workflows/dev-short-tests.yml b/.github/workflows/dev-short-tests.yml
index 53f640d..62667f3 100644
--- a/.github/workflows/dev-short-tests.yml
+++ b/.github/workflows/dev-short-tests.yml
@@ -314,8 +314,7 @@
{ name: "VS 2022 Win32 Debug", platform: Win32, configuration: Debug, toolset: v143, runner: "windows-2022", arch: "" },
{ name: "VS 2022 x64 Release", platform: x64, configuration: Release, toolset: v143, runner: "windows-2022", arch: ""},
{ name: "VS 2022 Win32 Release", platform: Win32, configuration: Release, toolset: v143, runner: "windows-2022", arch: ""},
- { name: "VS 2019 x64 Release", platform: Win32, configuration: Release, toolset: v142, runner: "windows-2019", arch: ""},
- { name: "VS 2019 Win32 Release", platform: x64, configuration: Release, toolset: v142, runner: "windows-2019", arch: ""},
+ { name: "VS 2025 x64 Debug", platform: x64, configuration: Debug, toolset: v143, runner: "windows-2025", arch: ""},
{ name: "VS 2022 x64 Release AVX2", platform: x64, configuration: Release, toolset: v143, runner: "windows-2022", arch: "AdvancedVectorExtensions2" },
]
runs-on: ${{matrix.runner}}
@@ -435,8 +434,8 @@
make clean
LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j check
LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j -C tests test-cli-tests
- CFLAGS="-march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j check
- CFLAGS="-march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j -C tests test-cli-tests
+ CFLAGS="-O3 -march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j check
+ CFLAGS="-O3 -march=armv8.2-a+sve2" LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make -j -C tests test-cli-tests
# This test is only compatible with standard libraries that support BTI (Branch Target Identification).
# Unfortunately, the standard library provided on Ubuntu 24.04 does not have this feature enabled.
# make clean
@@ -461,6 +460,9 @@
if: ${{ matrix.name == 'RISC-V' }}
run: |
LDFLAGS="-static" CC=$XCC QEMU_SYS=$XEMU make clean check
+ CFLAGS="-march=rv64gcv -O3" LDFLAGS="-static -DMEM_FORCE_MEMORY_ACCESS=0" CC=$XCC QEMU_SYS="$XEMU -cpu rv64,v=true,vlen=128" make clean check
+ CFLAGS="-march=rv64gcv -O3" LDFLAGS="-static -DMEM_FORCE_MEMORY_ACCESS=0" CC=$XCC QEMU_SYS="$XEMU -cpu rv64,v=true,vlen=256" make clean check
+ CFLAGS="-march=rv64gcv -O3" LDFLAGS="-static -DMEM_FORCE_MEMORY_ACCESS=0" CC=$XCC QEMU_SYS="$XEMU -cpu rv64,v=true,vlen=512" make clean check
- name: M68K
if: ${{ matrix.name == 'M68K' }}
run: |
@@ -542,7 +544,7 @@
steps:
- run: git config --global core.autocrlf input
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # tag=v4.2.2
- - uses: cygwin/cygwin-install-action@f61179d72284ceddc397ed07ddb444d82bf9e559 # tag=v5
+ - uses: cygwin/cygwin-install-action@f2009323764960f80959895c7bc3bb30210afe4d # tag=v6
with:
platform: x86_64
packages: >-
diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 6131ad0..410068d 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -224,9 +224,17 @@
# if defined(__ARM_FEATURE_SVE2)
# define ZSTD_ARCH_ARM_SVE2
# endif
-# if defined(__riscv) && defined(__riscv_vector)
-# define ZSTD_ARCH_RISCV_RVV
-# endif
+#if defined(__riscv) && defined(__riscv_vector)
+ #if defined(__clang__) /* must be tested first: Clang also defines __GNUC__ */
+ #if __clang_major__ > 18 || (__clang_major__ == 18 && __clang_minor__ >= 1)
+ #define ZSTD_ARCH_RISCV_RVV
+ #endif
+ #elif defined(__GNUC__) /* genuine GCC: RVV intrinsics used here need >= 14.1 */
+ #if (__GNUC__ > 14 || (__GNUC__ == 14 && __GNUC_MINOR__ >= 1))
+ #define ZSTD_ARCH_RISCV_RVV
+ #endif
+ #endif
+#endif
#
# if defined(ZSTD_ARCH_X86_AVX2)
# include <immintrin.h>
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index c164768..791b648 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -168,7 +168,7 @@
* Shared functions to include for inlining
*********************************************/
static void ZSTD_copy8(void* dst, const void* src) {
-#if defined(ZSTD_ARCH_ARM_NEON)
+#if defined(ZSTD_ARCH_ARM_NEON) && !defined(__aarch64__)
vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
#else
ZSTD_memcpy(dst, src, 8);
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 9b7aaf9..aea7b98 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -56,6 +56,14 @@
# define ZSTD_HASHLOG3_MAX 17
#endif
+
+/*-*************************************
+* Forward declarations
+***************************************/
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
+ size_t nbSequences);
+
+
/*-*************************************
* Helper functions
***************************************/
@@ -7118,7 +7126,7 @@
}
-#if defined(__AVX2__)
+#if defined(ZSTD_ARCH_X86_AVX2)
#include <immintrin.h> /* AVX2 intrinsics */
@@ -7138,7 +7146,7 @@
* @returns > 0 if there is one long length (> 65535),
* indicating the position, and type.
*/
-static size_t convertSequences_noRepcodes(
+size_t convertSequences_noRepcodes(
SeqDef* dstSeqs,
const ZSTD_Sequence* inSeqs,
size_t nbSequences)
@@ -7287,7 +7295,7 @@
#elif defined ZSTD_ARCH_RISCV_RVV
#include <riscv_vector.h>
/*
- * Convert `vl` sequences per iteration, using AVX2 intrinsics:
+ * Convert `vl` sequences per iteration, using RVV intrinsics:
* - offset -> offBase = offset + 2
* - litLength -> (U16) litLength
* - matchLength -> (U16)(matchLength - 3)
@@ -7298,9 +7306,10 @@
* @returns > 0 if there is one long length (> 65535),
* indicating the position, and type.
*/
-static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
size_t longLen = 0;
-
+ size_t vl = 0;
+ typedef uint32_t __attribute__((may_alias)) aliased_u32;
/* RVV depends on the specific definition of target structures */
ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
@@ -7310,62 +7319,68 @@
ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
- size_t vl = 0;
+
for (size_t i = 0; i < nbSequences; i += vl) {
- vl = __riscv_vsetvl_e32m2(nbSequences-i);
- // Loading structure member variables
- vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
- (const int32_t*)&inSeqs[i],
- vl
- );
- vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
- vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
- vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
- // offset + ZSTD_REP_NUM
- vuint32m2_t v_offBase = __riscv_vadd_vx_u32m2(v_offset, ZSTD_REP_NUM, vl);
- // Check for integer overflow
- // Cast to a 16-bit variable
- vbool16_t lit_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_lit, 65535, vl);
- vuint16m1_t v_lit_clamped = __riscv_vncvt_x_x_w_u16m1(v_lit, vl);
+ vl = __riscv_vsetvl_e32m2(nbSequences-i);
+ {
+ // Loading structure member variables
+ vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
+ (const aliased_u32*)((const void*)&inSeqs[i]),
+ vl
+ );
+ vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
+ vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
+ vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
+ // offset + ZSTD_REP_NUM
+ vuint32m2_t v_offBase = __riscv_vadd_vx_u32m2(v_offset, ZSTD_REP_NUM, vl);
+ // Check for integer overflow
+ // Cast to a 16-bit variable
+ vbool16_t lit_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_lit, 65535, vl);
+ vuint16m1_t v_lit_clamped = __riscv_vncvt_x_x_w_u16m1(v_lit, vl);
- vbool16_t ml_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_match, 65535+MINMATCH, vl);
- vuint16m1_t v_ml_clamped = __riscv_vncvt_x_x_w_u16m1(__riscv_vsub_vx_u32m2(v_match, MINMATCH, vl), vl);
+ vbool16_t ml_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_match, 65535+MINMATCH, vl);
+ vuint16m1_t v_ml_clamped = __riscv_vncvt_x_x_w_u16m1(__riscv_vsub_vx_u32m2(v_match, MINMATCH, vl), vl);
- // Pack two 16-bit fields into a 32-bit value (little-endian)
- // The lower 16 bits contain litLength, and the upper 16 bits contain mlBase
- vuint32m2_t v_lit_ml_combined = __riscv_vsll_vx_u32m2(
- __riscv_vwcvtu_x_x_v_u32m2(v_ml_clamped, vl), // Convert matchLength to 32-bit
- 16,
- vl
- );
- v_lit_ml_combined = __riscv_vor_vv_u32m2(
- v_lit_ml_combined,
- __riscv_vwcvtu_x_x_v_u32m2(v_lit_clamped, vl),
- vl
- );
- // Create a vector of SeqDef structures
- // Store the offBase, litLength, and mlBase in a vector of SeqDef
- vuint32m2x2_t store_data = __riscv_vcreate_v_u32m2x2(
- v_offBase,
- v_lit_ml_combined
- );
- __riscv_vsseg2e32_v_u32m2x2(
- (uint32_t*)&dstSeqs[i],
- store_data,
- vl
- );
- // Find the first index where an overflow occurs
- int first_ml = __riscv_vfirst_m_b16(ml_overflow, vl);
- int first_lit = __riscv_vfirst_m_b16(lit_overflow, vl);
+ // Pack two 16-bit fields into a 32-bit value (little-endian)
+ // The lower 16 bits contain litLength, and the upper 16 bits contain mlBase
+ vuint32m2_t v_lit_ml_combined = __riscv_vsll_vx_u32m2(
+ __riscv_vwcvtu_x_x_v_u32m2(v_ml_clamped, vl), // Convert matchLength to 32-bit
+ 16,
+ vl
+ );
+ v_lit_ml_combined = __riscv_vor_vv_u32m2(
+ v_lit_ml_combined,
+ __riscv_vwcvtu_x_x_v_u32m2(v_lit_clamped, vl),
+ vl
+ );
+ {
+ // Create a vector of SeqDef structures
+ // Store the offBase, litLength, and mlBase in a vector of SeqDef
+ vuint32m2x2_t store_data = __riscv_vcreate_v_u32m2x2(
+ v_offBase,
+ v_lit_ml_combined
+ );
+ __riscv_vsseg2e32_v_u32m2x2(
+ (aliased_u32*)((void*)&dstSeqs[i]),
+ store_data,
+ vl
+ );
+ }
+ {
+ // Find the first index where an overflow occurs
+ int first_ml = __riscv_vfirst_m_b16(ml_overflow, vl);
+ int first_lit = __riscv_vfirst_m_b16(lit_overflow, vl);
- if (UNLIKELY(first_ml != -1)) {
- assert(longLen == 0);
- longLen = i + first_ml + 1;
- }
- if (UNLIKELY(first_lit != -1)) {
- assert(longLen == 0);
- longLen = i + first_lit + 1 + nbSequences;
+ if (UNLIKELY(first_ml != -1)) {
+ assert(longLen == 0);
+ longLen = i + first_ml + 1;
+ }
+ if (UNLIKELY(first_lit != -1)) {
+ assert(longLen == 0);
+ longLen = i + first_lit + 1 + nbSequences;
+ }
+ }
}
}
return longLen;
@@ -7375,9 +7390,131 @@
* but since this implementation is targeting modern systems (>= Sapphire Rapid),
* it's not useful to develop and maintain code for older pre-AVX2 platforms */
-#else /* no AVX2 */
+#elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
-static size_t convertSequences_noRepcodes(
+size_t convertSequences_noRepcodes(
+ SeqDef* dstSeqs,
+ const ZSTD_Sequence* inSeqs,
+ size_t nbSequences)
+{
+ size_t longLen = 0;
+ size_t n = 0;
+
+ /* Neon permutation depends on the specific definition of target structures. */
+ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
+ ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
+ ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
+ ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
+ ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
+
+ if (nbSequences > 3) {
+ static const ZSTD_ALIGNED(16) U32 constAddition[4] = {
+ ZSTD_REP_NUM, 0, -MINMATCH, 0
+ };
+ static const ZSTD_ALIGNED(16) U8 constMask[16] = {
+ 0, 1, 2, 3, 4, 5, 8, 9, 16, 17, 18, 19, 20, 21, 24, 25
+ };
+ static const ZSTD_ALIGNED(16) U16 constCounter[8] = {
+ 1, 1, 1, 1, 2, 2, 2, 2
+ };
+
+ const uint32x4_t vaddition = vld1q_u32(constAddition);
+ const uint8x16_t vmask = vld1q_u8(constMask);
+ uint16x8_t vcounter = vld1q_u16(constCounter);
+ uint16x8_t vindex01 = vdupq_n_u16(0);
+ uint16x8_t vindex23 = vdupq_n_u16(0);
+
+ do {
+ /* Load 4 ZSTD_Sequence (64 bytes). */
+ const uint32x4_t vin0 = vld1q_u32(&inSeqs[n + 0].offset);
+ const uint32x4_t vin1 = vld1q_u32(&inSeqs[n + 1].offset);
+ const uint32x4_t vin2 = vld1q_u32(&inSeqs[n + 2].offset);
+ const uint32x4_t vin3 = vld1q_u32(&inSeqs[n + 3].offset);
+
+ /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each vector. */
+ const uint8x16x2_t vadd01 = { {
+ vreinterpretq_u8_u32(vaddq_u32(vin0, vaddition)),
+ vreinterpretq_u8_u32(vaddq_u32(vin1, vaddition)),
+ } };
+ const uint8x16x2_t vadd23 = { {
+ vreinterpretq_u8_u32(vaddq_u32(vin2, vaddition)),
+ vreinterpretq_u8_u32(vaddq_u32(vin3, vaddition)),
+ } };
+
+ /* Shuffle and pack bytes so each vector contains 2 SeqDef structures. */
+ const uint8x16_t vout01 = vqtbl2q_u8(vadd01, vmask);
+ const uint8x16_t vout23 = vqtbl2q_u8(vadd23, vmask);
+
+ /* Pack the upper 16-bits of 32-bit lanes for overflow check. */
+ uint16x8_t voverflow01 = vuzp2q_u16(vreinterpretq_u16_u8(vadd01.val[0]),
+ vreinterpretq_u16_u8(vadd01.val[1]));
+ uint16x8_t voverflow23 = vuzp2q_u16(vreinterpretq_u16_u8(vadd23.val[0]),
+ vreinterpretq_u16_u8(vadd23.val[1]));
+
+ /* Store 4 SeqDef structures. */
+ vst1q_u32(&dstSeqs[n + 0].offBase, vreinterpretq_u32_u8(vout01));
+ vst1q_u32(&dstSeqs[n + 2].offBase, vreinterpretq_u32_u8(vout23));
+
+ /* Create masks in case of overflow. */
+ voverflow01 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow01));
+ voverflow23 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow23));
+
+ /* Update overflow indices. */
+ vindex01 = vbslq_u16(voverflow01, vcounter, vindex01);
+ vindex23 = vbslq_u16(voverflow23, vcounter, vindex23);
+
+ /* Update counter for overflow check. */
+ vcounter = vaddq_u16(vcounter, vdupq_n_u16(4));
+
+ n += 4;
+ } while(n < nbSequences - 3);
+
+ /* Fix up indices in the second vector: the loop shares a single counter
+ between both vectors, so indices recorded for the third and fourth
+ sequence of each group are 2 too small. Add 2 to every non-zero index. */
+ { uint16x8_t nonzero = vtstq_u16(vindex23, vindex23);
+ vindex23 = vsubq_u16(vindex23, nonzero);
+ vindex23 = vsubq_u16(vindex23, nonzero);
+ }
+
+ /* Merge indices in the vectors, maximums are needed. */
+ vindex01 = vmaxq_u16(vindex01, vindex23);
+ vindex01 = vmaxq_u16(vindex01, vextq_u16(vindex01, vindex01, 4));
+
+ /* Compute `longLen`, maximums of matchLength and litLength
+ with a preference on litLength. */
+ { U64 maxLitMatchIndices = vgetq_lane_u64(vreinterpretq_u64_u16(vindex01), 0);
+ size_t maxLitIndex = (maxLitMatchIndices >> 16) & 0xFFFF;
+ size_t maxMatchIndex = (maxLitMatchIndices >> 32) & 0xFFFF;
+ longLen = maxLitIndex > maxMatchIndex ? maxLitIndex + nbSequences
+ : maxMatchIndex;
+ }
+ }
+
+ /* Handle remaining elements. */
+ for (; n < nbSequences; n++) {
+ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+ dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
+ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+ /* Check for long length > 65535. */
+ if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
+ assert(longLen == 0);
+ longLen = n + 1;
+ }
+ if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+ assert(longLen == 0);
+ longLen = n + nbSequences + 1;
+ }
+ }
+ return longLen;
+}
+
+#else /* No vectorization. */
+
+size_t convertSequences_noRepcodes(
SeqDef* dstSeqs,
const ZSTD_Sequence* inSeqs,
size_t nbSequences)
@@ -7388,7 +7525,7 @@
dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
- /* check for long length > 65535 */
+ /* Check for long length > 65535. */
if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) {
assert(longLen == 0);
longLen = n + 1;
@@ -7547,18 +7684,17 @@
size_t i = 0;
int found_terminator = 0;
size_t vl_max = __riscv_vsetvlmax_e32m1();
+ typedef uint32_t __attribute__((may_alias)) aliased_u32;
vuint32m1_t v_lit_sum = __riscv_vmv_v_x_u32m1(0, vl_max);
vuint32m1_t v_match_sum = __riscv_vmv_v_x_u32m1(0, vl_max);
for (; i < nbSeqs; ) {
size_t vl = __riscv_vsetvl_e32m2(nbSeqs - i);
- ptrdiff_t stride = sizeof(ZSTD_Sequence); // 16
vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
- (const int32_t*)&seqs[i],
+ (const aliased_u32*)((const void*)&seqs[i]),
vl
);
- vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
@@ -7604,29 +7740,104 @@
#else
+/*
+ * The function assumes `litMatchLength` is a packed 64-bit value holding the
+ * adjacent `litLength` and `matchLength` fields as they were loaded from memory,
+ * so which half holds the match length depends on the system's endianness:
+ * - On little-endian systems, the match length is the upper 32 bits, so it is
+ * zero exactly when the entire 64-bit value is at most 0xFFFFFFFF.
+ * - On big-endian systems, the match length is the lower 32 bits, checked directly.
+ *
+ * @returns 1 if the match length is zero, 0 otherwise.
+ */
+FORCE_INLINE_TEMPLATE int matchLengthHalfIsZero(U64 litMatchLength)
+{
+ if (MEM_isLittleEndian()) {
+ return litMatchLength <= 0xFFFFFFFFULL;
+ } else {
+ return (U32)litMatchLength == 0;
+ }
+}
+
BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
{
- size_t totalMatchSize = 0;
- size_t litSize = 0;
- size_t n;
+ /* Use multiple accumulators for efficient use of wide out-of-order machines. */
+ U64 litMatchSize0 = 0;
+ U64 litMatchSize1 = 0;
+ U64 litMatchSize2 = 0;
+ U64 litMatchSize3 = 0;
+ size_t n = 0;
+
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) + 4 == offsetof(ZSTD_Sequence, matchLength));
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) + 4 == offsetof(ZSTD_Sequence, rep));
assert(seqs);
- for (n=0; n<nbSeqs; n++) {
- totalMatchSize += seqs[n].matchLength;
- litSize += seqs[n].litLength;
- if (seqs[n].matchLength == 0) {
+
+ if (nbSeqs > 3) {
+ /* Process the input in 4 independent streams to reach high throughput. */
+ do {
+ /* Load `litLength` and `matchLength` as a packed `U64`. It is safe
+ * to use 64-bit unsigned arithmetic here because the sum of `litLength`
+ * and `matchLength` cannot exceed the block size, so the 32-bit
+ * subparts will never overflow. */
+ U64 litMatchLength = MEM_read64(&seqs[n].litLength);
+ litMatchSize0 += litMatchLength;
+ if (matchLengthHalfIsZero(litMatchLength)) {
+ assert(seqs[n].offset == 0);
+ goto _out;
+ }
+
+ litMatchLength = MEM_read64(&seqs[n + 1].litLength);
+ litMatchSize1 += litMatchLength;
+ if (matchLengthHalfIsZero(litMatchLength)) {
+ n += 1;
+ assert(seqs[n].offset == 0);
+ goto _out;
+ }
+
+ litMatchLength = MEM_read64(&seqs[n + 2].litLength);
+ litMatchSize2 += litMatchLength;
+ if (matchLengthHalfIsZero(litMatchLength)) {
+ n += 2;
+ assert(seqs[n].offset == 0);
+ goto _out;
+ }
+
+ litMatchLength = MEM_read64(&seqs[n + 3].litLength);
+ litMatchSize3 += litMatchLength;
+ if (matchLengthHalfIsZero(litMatchLength)) {
+ n += 3;
+ assert(seqs[n].offset == 0);
+ goto _out;
+ }
+
+ n += 4;
+ } while(n < nbSeqs - 3);
+ }
+
+ for (; n < nbSeqs; n++) {
+ U64 litMatchLength = MEM_read64(&seqs[n].litLength);
+ litMatchSize0 += litMatchLength;
+ if (matchLengthHalfIsZero(litMatchLength)) {
assert(seqs[n].offset == 0);
- break;
+ goto _out;
}
}
- if (n==nbSeqs) {
- BlockSummary bs;
+ /* At this point n == nbSeqs, so no end terminator. */
+ { BlockSummary bs;
bs.nbSequences = ERROR(externalSequences_invalid);
return bs;
}
+_out:
+ litMatchSize0 += litMatchSize1 + litMatchSize2 + litMatchSize3;
{ BlockSummary bs;
- bs.nbSequences = n+1;
- bs.blockSize = litSize + totalMatchSize;
- bs.litSize = litSize;
+ bs.nbSequences = n + 1;
+ if (MEM_isLittleEndian()) {
+ bs.litSize = (U32)litMatchSize0;
+ bs.blockSize = bs.litSize + (litMatchSize0 >> 32);
+ } else {
+ bs.litSize = litMatchSize0 >> 32;
+ bs.blockSize = bs.litSize + (U32)litMatchSize0;
+ }
return bs;
}
}
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 6174a25..b2ccd92 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -1236,6 +1236,10 @@
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
{
seq_t seq;
+#if defined(__aarch64__)
+ size_t prevOffset0 = seqState->prevOffset[0];
+ size_t prevOffset1 = seqState->prevOffset[1];
+ size_t prevOffset2 = seqState->prevOffset[2];
/*
* ZSTD_seqSymbol is a 64 bits wide structure.
* It can be loaded in one operation
@@ -1244,7 +1248,7 @@
* operations that cause performance drop. This can be avoided by using this
* ZSTD_memcpy hack.
*/
-#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+# if defined(__GNUC__) && !defined(__clang__)
ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
ZSTD_seqSymbol* const llDInfo = &llDInfoS;
ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
@@ -1252,11 +1256,11 @@
ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
-#else
+# else
const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
-#endif
+# endif
seq.matchLength = mlDInfo->baseValue;
seq.litLength = llDInfo->baseValue;
{ U32 const ofBase = ofDInfo->baseValue;
@@ -1275,10 +1279,116 @@
assert(llBits <= MaxLLBits);
assert(mlBits <= MaxMLBits);
assert(ofBits <= MaxOff);
- /*
- * As gcc has better branch and block analyzers, sometimes it is only
- * valuable to mark likeliness for clang, it gives around 3-4% of
- * performance.
+ /* As GCC has better branch and block analyzers, sometimes it is only
+ * valuable to mark likeliness for Clang.
+ */
+
+ /* sequence */
+ { size_t offset;
+ if (ofBits > 1) {
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+ /* Always read extra bits, this keeps the logic simple,
+ * avoids branches, and avoids accidentally reading 0 bits.
+ */
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ BIT_reloadDStream(&seqState->DStream);
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ } else {
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+ }
+ prevOffset2 = prevOffset1;
+ prevOffset1 = prevOffset0;
+ prevOffset0 = offset;
+ } else {
+ U32 const ll0 = (llDInfo->baseValue == 0);
+ if (LIKELY((ofBits == 0))) {
+ if (ll0) {
+ offset = prevOffset1;
+ prevOffset1 = prevOffset0;
+ prevOffset0 = offset;
+ } else {
+ offset = prevOffset0;
+ }
+ } else {
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+ { size_t temp = (offset == 1) ? prevOffset1
+ : (offset == 3) ? prevOffset0 - 1
+ : (offset >= 2) ? prevOffset2
+ : prevOffset0;
+ /* 0 is not valid: input corrupted => force offset to -1 =>
+ * corruption detected at execSequence.
+ */
+ temp -= !temp;
+ prevOffset2 = (offset == 1) ? prevOffset2 : prevOffset1;
+ prevOffset1 = prevOffset0;
+ prevOffset0 = offset = temp;
+ } } }
+ seq.offset = offset;
+ }
+
+ if (mlBits > 0) {
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+ BIT_reloadDStream(&seqState->DStream);
+ if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+ }
+
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+ if (llBits > 0)
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+ if (MEM_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+ if (!isLastSeq) {
+ /* Don't update FSE state for last sequence. */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
+ BIT_reloadDStream(&seqState->DStream);
+ }
+ }
+ seqState->prevOffset[0] = prevOffset0;
+ seqState->prevOffset[1] = prevOffset1;
+ seqState->prevOffset[2] = prevOffset2;
+#else /* !defined(__aarch64__) */
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+ seq.matchLength = mlDInfo->baseValue;
+ seq.litLength = llDInfo->baseValue;
+ { U32 const ofBase = ofDInfo->baseValue;
+ BYTE const llBits = llDInfo->nbAdditionalBits;
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
+ BYTE const totalBits = llBits+mlBits+ofBits;
+
+ U16 const llNext = llDInfo->nextState;
+ U16 const mlNext = mlDInfo->nextState;
+ U16 const ofNext = ofDInfo->nextState;
+ U32 const llnbBits = llDInfo->nbBits;
+ U32 const mlnbBits = mlDInfo->nbBits;
+ U32 const ofnbBits = ofDInfo->nbBits;
+
+ assert(llBits <= MaxLLBits);
+ assert(mlBits <= MaxMLBits);
+ assert(ofBits <= MaxOff);
+ /* As GCC has better branch and block analyzers, sometimes it is only
+ * valuable to mark likeliness for Clang.
*/
/* sequence */
@@ -1340,7 +1450,7 @@
(U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
if (!isLastSeq) {
- /* don't update FSE state for last Sequence */
+ /* Don't update FSE state for last sequence. */
ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
@@ -1348,6 +1458,7 @@
BIT_reloadDStream(&seqState->DStream);
}
}
+#endif /* defined(__aarch64__) */
return seq;
}
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index da380ac..0bc160e 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -45,6 +45,7 @@
#include "zstd_internal.h" /* ZSTD_WORKSPACETOOLARGE_MAXDURATION, ZSTD_WORKSPACETOOLARGE_FACTOR, KB, MB */
#include "threading.h" /* ZSTD_pthread_create, ZSTD_pthread_join */
#include "compress/hist.h" /* HIST_count_wksp */
+#include "compress/zstd_compress_internal.h" /* ZSTD_get1BlockSummary */
/*-************************************
@@ -769,6 +770,210 @@
DISPLAYLEVEL(3, "OK \n");
}
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
+ size_t nbSequences);
+
+static size_t convertSequences_noRepcodes_ref(
+ SeqDef* dstSeqs,
+ const ZSTD_Sequence* inSeqs,
+ size_t nbSequences)
+{
+ size_t longLen = 0;
+ size_t n;
+ for (n=0; n<nbSequences; n++) {
+ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+ dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
+ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+ /* Check for long length > 65535. */
+ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) {
+ assert(longLen == 0);
+ longLen = n + 1;
+ }
+ if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+ assert(longLen == 0);
+ longLen = n + nbSequences + 1;
+ }
+ }
+ return longLen;
+}
+
+static unsigned test_convertSequences_noRepcodes(unsigned seed, unsigned testNb)
+{
+ ZSTD_Sequence nsrc[12];
+ SeqDef ndst[12], rdst[12];
+ size_t ref, ret, i, j;
+
+ seed += 0xDEADBEEF;
+ for (i = 0; i < COUNTOF(nsrc); ++i) {
+ seed = 48271 * ((unsigned)i + seed);
+ nsrc[i].offset = (seed & 0xFFFF) | 1; /* Offset shall not be zero. */
+ seed = 48271 * ((unsigned)i + seed);
+ nsrc[i].litLength = seed & 0xFFFF;
+ seed = 48271 * ((unsigned)i + seed);
+ nsrc[i].matchLength = (seed & 0xFFFFFF) % (65536 + MINMATCH);
+ seed = 48271 * ((unsigned)i + seed);
+ nsrc[i].rep = seed & 0xFF;
+ }
+
+ /* For near overflow and proper negative value handling. */
+ nsrc[5].matchLength = 65535 + MINMATCH;
+ nsrc[6].litLength = 65535;
+ nsrc[6].matchLength = 0;
+ nsrc[7].litLength = 0;
+ nsrc[7].matchLength = MINMATCH;
+
+ for (i = 0; i <= COUNTOF(nsrc); ++i) {
+ DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs : ",
+ testNb++, (unsigned)i);
+ memset(ndst, 0, sizeof(ndst));
+ memset(rdst, 0, sizeof(rdst));
+ ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
+ ret = convertSequences_noRepcodes(ndst, nsrc, i);
+ CHECK_EQ(ret, ref);
+ CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+ DISPLAYLEVEL(3, "OK \n");
+ }
+
+ nsrc[7].matchLength = 65536 + MINMATCH;
+ for (i = 8; i <= COUNTOF(nsrc); ++i) {
+ DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+ "matchLength overflow : ",
+ testNb++, (unsigned)i);
+ memset(ndst, 0, sizeof(ndst));
+ memset(rdst, 0, sizeof(rdst));
+ ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
+ ret = convertSequences_noRepcodes(ndst, nsrc, i);
+ CHECK_EQ(ret, ref);
+ CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+ DISPLAYLEVEL(3, "OK \n");
+
+ assert(COUNTOF(nsrc) > 8);
+ for (j = 4; j < 8; ++j) {
+ DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+ "matchLength overflow #%u : ",
+ testNb++, (unsigned)i, (unsigned)(i - j));
+ memset(ndst, 0, sizeof(ndst));
+ memset(rdst, 0, sizeof(rdst));
+ ref = convertSequences_noRepcodes_ref(rdst, nsrc + j, i - j);
+ ret = convertSequences_noRepcodes(ndst, nsrc + j, i - j);
+ CHECK_EQ(ret, ref);
+ CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+ DISPLAYLEVEL(3, "OK \n");
+ }
+ }
+ nsrc[7].matchLength = 1;
+
+ nsrc[7].litLength = 65536;
+ for (i = 8; i <= COUNTOF(nsrc); ++i) {
+ DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+ "litLength overflow: ",
+ testNb++, (unsigned)i);
+ memset(ndst, 0, sizeof(ndst));
+ memset(rdst, 0, sizeof(rdst));
+ ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
+ ret = convertSequences_noRepcodes(ndst, nsrc, i);
+ CHECK_EQ(ret, ref);
+ CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+ DISPLAYLEVEL(3, "OK \n");
+
+ assert(COUNTOF(nsrc) > 8);
+ for (j = 4; j < 8; ++j) {
+ DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
+ "litLength overflow #%u: ",
+ testNb++, (unsigned)i, (unsigned)(i - j));
+ memset(ndst, 0, sizeof(ndst));
+ memset(rdst, 0, sizeof(rdst));
+ ref = convertSequences_noRepcodes_ref(rdst, nsrc + j, i - j);
+ ret = convertSequences_noRepcodes(ndst, nsrc + j, i - j);
+ CHECK_EQ(ret, ref);
+ CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
+ DISPLAYLEVEL(3, "OK \n");
+ }
+ }
+
+ return testNb;
+}
+
+static unsigned test_get1BlockSummary(unsigned testNb)
+{
+ static const ZSTD_Sequence nseqs[] = {
+ { 10, 2, 4, 1 },
+ { 20, 3, 5, 2 },
+ { 30, 6, 8, 3 },
+ { 40, 7, 9, 4 },
+ { 50, 10, 12, 5 },
+ { 60, 11, 13, 6 },
+ { 0, 14, 0, 7 },
+ { 70, 15, 17, 8 },
+ { 80, 16, 18, 9 },
+ { 90, 19, 21, 1 },
+ { 99, 20, 22, 2 },
+ };
+ static const BlockSummary blocks[] = {
+ { 7, 104, 53 },
+ { 6, 98, 51 },
+ { 5, 90, 48 },
+ { 4, 76, 42 },
+ { 3, 60, 35 },
+ { 2, 38, 25 },
+ { 1, 14, 14 },
+ };
+ size_t i;
+
+ DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with empty array : ", testNb++);
+ {
+ BlockSummary bs = ZSTD_get1BlockSummary(nseqs, 0);
+ CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
+ }
+ DISPLAYLEVEL(3, "OK \n");
+
+ DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with 1 literal only : ", testNb++);
+ {
+ static const ZSTD_Sequence seqs[] = { { 0, 5, 0, 0 } };
+ BlockSummary bs = ZSTD_get1BlockSummary(seqs, 1);
+ CHECK_EQ(bs.nbSequences, 1);
+ CHECK_EQ(bs.litSize, 5);
+ CHECK_EQ(bs.blockSize, 5);
+ }
+ DISPLAYLEVEL(3, "OK \n");
+
+ DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with no terminator : ", testNb++);
+ {
+ static const ZSTD_Sequence seqs[] = { { 10, 2, 4, 0 }, { 20, 3, 5, 0 } };
+ BlockSummary bs = ZSTD_get1BlockSummary(seqs, 2);
+ CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
+ }
+ DISPLAYLEVEL(3, "OK \n");
+
+ DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with rep ignored : ", testNb++);
+ {
+ static const ZSTD_Sequence seqs[] = {
+ { 10, 2, 4, 2 },
+ { 10, 3, 5, 2 },
+ { 0, 7, 0, 3 },
+ };
+ BlockSummary bs = ZSTD_get1BlockSummary(seqs, 3);
+ CHECK_EQ(bs.nbSequences, 3);
+ CHECK_EQ(bs.litSize, 2 + 3 + 7);
+ CHECK_EQ(bs.blockSize, (4 + 5) + (2 + 3 + 7));
+ }
+ DISPLAYLEVEL(3, "OK \n");
+
+ assert(COUNTOF(nseqs) > COUNTOF(blocks));
+ for (i = 0; i < COUNTOF(blocks); ++i) {
+ BlockSummary bs;
+ DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with %u inputs : ",
+ testNb++, (unsigned)(COUNTOF(nseqs) - i));
+ bs = ZSTD_get1BlockSummary(nseqs + i, COUNTOF(nseqs) - i);
+ CHECK_EQ(bs.nbSequences, blocks[i].nbSequences);
+ CHECK_EQ(bs.litSize, blocks[i].litSize);
+ CHECK_EQ(bs.blockSize, blocks[i].blockSize);
+ DISPLAYLEVEL(3, "OK \n");
+ }
+
+ return testNb;
+}
+
/* ============================================================= */
static int basicUnitTests(U32 const seed, double compressibility)
@@ -4004,6 +4209,10 @@
}
DISPLAYLEVEL(3, "OK \n");
+ testNb = test_convertSequences_noRepcodes(seed, testNb);
+
+ testNb = test_get1BlockSummary(testNb);
+
DISPLAYLEVEL(3, "test%3i : ZSTD_compressSequencesAndLiterals : ", testNb++);
{
const size_t srcSize = 497000;