[zlib][x86] Allow build & execution of both optimized CRC-32 functions
In Chromium zlib we have quite a few implementations of CRC-32, as follows:
- x86: vectorized SSE4.2 and AVX-512 functions.
- Arm: scalar crc32 using the crypto extensions (32bit & 64bit) and PMULL
based (aarch64 only)
- Portable: using the Kadatch-Jenkins algorithm, implemented by Mark Adler
in the canonical zlib.
The previous behavior for x86-64 was exclusive: either the AVX-512
or the SSE4.2 function was used, chosen at compile time.
Instead, the better approach is to build both if AVX-512 optimizations are
enabled at compile time and to select the best-performing version depending
on the *length* of the data inputs.
Initial data points to nearly 2% faster data decompression when
leveraging this strategy, tested on a 4th-gen Xeon (SPR).
Bug: 340921315
Change-Id: I6bb0bea763be1bb26b63d4f966b767b00310bd6c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5549255
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Reviewed-by: Hans Wennborg <hans@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1304191}
NOKEYCHECK=True
GitOrigin-RevId: 82602b4e2b1b2a476e465f1531545d3efa61fbf3
diff --git a/crc32.c b/crc32.c
index 32686f9..4177e92 100644
--- a/crc32.c
+++ b/crc32.c
@@ -700,24 +700,29 @@
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
z_size_t len) {
+
+ /* If no optimizations are enabled, do it as canonical zlib. */
+#if !defined(CRC32_SIMD_SSE42_PCLMUL) && !defined(CRC32_ARMV8_CRC32) && \
+ !defined(RISCV_RVV) && !defined(CRC32_SIMD_AVX512_PCLMUL)
+ if (buf == Z_NULL) {
+ return 0UL;
+ }
+#else
/*
* zlib convention is to call crc32(0, NULL, 0); before making
* calls to crc32(). So this is a good, early (and infrequent)
* place to cache CPU features if needed for those later, more
* interesting crc32() calls.
*/
-#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
- || defined(RISCV_RVV)
- /*
- * Since this routine can be freely used, check CPU features here.
- */
if (buf == Z_NULL) {
- if (!len) /* Assume user is calling crc32(0, NULL, 0); */
+ if (!len)
cpu_check_features();
return 0UL;
}
-
#endif
+ /* If AVX-512 is enabled, we will use it for longer inputs and fallback
+ * to SSE4.2 and eventually the portable implementation to handle the tail.
+ */
#if defined(CRC32_SIMD_AVX512_PCLMUL)
if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
/* crc32 64-byte chunks */
@@ -730,7 +735,8 @@
/* Fall into the default crc32 for the remaining data. */
buf += chunk_size;
}
-#elif defined(CRC32_SIMD_SSE42_PCLMUL)
+#endif
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
/* crc32 16-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
@@ -758,11 +764,8 @@
buf += chunk_size;
}
#endif
- return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
- }
-#else
- if (buf == Z_NULL) {
- return 0UL;
+ /* This is scalar and self contained, used on Armv8@32bit or tail. */
+ return armv8_crc32_little(buf, len, crc);
}
#endif /* CRC32_SIMD */
diff --git a/crc32_simd.c b/crc32_simd.c
index 7428270..1ee7742 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -200,7 +200,8 @@
return _mm_extract_epi32(a1, 1);
}
-#elif defined(CRC32_SIMD_SSE42_PCLMUL)
+#endif
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer