[zlib][x86] Allow build & execution of both optimized CRC-32 functions

In Chromium zlib we have several implementations of CRC-32, as follows:
- x86: vectorized SSE4.2 and AVX-512 functions.
- Arm: scalar crc32 using the crypto extensions (32-bit & 64-bit) and a
       PMULL-based function (AArch64 only).
- Portable: the Kadatch-Jenkins algorithm, implemented by Mark Adler in
       the canonical zlib.

The previous behavior on x86-64 was exclusive: either the AVX-512 or the
SSE4.2 function was used, decided at compile time.
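
This exclusivity came from the preprocessor guards; schematically (not
the literal source, see the crc32.c hunks below):

    #if defined(CRC32_SIMD_AVX512_PCLMUL)
        /* AVX-512 path: the only one compiled when AVX-512 was enabled */
    #elif defined(CRC32_SIMD_SSE42_PCLMUL)
        /* SSE4.2 path: compiled out whenever the AVX-512 branch existed */
    #endif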

Instead, the better approach is to build both when AVX-512 optimizations
are enabled at compile time and select the best-performing version based
on the *length* of the data inputs.
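
In the patch, the #elif above becomes a closing #endif followed by a
fresh #if, so both kernels get compiled. A minimal, self-contained sketch
of the resulting run-time dispatch follows; the flag and constant names
mirror the patch below, but the threshold values and kernel signatures
are illustrative stand-ins, not the real definitions:

    #include <stddef.h>
    #include <stdint.h>

    /* Run-time CPU feature flags, as named in the patch below;
     * declared extern here just to make the sketch self-contained. */
    extern int x86_cpu_enable_avx512, x86_cpu_enable_simd;

    /* Illustrative kernels standing in for those in crc32_simd.c. */
    extern uint32_t crc32_avx512(const unsigned char *buf, size_t len, uint32_t crc);
    extern uint32_t crc32_sse42(const unsigned char *buf, size_t len, uint32_t crc);
    extern uint32_t crc32_portable(const unsigned char *buf, size_t len, uint32_t crc);

    /* Assumed cutoffs, standing in for Z_CRC32_AVX512_MINIMUM_LENGTH
     * and Z_CRC32_SSE42_MINIMUM_LENGTH. */
    #define AVX512_MIN_LEN 256u
    #define SSE42_MIN_LEN   64u

    uint32_t crc32_dispatch(uint32_t crc, const unsigned char *buf, size_t len)
    {
        if (x86_cpu_enable_avx512 && len >= AVX512_MIN_LEN) {
            size_t chunk = len & ~(size_t)63;   /* whole 64-byte chunks */
            crc = crc32_avx512(buf, chunk, crc);
            buf += chunk;
            len -= chunk;
        }
        if (x86_cpu_enable_simd && len >= SSE42_MIN_LEN) {
            size_t chunk = len & ~(size_t)15;   /* whole 16-byte chunks */
            crc = crc32_sse42(buf, chunk, crc);
            buf += chunk;
            len -= chunk;
        }
        return crc32_portable(buf, len, crc);   /* scalar tail */
    }

Long inputs thus get the wide AVX-512 kernel, medium ones the SSE4.2
kernel, and the remaining tail is handled by the portable code.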

Initial data points to nearly 2% faster data decompression by leveraging
this strategy, tested on a 4th-gen Xeon (Sapphire Rapids).

Bug: 340921315
Change-Id: I6bb0bea763be1bb26b63d4f966b767b00310bd6c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5549255
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Reviewed-by: Hans Wennborg <hans@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1304191}
NOKEYCHECK=True
GitOrigin-RevId: 82602b4e2b1b2a476e465f1531545d3efa61fbf3
diff --git a/crc32.c b/crc32.c
index 32686f9..4177e92 100644
--- a/crc32.c
+++ b/crc32.c
@@ -700,24 +700,29 @@
 /* ========================================================================= */
 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
                               z_size_t len) {
+
+    /* If no optimizations are enabled, behave as canonical zlib. */
+#if !defined(CRC32_SIMD_SSE42_PCLMUL) && !defined(CRC32_ARMV8_CRC32) && \
+    !defined(RISCV_RVV) && !defined(CRC32_SIMD_AVX512_PCLMUL)
+    if (buf == Z_NULL) {
+        return 0UL;
+    }
+#else
     /*
      * zlib convention is to call crc32(0, NULL, 0); before making
      * calls to crc32(). So this is a good, early (and infrequent)
      * place to cache CPU features if needed for those later, more
      * interesting crc32() calls.
      */
-#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
-    || defined(RISCV_RVV)
-    /*
-     * Since this routine can be freely used, check CPU features here.
-     */
     if (buf == Z_NULL) {
-        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
+        if (!len)
             cpu_check_features();
         return 0UL;
     }
-
 #endif
+    /* If AVX-512 is enabled, we will use it for longer inputs and fall back
+     * to SSE4.2 and eventually the portable implementation to handle the tail.
+     */
 #if defined(CRC32_SIMD_AVX512_PCLMUL)
     if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
         /* crc32 64-byte chunks */
@@ -730,7 +735,8 @@
         /* Fall into the default crc32 for the remaining data. */
         buf += chunk_size;
     }
-#elif defined(CRC32_SIMD_SSE42_PCLMUL)
+#endif
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
         /* crc32 16-byte chunks */
         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
@@ -758,11 +764,8 @@
             buf += chunk_size;
         }
 #endif
-        return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
-    }
-#else
-    if (buf == Z_NULL) {
-        return 0UL;
+        /* This is scalar and self-contained, used on Armv8@32bit or tail. */
+        return armv8_crc32_little(buf, len, crc);
     }
 #endif /* CRC32_SIMD */
 
diff --git a/crc32_simd.c b/crc32_simd.c
index 7428270..1ee7742 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -200,7 +200,8 @@
     return _mm_extract_epi32(a1, 1);
 }
 
-#elif defined(CRC32_SIMD_SSE42_PCLMUL)
+#endif
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
 
 /*
  * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer