[zlib] Re-sync with zlib 1.2.12, patch 13 of N (Reland)

This time we add an extra guard where it is required to explicitly
activate the new Armv8.1 CRC-32 provided by zlib 1.2.12
(i.e. USE_CANONICAL_ARMV8_CRC32) since we measured and the serial
implementation that we ship since early 2018 is slightly faster.

Ported:
- Add use of the ARMv8 crc32 instructions when requested.
- Use ARM crc32 instructions if the ARM architecture has them.
- Explicitly note that the 32-bit check values are 32 bits.

Bug: 1032721
Change-Id: I0ca87270ef470de243e62e9056ed0cd195da21ec
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3607702
Reviewed-by: Noel Gordon <noel@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#996421}
NOKEYCHECK=True
GitOrigin-RevId: 43824b58db1f48f93a50194c0488ec4495f4c492
diff --git a/crc32.c b/crc32.c
index ee8d472..41fe891 100644
--- a/crc32.c
+++ b/crc32.c
@@ -622,6 +622,123 @@
     return (const z_crc_t FAR *)crc_table;
 }
 
+/* =========================================================================
+ * Use ARM machine instructions if available. This will compute the CRC about
+ * ten times faster than the braided calculation. This code does not check for
+ * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
+ * only be defined if the compilation specifies an ARM processor architecture
+ * that has the instructions. For example, compiling with -march=armv8.1-a or
+ * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
+ * instructions.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
+    && defined(USE_CANONICAL_ARMV8_CRC32)
+
+/*
+   Constants empirically determined to maximize speed. These values are from
+   measurements on a Cortex-A57. Your mileage may vary.
+ */
+#define Z_BATCH 3990                /* number of words in a batch */
+#define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
+#define Z_BATCH_MIN 800             /* fewest words in a final batch */
+
+unsigned long ZEXPORT crc32_z(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    z_size_t len;
+{
+    z_crc_t val;
+    z_word_t crc1, crc2;
+    const z_word_t *word;
+    z_word_t val0, val1, val2;
+    z_size_t last, last2, i;
+    z_size_t num;
+
+    /* Return initial CRC, if requested. */
+    if (buf == Z_NULL) return 0;
+
+#ifdef DYNAMIC_CRC_TABLE
+    once(&made, make_crc_table);
+#endif /* DYNAMIC_CRC_TABLE */
+
+    /* Pre-condition the CRC */
+    crc ^= 0xffffffff;
+
+    /* Compute the CRC up to a word boundary. */
+    while (len && ((z_size_t)buf & 7) != 0) {
+        len--;
+        val = *buf++;
+        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+    }
+
+    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
+    word = (z_word_t const *)buf;
+    num = len >> 3;
+    len &= 7;
+
+    /* Do three interleaved CRCs to realize the throughput of one crc32x
+       instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
+       CRCs are combined into a single CRC after each set of batches. */
+    while (num >= 3 * Z_BATCH) {
+        crc1 = 0;
+        crc2 = 0;
+        for (i = 0; i < Z_BATCH; i++) {
+            val0 = word[i];
+            val1 = word[i + Z_BATCH];
+            val2 = word[i + 2 * Z_BATCH];
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+        }
+        word += 3 * Z_BATCH;
+        num -= 3 * Z_BATCH;
+        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
+        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
+    }
+
+    /* Do one last smaller batch with the remaining words, if there are enough
+       to pay for the combination of CRCs. */
+    last = num / 3;
+    if (last >= Z_BATCH_MIN) {
+        last2 = last << 1;
+        crc1 = 0;
+        crc2 = 0;
+        for (i = 0; i < last; i++) {
+            val0 = word[i];
+            val1 = word[i + last];
+            val2 = word[i + last2];
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+        }
+        word += 3 * last;
+        num -= 3 * last;
+        val = x2nmodp(last, 6);
+        crc = multmodp(val, crc) ^ crc1;
+        crc = multmodp(val, crc) ^ crc2;
+    }
+
+    /* Compute the CRC on any remaining words. */
+    for (i = 0; i < num; i++) {
+        val0 = word[i];
+        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+    }
+    word += num;
+
+    /* Complete the CRC on any remaining bytes. */
+    buf = (const unsigned char FAR *)word;
+    while (len) {
+        len--;
+        val = *buf++;
+        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return crc ^ 0xffffffff;
+}
+
+#else
+
 /* ========================================================================= */
 unsigned long ZEXPORT crc32_z(crc, buf, len)
     unsigned long crc;
@@ -974,6 +1091,8 @@
     return crc ^ 0xffffffff;
 }
 
+#endif
+
 /* ========================================================================= */
 unsigned long ZEXPORT crc32(crc, buf, len)
     unsigned long crc;
diff --git a/zlib.h b/zlib.h
index a176f93..88961b9 100644
--- a/zlib.h
+++ b/zlib.h
@@ -1690,8 +1690,9 @@
 ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
 /*
      Update a running Adler-32 checksum with the bytes buf[0..len-1] and
-   return the updated checksum.  If buf is Z_NULL, this function returns the
-   required initial value for the checksum.
+   return the updated checksum. An Adler-32 value is in the range of a 32-bit
+   unsigned integer. If buf is Z_NULL, this function returns the required
+   initial value for the checksum.
 
      An Adler-32 checksum is almost as reliable as a CRC-32 but can be computed
    much faster.
@@ -1727,9 +1728,10 @@
 ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
 /*
      Update a running CRC-32 with the bytes buf[0..len-1] and return the
-   updated CRC-32.  If buf is Z_NULL, this function returns the required
-   initial value for the crc.  Pre- and post-conditioning (one's complement) is
-   performed within this function so it shouldn't be done by the application.
+   updated CRC-32. A CRC-32 value is in the range of a 32-bit unsigned integer.
+   If buf is Z_NULL, this function returns the required initial value for the
+   crc. Pre- and post-conditioning (one's complement) is performed within this
+   function so it shouldn't be done by the application.
 
    Usage example: