[zlib] Re-sync with zlib 1.2.12, patch 13 of N (Reland)
This time we add an extra guard where it is required to explicitly
activate the new Armv8.1 CRC-32 provided by zlib 1.2.12
(i.e. USE_CANONICAL_ARMV8_CRC32) since we measured and the serial
implementation that we ship since early 2018 is slightly faster.
Ported:
- Add use of the ARMv8 crc32 instructions when requested.
- Use ARM crc32 instructions if the ARM architecture has them.
- Explicitly note that the 32-bit check values are 32 bits.
Bug: 1032721
Change-Id: I0ca87270ef470de243e62e9056ed0cd195da21ec
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3607702
Reviewed-by: Noel Gordon <noel@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#996421}
NOKEYCHECK=True
GitOrigin-RevId: 43824b58db1f48f93a50194c0488ec4495f4c492
diff --git a/crc32.c b/crc32.c
index ee8d472..41fe891 100644
--- a/crc32.c
+++ b/crc32.c
@@ -622,6 +622,123 @@
return (const z_crc_t FAR *)crc_table;
}
+/* =========================================================================
+ * Use ARM machine instructions if available. This will compute the CRC about
+ * ten times faster than the braided calculation. This code does not check for
+ * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
+ * only be defined if the compilation specifies an ARM processor architecture
+ * that has the instructions. For example, compiling with -march=armv8.1-a or
+ * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
+ * instructions.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
+ && defined(USE_CANONICAL_ARMV8_CRC32)
+
+/*
+ Constants empirically determined to maximize speed. These values are from
+ measurements on a Cortex-A57. Your mileage may vary.
+ */
+#define Z_BATCH 3990 /* number of words in a batch */
+#define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
+#define Z_BATCH_MIN 800 /* fewest words in a final batch */
+
+unsigned long ZEXPORT crc32_z(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ z_size_t len;
+{
+ z_crc_t val;
+ z_word_t crc1, crc2;
+ const z_word_t *word;
+ z_word_t val0, val1, val2;
+ z_size_t last, last2, i;
+ z_size_t num;
+
+ /* Return initial CRC, if requested. */
+ if (buf == Z_NULL) return 0;
+
+#ifdef DYNAMIC_CRC_TABLE
+ once(&made, make_crc_table);
+#endif /* DYNAMIC_CRC_TABLE */
+
+ /* Pre-condition the CRC */
+ crc ^= 0xffffffff;
+
+ /* Compute the CRC up to a word boundary. */
+ while (len && ((z_size_t)buf & 7) != 0) {
+ len--;
+ val = *buf++;
+ __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+ }
+
+ /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
+ word = (z_word_t const *)buf;
+ num = len >> 3;
+ len &= 7;
+
+ /* Do three interleaved CRCs to realize the throughput of one crc32x
+ instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
+ CRCs are combined into a single CRC after each set of batches. */
+ while (num >= 3 * Z_BATCH) {
+ crc1 = 0;
+ crc2 = 0;
+ for (i = 0; i < Z_BATCH; i++) {
+ val0 = word[i];
+ val1 = word[i + Z_BATCH];
+ val2 = word[i + 2 * Z_BATCH];
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+ }
+ word += 3 * Z_BATCH;
+ num -= 3 * Z_BATCH;
+ crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
+ crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
+ }
+
+ /* Do one last smaller batch with the remaining words, if there are enough
+ to pay for the combination of CRCs. */
+ last = num / 3;
+ if (last >= Z_BATCH_MIN) {
+ last2 = last << 1;
+ crc1 = 0;
+ crc2 = 0;
+ for (i = 0; i < last; i++) {
+ val0 = word[i];
+ val1 = word[i + last];
+ val2 = word[i + last2];
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+ }
+ word += 3 * last;
+ num -= 3 * last;
+ val = x2nmodp(last, 6);
+ crc = multmodp(val, crc) ^ crc1;
+ crc = multmodp(val, crc) ^ crc2;
+ }
+
+ /* Compute the CRC on any remaining words. */
+ for (i = 0; i < num; i++) {
+ val0 = word[i];
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+ }
+ word += num;
+
+ /* Complete the CRC on any remaining bytes. */
+ buf = (const unsigned char FAR *)word;
+ while (len) {
+ len--;
+ val = *buf++;
+ __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+ }
+
+ /* Return the CRC, post-conditioned. */
+ return crc ^ 0xffffffff;
+}
+
+#else
+
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
@@ -974,6 +1091,8 @@
return crc ^ 0xffffffff;
}
+#endif
+
/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
diff --git a/zlib.h b/zlib.h
index a176f93..88961b9 100644
--- a/zlib.h
+++ b/zlib.h
@@ -1690,8 +1690,9 @@
ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
/*
Update a running Adler-32 checksum with the bytes buf[0..len-1] and
- return the updated checksum. If buf is Z_NULL, this function returns the
- required initial value for the checksum.
+ return the updated checksum. An Adler-32 value is in the range of a 32-bit
+ unsigned integer. If buf is Z_NULL, this function returns the required
+ initial value for the checksum.
An Adler-32 checksum is almost as reliable as a CRC-32 but can be computed
much faster.
@@ -1727,9 +1728,10 @@
ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
/*
Update a running CRC-32 with the bytes buf[0..len-1] and return the
- updated CRC-32. If buf is Z_NULL, this function returns the required
- initial value for the crc. Pre- and post-conditioning (one's complement) is
- performed within this function so it shouldn't be done by the application.
+ updated CRC-32. A CRC-32 value is in the range of a 32-bit unsigned integer.
+ If buf is Z_NULL, this function returns the required initial value for the
+ crc. Pre- and post-conditioning (one's complement) is performed within this
+ function so it shouldn't be done by the application.
Usage example: