| From 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001 |
| From: Jia Liu <jia3.liu@intel.com> |
| Date: Thu, 30 Mar 2023 11:13:16 +0800 |
| Subject: [PATCH] Enable AVX512 for CRC32
| |
| Enable AVX512 for CRC32, providing the best known performance,
| beyond the current SSE SIMD optimization. It folds four 512-bit
| lanes in parallel using the new AVX512 VPCLMULQDQ instructions,
| yielding ~3.5X CRC32 performance and a ~3.7% gain on Zlib_bench
| gzip performance.
| --- |
| CMakeLists.txt | 8 +- |
| cpu_features.c | 9 +++ |
| cpu_features.h | 1 + |
| crc32.c | 14 +++- |
| crc32_simd.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++- |
| crc32_simd.h | 6 ++ |
| 6 files changed, 230 insertions(+), 6 deletions(-) |
| |
| diff --git a/CMakeLists.txt b/CMakeLists.txt |
| index f06e193..d45b902 100644 |
| --- a/CMakeLists.txt |
| +++ b/CMakeLists.txt |
| @@ -22,6 +22,7 @@ check_include_file(stdint.h HAVE_STDINT_H) |
| check_include_file(stddef.h HAVE_STDDEF_H) |
| |
| option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF) |
| +option(ENABLE_SIMD_AVX512 "Enable SIMD AVX512 optimizations" OFF)
| |
| # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx) |
| # and architectures (e.g. Arm). |
| @@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS) |
| add_definitions(-DADLER32_SIMD_SSSE3) |
| add_definitions(-DINFLATE_CHUNK_READ_64LE) |
| add_definitions(-DCRC32_SIMD_SSE42_PCLMUL) |
| + if (ENABLE_SIMD_AVX512) |
| + add_definitions(-DCRC32_SIMD_AVX512_PCLMUL) |
| + # -mxsave is needed by GCC/Clang for the _xgetbv() check in cpu_features.c.
| + add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul -mxsave)
| + else() |
| + add_compile_options(-msse4.2 -mpclmul) |
| + endif() |
| add_definitions(-DDEFLATE_SLIDE_HASH_SSE2) |
| - add_compile_options(-msse4.2 -mpclmul) |
| # Required by CPU features detection code. |
| add_definitions(-DX86_NOT_WINDOWS) |
| # Apparently some environments (e.g. CentOS) require to explicitly link |
| diff --git a/cpu_features.c b/cpu_features.c |
| index 877d5f2..ac6ee88 100644 |
| --- a/cpu_features.c |
| +++ b/cpu_features.c |
| @@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0; |
| int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0; |
| int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0; |
| int ZLIB_INTERNAL x86_cpu_enable_simd = 0; |
| +int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0; |
| |
| #ifndef CPU_NO_SIMD |
| |
| @@ -138,6 +139,10 @@ static void _cpu_check_features(void) |
| /* On x86 we simply use an instruction to check the CPU features.
| * (i.e. CPUID). |
| */ |
| +#ifdef CRC32_SIMD_AVX512_PCLMUL
| +/* <immintrin.h> already declares _xgetbv(); including <xsaveintrin.h>
| + * directly is unnecessary, since <immintrin.h> pulls in the XSAVE
| + * intrinsics on GCC/Clang.
| + */
| +#include <immintrin.h>
| +#endif
| static void _cpu_check_features(void) |
| { |
| int x86_cpu_has_sse2; |
| @@ -164,6 +169,10 @@ static void _cpu_check_features(void) |
| x86_cpu_enable_simd = x86_cpu_has_sse2 && |
| x86_cpu_has_sse42 && |
| x86_cpu_has_pclmulqdq; |
| + |
| +#ifdef CRC32_SIMD_AVX512_PCLMUL |
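| + /* _xgetbv(0) reads XCR0; bit 6 (0x40) is set when the OS saves the
| +  * upper halves of the ZMM registers across context switches. Note
| +  * that this assumes, rather than verifies via CPUID, that the CPU
| +  * itself supports AVX512F and VPCLMULQDQ.
| +  */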
| + x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040; |
| +#endif |
| } |
| #endif |
| #endif |
| diff --git a/cpu_features.h b/cpu_features.h |
| index 279246c..aed3e83 100644 |
| --- a/cpu_features.h |
| +++ b/cpu_features.h |
| @@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull; |
| extern int x86_cpu_enable_sse2; |
| extern int x86_cpu_enable_ssse3; |
| extern int x86_cpu_enable_simd; |
| +extern int x86_cpu_enable_avx512; |
| |
| void cpu_check_features(void); |
| diff --git a/crc32.c b/crc32.c |
| index 4486098..acb6972 100644 |
| --- a/crc32.c |
| +++ b/crc32.c |
| @@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) |
| } |
| |
| #endif |
| -#if defined(CRC32_SIMD_SSE42_PCLMUL) |
| +#if defined(CRC32_SIMD_AVX512_PCLMUL) |
| + if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) { |
| + /* crc32 64-byte chunks */ |
| + z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK; |
| + crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc); |
| + /* check remaining data */ |
| + len -= chunk_size; |
| + if (!len) |
| + return crc; |
| + /* Fall into the default crc32 for the remaining data. */ |
| + buf += chunk_size; |
| + } |
| +#elif defined(CRC32_SIMD_SSE42_PCLMUL) |
| if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) { |
| /* crc32 16-byte chunks */ |
| z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK; |
| diff --git a/crc32_simd.c b/crc32_simd.c |
| index d80beba..7428270 100644 |
| --- a/crc32_simd.c |
| +++ b/crc32_simd.c |
| @@ -6,17 +6,207 @@ |
| */ |
| |
| #include "crc32_simd.h" |
| - |
| -#if defined(CRC32_SIMD_SSE42_PCLMUL) |
| +#if defined(CRC32_SIMD_AVX512_PCLMUL) |
| |
| /* |
| - * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
| - * length must be at least 64, and a multiple of 16. Based on: |
| + * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer |
| + * length must be at least 256, and a multiple of 64. Based on: |
| * |
| * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
| * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
| */ |
| |
| +#include <emmintrin.h> |
| +#include <smmintrin.h> |
| +#include <wmmintrin.h> |
| +#include <immintrin.h> |
| + |
| +uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */ |
| + const unsigned char *buf, |
| + z_size_t len, |
| + uint32_t crc) |
| +{ |
| + /* |
| + * Definitions of the bit-reflected domain constants k1,k2,k3,k4 |
| + * are similar to those given at the end of the paper; the remaining
| + * constants and the CRC32+Barrett polynomials are unchanged.
| + *
| + * Replace the index of x from 128 to 512, as follows:
| + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a |
| + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430 |
| + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 |
| + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 |
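| + *
| + * The k1,k2 pair folds each 128-bit lane forward by 2048 bits (the
| + * four-ZMM stride), while k3,k4 folds forward by 512 bits (one ZMM).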
| + */ |
| + static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430, |
| + 0x011542778a, 0x01322d1430, |
| + 0x011542778a, 0x01322d1430, |
| + 0x011542778a, 0x01322d1430 }; |
| + static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596, |
| + 0x0154442bd4, 0x01c6e41596, |
| + 0x0154442bd4, 0x01c6e41596, |
| + 0x0154442bd4, 0x01c6e41596 }; |
| + static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e }; |
| + static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 }; |
| + static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
| + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
| + __m128i a0, a1, a2, a3; |
| + |
| + /* |
| + * There's at least one block of 256. |
| + */ |
| + x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
| + x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
| + x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
| + x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
| + |
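| + /* XOR the initial CRC into the low 32 bits of the first vector. */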
| + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); |
| + |
| + x0 = _mm512_load_si512((__m512i *)k1k2); |
| + |
| + buf += 256; |
| + len -= 256; |
| + |
| + /* |
| + * Parallel fold blocks of 256, if any. |
| + */ |
| + while (len >= 256) |
| + { |
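| + /*
| + * Each iteration folds the four accumulators forward by 256 bytes:
| + * selector 0x00 multiplies the low 64-bit halves of each 128-bit
| + * lane by k1, selector 0x11 multiplies the high halves by k2, and
| + * both results are XORed with the next 256 bytes of input.
| + */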
| + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); |
| + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); |
| + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); |
| +
| + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); |
| + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); |
| + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); |
| + |
| + y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
| + y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
| + y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
| + y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
| + |
| + x1 = _mm512_xor_si512(x1, x5); |
| + x2 = _mm512_xor_si512(x2, x6); |
| + x3 = _mm512_xor_si512(x3, x7); |
| + x4 = _mm512_xor_si512(x4, x8); |
| + |
| + x1 = _mm512_xor_si512(x1, y5); |
| + x2 = _mm512_xor_si512(x2, y6); |
| + x3 = _mm512_xor_si512(x3, y7); |
| + x4 = _mm512_xor_si512(x4, y8); |
| + |
| + buf += 256; |
| + len -= 256; |
| + } |
| + |
| + /*
| + * Fold the four accumulators into a single 512-bit register,
| + * one ZMM (512 bits) at a time, using k3,k4.
| + */
| + x0 = _mm512_load_si512((__m512i *)k3k4); |
| + |
| + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| + x1 = _mm512_xor_si512(x1, x2); |
| + x1 = _mm512_xor_si512(x1, x5); |
| + |
| + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| + x1 = _mm512_xor_si512(x1, x3); |
| + x1 = _mm512_xor_si512(x1, x5); |
| + |
| + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| + x1 = _mm512_xor_si512(x1, x4); |
| + x1 = _mm512_xor_si512(x1, x5); |
| + |
| + /* |
| + * Single fold blocks of 64, if any. |
| + */ |
| + while (len >= 64) |
| + { |
| + x2 = _mm512_loadu_si512((__m512i *)buf); |
| + |
| + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| + x1 = _mm512_xor_si512(x1, x2); |
| + x1 = _mm512_xor_si512(x1, x5); |
| + |
| + buf += 64; |
| + len -= 64; |
| + } |
| + |
| + /* |
| + * Fold 512-bits to 384-bits. |
| + */ |
| + a0 = _mm_load_si128((__m128i *)k5k6); |
| + |
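| + /* k5,k6 fold a 128-bit chunk forward by 128 bits; from here on the
| + * four xmm lanes of x1 are reduced one at a time, as in the SSE4.2
| + * code path.
| + */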
| + a1 = _mm512_extracti32x4_epi32(x1, 0); |
| + a2 = _mm512_extracti32x4_epi32(x1, 1); |
| + |
| + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| + |
| + a1 = _mm_xor_si128(a1, a3); |
| + a1 = _mm_xor_si128(a1, a2); |
| + |
| + /* |
| + * Fold 384-bits to 256-bits. |
| + */ |
| + a2 = _mm512_extracti32x4_epi32(x1, 2); |
| + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| + a1 = _mm_xor_si128(a1, a3); |
| + a1 = _mm_xor_si128(a1, a2); |
| + |
| + /* |
| + * Fold 256-bits to 128-bits. |
| + */ |
| + a2 = _mm512_extracti32x4_epi32(x1, 3); |
| + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| + a1 = _mm_xor_si128(a1, a3); |
| + a1 = _mm_xor_si128(a1, a2); |
| + |
| + /* |
| + * Fold 128-bits to 64-bits. |
| + */ |
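| + /* Selector 0x10 multiplies the low 64 bits of a1 by k6 (the high
| + * qword of a0); a3 is a mask keeping the low 32 bits of each 64-bit
| + * element.
| + */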
| + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); |
| + a3 = _mm_setr_epi32(~0, 0, ~0, 0); |
| + a1 = _mm_srli_si128(a1, 8); |
| + a1 = _mm_xor_si128(a1, a2); |
| + |
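| + /* _mm_loadl_epi64 loads only k7 into the low qword; k8 is zero. */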
| + a0 = _mm_loadl_epi64((__m128i*)k7k8); |
| + a2 = _mm_srli_si128(a1, 4); |
| + a1 = _mm_and_si128(a1, a3); |
| + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| + a1 = _mm_xor_si128(a1, a2); |
| + |
| + /*
| + * Barrett reduce to 32 bits.
| + */
| + a0 = _mm_load_si128((__m128i*)poly); |
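| + /* poly[0] is the bit-reflected CRC32 polynomial P(x) and poly[1] is
| + * mu = floor(x^64 / P(x)), both in the same shifted representation
| + * as k1..k8 (see the Barrett reduction in the Intel paper).
| + */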
| + |
| + a2 = _mm_and_si128(a1, a3); |
| + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); |
| + a2 = _mm_and_si128(a2, a3); |
| + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); |
| + a1 = _mm_xor_si128(a1, a2); |
| + |
| + /*
| + * Return the crc32, which ends up in the second 32-bit element of a1.
| + */
| + return _mm_extract_epi32(a1, 1); |
| +} |
| + |
| +#elif defined(CRC32_SIMD_SSE42_PCLMUL) |
| + |
| +/* |
| + * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
| + * length must be at least 64, and a multiple of 16. |
| + */ |
| + |
| #include <emmintrin.h> |
| #include <smmintrin.h> |
| #include <wmmintrin.h> |
| diff --git a/crc32_simd.h b/crc32_simd.h |
| index c0346dc..8462464 100644 |
| --- a/crc32_simd.h |
| +++ b/crc32_simd.h |
| @@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf, |
| z_size_t len, |
| uint32_t crc); |
| |
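| +/*
| + * crc32_avx512_simd_(): buffer length must be at least 256 and a
| + * multiple of 64 (see Z_CRC32_AVX512_* below and the use in crc32.c).
| + */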
| +uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf, |
| + z_size_t len, |
| + uint32_t crc); |
| + |
| /* |
| * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c |
| * for computing the crc32 of an arbitrary length buffer. |
| */ |
| #define Z_CRC32_SSE42_MINIMUM_LENGTH 64 |
| #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15 |
| +#define Z_CRC32_AVX512_MINIMUM_LENGTH 256 |
| +#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63 |
| |
| /* |
| * CRC32 checksums using ARMv8-a crypto instructions. |
| -- |
| 2.34.1 |
| |