[zlib][asop] Complete USE_ZLIB_RABIN_KARP_ROLLING_HASH feature

If USE_ZLIB_RABIN_KARP_ROLLING_HASH build-time option is defined, use
the Rabin-Karp hash. This disables CRC SIMD hashing on ARM and Intel,
which will degrade compression speed.

However, the compressed output matches canonical zlib output, for the
same input, and that should resolve AOSP OTA issue 1316541.

To ensure the Rabin-Karp hash is used correctly in chromium zlib, the
first step is to go back to using canonical fill_window(). To do this
combine the ARM NEON and Intel SSE2 slide_hash() routines in a common
framework called slide_hash_simd(). Remove fill_window_sse.c and undo
deflate_read_buf() rename: name it back to canonical read_buf().

Change insert_string(): by default it uses CRC32C hashes on all ports
(ARM, Intel) so add code comments to state that. If Rabin-Karp hashes
are enabled, disable CRC32C hashes.

Add a new deflate internal state variable chromium_zlib_hash, used to
detect which type of hashing is enabled (Rabin-Karp, CRC32C). Set the
state variable in deflateInit2_ after cpu_check_features() detection,
with #ifdef guards matching the #ifdef logic of insert_string().

Change canonical fill_window() to insert hashes into {hash,prev} hash
chains based on hash type (Rabin-Karp, CRC32C). Prior to this change,
the ARM port was inserting Rabin-Karp hashes into hash chains even if
CRC32 hashing was active when s->insert was > 0.

Change longest_match() and deflate_fast(): update them to use the new
deflate state variable chromium_zlib_hash.

Compression performance degrades when Rabin-Karp hashing is used, but
is unchanged when CRC32C hashing is enabled (chromium zlib default).

Compat: if Rabin-Karp hashing is enabled, zlib-bench --check (built as
normal, and against canonical zlib 1.2.11) produces the same compressed
output for the snappy corpora, for the gzip, zlib, and raw types, at
compression levels 1..9.

Bug: 1316541
Change-Id: I0d5ee6240f0b7eac4653d60a29d459d994c3871f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3596671
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Noel Gordon <noel@chromium.org>
Reviewed-by: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#998062}
NOKEYCHECK=True
GitOrigin-RevId: 2bd100e46361a459b41a29212ea8f97a4837a06c
diff --git a/BUILD.gn b/BUILD.gn
index 49f52e1..3a71693 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -216,6 +216,7 @@
     sources = [
       "crc32_simd.c",
       "crc32_simd.h",
+      "crc_folding.c",
     ]
 
     if (!is_win || is_clang) {
@@ -229,38 +230,34 @@
   configs += [ ":zlib_internal_config" ]
 
   public_configs = [ ":zlib_crc32_simd_config" ]
+
   public_deps = [ ":zlib_common_headers" ]
 }
 
-config("zlib_x86_simd_config") {
+config("zlib_slide_hash_simd_config") {
   if (use_x86_x64_optimizations) {
-    defines = [
-      "CRC32_SIMD_SSE42_PCLMUL",
-      "DEFLATE_FILL_WINDOW_SSE2",
-    ]
+    defines = [ "DEFLATE_SLIDE_HASH_SSE2" ]
+  }
+
+  if (use_arm_neon_optimizations) {
+    defines = [ "DEFLATE_SLIDE_HASH_NEON" ]
   }
 }
 
-source_set("zlib_x86_simd") {
+source_set("zlib_slide_hash_simd") {
   visibility = [ ":*" ]
 
   if (use_x86_x64_optimizations) {
-    sources = [
-      "crc_folding.c",
-      "fill_window_sse.c",
-    ]
+    sources = [ "slide_hash_simd.h" ]
+  }
 
-    if (!is_win || is_clang) {
-      cflags = [
-        "-msse4.2",
-        "-mpclmul",
-      ]
-    }
+  if (use_arm_neon_optimizations) {
+    sources = [ "slide_hash_simd.h" ]
   }
 
   configs += [ ":zlib_internal_config" ]
 
-  public_configs = [ ":zlib_x86_simd_config" ]
+  public_configs = [ ":zlib_slide_hash_simd_config" ]
 
   public_deps = [ ":zlib_common_headers" ]
 }
@@ -330,20 +327,18 @@
     deps += [
       ":zlib_adler32_simd",
       ":zlib_inflate_chunk_simd",
+      ":zlib_slide_hash_simd",
     ]
 
     if (use_x86_x64_optimizations) {
       deps += [ ":zlib_crc32_simd" ]
     } else if (use_arm_neon_optimizations) {
-      sources += [ "contrib/optimizations/slide_hash_neon.h" ]
       deps += [ ":zlib_arm_crc32" ]
     }
   } else {
     sources += [ "inflate.c" ]
   }
 
-  deps += [ ":zlib_x86_simd" ]
-
   if (is_android) {
     import("//build/config/android/config.gni")
     if (defined(android_ndk_root) && android_ndk_root != "") {
diff --git a/contrib/optimizations/insert_string.h b/contrib/optimizations/insert_string.h
index 577d225..a7f24a3 100644
--- a/contrib/optimizations/insert_string.h
+++ b/contrib/optimizations/insert_string.h
@@ -5,11 +5,16 @@
  * found in the Chromium source repository LICENSE file.
  */
 
-#if defined(_MSC_VER)
+#ifndef INSERT_STRING_H
+#define INSERT_STRING_H
+
+#ifndef INLINE
+#if defined(_MSC_VER) && !defined(__clang__)
 #define INLINE __inline
 #else
 #define INLINE inline
 #endif
+#endif
 
 #include "cpu_features.h"
 
@@ -23,7 +28,8 @@
     #define TARGET_CPU_WITH_CRC
   #endif
 
-  #define _cpu_crc32_u32 _mm_crc32_u32
+  /* CRC32C uint32_t */
+  #define _cpu_crc32c_hash_u32 _mm_crc32_u32
 
 #elif defined(CRC32_ARMV8_CRC32)
   #if defined(__clang__)
@@ -40,7 +46,8 @@
     #define TARGET_CPU_WITH_CRC __attribute__((target("armv8-a,crc")))
   #endif  // defined(__aarch64__)
 
-  #define _cpu_crc32_u32 __crc32cw
+  /* CRC32C uint32_t */
+  #define _cpu_crc32c_hash_u32 __crc32cw
 
 #endif
 // clang-format on
@@ -58,12 +65,8 @@
   if (s->level >= 6)
     val &= 0xFFFFFF;
 
-  /* Unlike the case of data integrity checks for GZIP format where the
-   * polynomial used is defined (https://tools.ietf.org/html/rfc1952#page-11),
-   * here it is just a hash function for the hash table used while
-   * performing compression.
-   */
-  h = _cpu_crc32_u32(h, val);
+  /* Compute hash from the CRC32C of |val|. */
+  h = _cpu_crc32c_hash_u32(h, val);
 
   ret = s->head[h & s->hash_mask];
   s->head[h & s->hash_mask] = str;
@@ -88,7 +91,7 @@
 #endif
 
 /* ===========================================================================
- * Update a hash value with the given input byte (Rabin-Karp rolling hash)
+ * Update a hash value with the given input byte (Rabin-Karp rolling hash).
  * IN  assertion: all calls to UPDATE_HASH are made with consecutive input
  *    characters, so that a running hash key can be computed from the previous
  *    key instead of complete recalculation each time.
@@ -120,15 +123,16 @@
 }
 
 local INLINE Pos insert_string(deflate_state* const s, const Pos str) {
-/* insert_string_simd string dictionary insertion: this SIMD symbol hashing
+/* insert_string_simd string dictionary insertion: SIMD crc32c symbol hasher
  * significantly improves data compression speed.
  *
- * Note: the generated compressed output is a valid DEFLATE stream but will
- * differ from canonical zlib output ...
+ * Note: the generated compressed output is a valid DEFLATE stream, but will
+ * differ from canonical zlib output.
  */
-#if defined(CHROMIUM_ZLIB_NO_CASTAGNOLI)
-/* ... so this build-time option can be used to disable the SIMD symbol hasher.
- */ /* FALLTHOUGH */
+#if defined(USE_ZLIB_RABIN_KARP_ROLLING_HASH)
+/* So this build-time option can be used to disable the crc32c hash, and use
+ * the Rabin-Karp hash instead.
+ */ /* FALLTHROUGH Rabin-Karp */
 #elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
   if (x86_cpu_enable_simd)
     return insert_string_simd(s, str);
@@ -136,5 +140,7 @@
   if (arm_cpu_enable_crc32)
     return insert_string_simd(s, str);
 #endif
-  return insert_string_c(s, str);
+  return insert_string_c(s, str); /* Rabin-Karp */
 }
+
+#endif /* INSERT_STRING_H */
diff --git a/contrib/optimizations/slide_hash_neon.h b/contrib/optimizations/slide_hash_neon.h
deleted file mode 100644
index 26995d7..0000000
--- a/contrib/optimizations/slide_hash_neon.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2018 The Chromium Authors. All rights reserved.
- * Use of this source code is governed by a BSD-style license that can be
- * found in the Chromium source repository LICENSE file.
- */
-#ifndef __SLIDE_HASH__NEON__
-#define __SLIDE_HASH__NEON__
-
-#include "deflate.h"
-#include <arm_neon.h>
-
-inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
-                                                        const uInt hash_size,
-                                                        const ush w_size)
-{
-   /* NEON 'Q' registers allow to store 128 bits, so we can load 8x16-bits
-     * values. For further details, check:
-     * ARM DHT 0002A, section 1.3.2 NEON Registers.
-     */
-    const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t);
-    /* Unrolling the operation yielded a compression performance boost in both
-     * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4
-     * content. For full benchmarking data, check: http://crbug.com/863257.
-     */
-    const size_t stride = 2*chunk;
-    const uint16x8_t v = vdupq_n_u16(w_size);
-
-    for (Posf *end = hash + hash_size; hash != end; hash += stride) {
-        uint16x8_t m_low = vld1q_u16(hash);
-        uint16x8_t m_high = vld1q_u16(hash + chunk);
-
-        /* The first 'q' in vqsubq_u16 makes these subtracts saturate to zero,
-         * replacing the ternary operator expression in the original code:
-         * (m >= wsize ? m - wsize : NIL).
-         */
-        m_low = vqsubq_u16(m_low, v);
-        m_high = vqsubq_u16(m_high, v);
-
-        vst1q_u16(hash, m_low);
-        vst1q_u16(hash + chunk, m_high);
-    }
-}
-
-
-inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
-                                                 const unsigned short w_size,
-                                                 const uInt hash_size)
-{
-    /*
-     * SIMD implementation for hash table rebase assumes:
-     * 1. hash chain offset (Pos) is 2 bytes.
-     * 2. hash table size is multiple of 32 bytes.
-     * #1 should be true as Pos is defined as "ush"
-     * #2 should be true as hash_bits are greater than 7
-     */
-    const size_t size = hash_size * sizeof(head[0]);
-    Assert(sizeof(Pos) == 2, "Wrong Pos size.");
-    Assert((size % sizeof(uint16x8_t) * 2) == 0, "Hash table size error.");
-
-    neon_slide_hash_update(head, hash_size, w_size);
-#ifndef FASTEST
-    neon_slide_hash_update(prev, w_size, w_size);
-#endif
-}
-
-#endif
diff --git a/deflate.c b/deflate.c
index 1b3f631..09d1655 100644
--- a/deflate.c
+++ b/deflate.c
@@ -50,13 +50,15 @@
 /* @(#) $Id$ */
 #include <assert.h>
 #include "deflate.h"
-#include "cpu_features.h"
-#include "contrib/optimizations/insert_string.h"
 
-#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
-#include "contrib/optimizations/slide_hash_neon.h"
+#include "cpu_features.h"
+
+#if defined(DEFLATE_SLIDE_HASH_SSE2) || defined(DEFLATE_SLIDE_HASH_NEON)
+#include "slide_hash_simd.h"
 #endif
 
+#include "contrib/optimizations/insert_string.h"
+
 #ifdef FASTEST
 /* See http://crbug.com/1113596 */
 #error "FASTEST is not supported in Chromium's zlib."
@@ -97,7 +99,7 @@
 local void lm_init        OF((deflate_state *s));
 local void putShortMSB    OF((deflate_state *s, uInt b));
 local void flush_pending  OF((z_streamp strm));
-unsigned ZLIB_INTERNAL deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+local unsigned read_buf   OF((z_streamp strm, Bytef *buf, unsigned size));
 #ifdef ASMV
 #  pragma message("Assembler code may have bugs -- use at your own risk")
       void match_init OF((void)); /* asm code initialization */
@@ -191,10 +193,11 @@
 local void slide_hash(s)
     deflate_state *s;
 {
-#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
-    /* NEON based hash table rebase. */
-    return neon_slide_hash(s->head, s->prev, s->w_size, s->hash_size);
+#if defined(DEFLATE_SLIDE_HASH_SSE2) || defined(DEFLATE_SLIDE_HASH_NEON)
+    slide_hash_simd(s->head, s->prev, s->w_size, s->hash_size);
+    return;
 #endif
+
     unsigned n, m;
     Posf *p;
     uInt wsize = s->w_size;
@@ -311,8 +314,19 @@
     s->w_size = 1 << s->w_bits;
     s->w_mask = s->w_size - 1;
 
+    s->chromium_zlib_hash = 0;
+#if !defined(USE_ZLIB_RABIN_KARP_ROLLING_HASH)
+  #if defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
+    if (x86_cpu_enable_simd)
+      s->chromium_zlib_hash = 1;
+  #elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
+    if (arm_cpu_enable_crc32)
+      s->chromium_zlib_hash = 1;
+  #endif
+#endif
+
     s->hash_bits = memLevel + 7;
-    if ((x86_cpu_enable_simd || arm_cpu_enable_crc32) && s->hash_bits < 15) {
+    if (s->chromium_zlib_hash && s->hash_bits < 15) {
         s->hash_bits = 15;
     }
 
@@ -446,7 +460,7 @@
     /* when using zlib wrappers, compute Adler-32 for provided dictionary */
     if (wrap == 1)
         strm->adler = adler32(strm->adler, dictionary, dictLength);
-    s->wrap = 0;                    /* avoid computing Adler-32 in deflate_read_buf */
+    s->wrap = 0;                    /* avoid computing Adler-32 in read_buf */
 
     /* if dictionary would fill window, just replace the history */
     if (dictLength >= s->w_size) {
@@ -774,7 +788,7 @@
  * Flush as much pending output as possible. All deflate() output, except for
  * some deflate_stored() output, goes through this function so some
  * applications may wish to modify it to avoid allocating a large
- * strm->next_out buffer and copying into it. (See also deflate_read_buf()).
+ * strm->next_out buffer and copying into it. (See also read_buf()).
  */
 local void flush_pending(strm)
     z_streamp strm;
@@ -1210,7 +1224,7 @@
  * allocating a large strm->next_in buffer and copying from it.
  * (See also flush_pending()).
  */
-ZLIB_INTERNAL unsigned deflate_read_buf(strm, buf, size)
+local unsigned read_buf(strm, buf, size)
     z_streamp strm;
     Bytef *buf;
     unsigned size;
@@ -1358,7 +1372,7 @@
          * necessary to put more guard bytes at the end of the window, or
          * to check more often for insufficient lookahead.
          */
-        if (!x86_cpu_enable_simd && !arm_cpu_enable_crc32) {
+        if (!s->chromium_zlib_hash) {
           Assert(scan[2] == match[2], "scan[2]?");
         } else {
           /* When using CRC hashing, scan[2] and match[2] may mismatch, but in
@@ -1398,7 +1412,7 @@
          * the hash keys are equal and that HASH_BITS >= 8.
          */
         scan += 2, match++;
-        if (!x86_cpu_enable_simd && !arm_cpu_enable_crc32) {
+        if (!s->chromium_zlib_hash) {
           Assert(*scan == *match, "match[2]?");
         } else {
           /* When using CRC hashing, scan[2] and match[2] may mismatch, but in
@@ -1547,20 +1561,7 @@
  *    performed for at least two bytes (required for the zip translate_eol
  *    option -- not supported here).
  */
-local void fill_window_c(deflate_state *s);
-
-local void fill_window(deflate_state *s)
-{
-#ifdef DEFLATE_FILL_WINDOW_SSE2
-    if (x86_cpu_enable_simd) {
-        fill_window_sse(s);
-        return;
-    }
-#endif
-    fill_window_c(s);
-}
-
-local void fill_window_c(s)
+local void fill_window(s)
     deflate_state *s;
 {
     unsigned n;
@@ -1614,10 +1615,24 @@
          */
         Assert(more >= 2, "more < 2");
 
-        n = deflate_read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
         s->lookahead += n;
 
         /* Initialize the hash value now that we have some input: */
+        if (s->chromium_zlib_hash) {
+            /* chromium hash reads 4 bytes */
+            if (s->lookahead + s->insert > MIN_MATCH) {
+                uInt str = s->strstart - s->insert;
+                while (s->insert) {
+                    insert_string(s, str);
+                    str++;
+                    s->insert--;
+                    if (s->lookahead + s->insert <= MIN_MATCH)
+                        break;
+                }
+            }
+        } else
+        /* Initialize the hash value now that we have some input: */
         if (s->lookahead + s->insert >= MIN_MATCH) {
             uInt str = s->strstart - s->insert;
             s->ins_h = s->window[str];
@@ -1803,7 +1818,7 @@
          * the check value.
          */
         if (len) {
-            deflate_read_buf(s->strm, s->strm->next_out, len);
+            read_buf(s->strm, s->strm->next_out, len);
             s->strm->next_out += len;
             s->strm->avail_out -= len;
             s->strm->total_out += len;
@@ -1871,7 +1886,7 @@
     if (have > s->strm->avail_in)
         have = s->strm->avail_in;
     if (have) {
-        deflate_read_buf(s->strm, s->window + s->strstart, have);
+        read_buf(s->strm, s->window + s->strstart, have);
         s->strstart += have;
         s->insert += MIN(have, s->w_size - s->insert);
     }
@@ -1978,14 +1993,17 @@
             {
                 s->strstart += s->match_length;
                 s->match_length = 0;
-                s->ins_h = s->window[s->strstart];
-                UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+
+                if (!s->chromium_zlib_hash) {
+                  s->ins_h = s->window[s->strstart];
+                  UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
 #if MIN_MATCH != 3
-                Call UPDATE_HASH() MIN_MATCH-3 more times
+                  Call UPDATE_HASH() MIN_MATCH-3 more times
 #endif
-                /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
-                 * matter since it will be recomputed at next deflate call.
-                 */
+                  /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
+                   * matter since it will be recomputed at next deflate call.
+                   */
+                }
             }
         } else {
             /* No match, output a literal byte */
diff --git a/deflate.h b/deflate.h
index 5510d74..1407931 100644
--- a/deflate.h
+++ b/deflate.h
@@ -151,6 +151,11 @@
      *   hash_shift * MIN_MATCH >= hash_bits
      */
 
+    uInt chromium_zlib_hash;
+    /* 0 if Rabin-Karp rolling hash is enabled, non-zero if chromium zlib
+     * hash is enabled.
+     */
+
     long block_start;
     /* Window position at the beginning of the current output block. Gets
      * negative when the window is moved backwards.
@@ -351,6 +356,4 @@
                                  long len);
 unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
 
-void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
-
 #endif /* DEFLATE_H */
diff --git a/fill_window_sse.c b/fill_window_sse.c
deleted file mode 100644
index a841c99..0000000
--- a/fill_window_sse.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Fill Window with SSE2-optimized hash shifting
- *
- * Copyright (C) 2013 Intel Corporation
- * Authors:
- *  Arjan van de Ven    <arjan@linux.intel.com>
- *  Jim Kukunas         <james.t.kukunas@linux.intel.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "deflate.h"
-
-#ifdef DEFLATE_FILL_WINDOW_SSE2
-
-#define UPDATE_HASH(s,h,i) \
-    {\
-        if (s->level < 6) { \
-            h = (3483 * (s->window[i]) +\
-                 23081* (s->window[i+1]) +\
-                 6954 * (s->window[i+2]) +\
-                 20947* (s->window[i+3])) & s->hash_mask;\
-        } else {\
-            h = (25881* (s->window[i]) +\
-                 24674* (s->window[i+1]) +\
-                 25811* (s->window[i+2])) & s->hash_mask;\
-        }\
-    }\
-
-extern int deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
-
-#include <immintrin.h>
-
-void fill_window_sse(deflate_state *s)
-{
-    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
-
-    register unsigned n;
-    register Posf *p;
-    unsigned more;    /* Amount of free space at the end of the window. */
-    uInt wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
-
-        /* Deal with !@#$% 64K limit: */
-        if (sizeof(int) <= 2) {
-            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
-                more = wsize;
-
-            } else if (more == (unsigned)(-1)) {
-                /* Very unlikely, but possible on 16 bit machine if
-                 * strstart == 0 && lookahead == 1 (input done a byte at time)
-                 */
-                more--;
-            }
-        }
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize+MAX_DIST(s)) {
-
-            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
-            s->match_start -= wsize;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= (long) wsize;
-
-            /* Slide the hash table (could be avoided with 32 bit values
-               at the expense of memory usage). We slide even when level == 0
-               to keep the hash table consistent if we switch back to level > 0
-               later. (Using level 0 permanently is not an optimal usage of
-               zlib, so we don't care about this pathological case.)
-             */
-            n = s->hash_size;
-            p = &s->head[n];
-            p -= 8;
-            do {
-                __m128i value, result;
-
-                value = _mm_loadu_si128((__m128i *)p);
-                result = _mm_subs_epu16(value, xmm_wsize);
-                _mm_storeu_si128((__m128i *)p, result);
-
-                p -= 8;
-                n -= 8;
-            } while (n > 0);
-
-            n = wsize;
-#ifndef FASTEST
-            p = &s->prev[n];
-            p -= 8;
-            do {
-                __m128i value, result;
-
-                value = _mm_loadu_si128((__m128i *)p);
-                result = _mm_subs_epu16(value, xmm_wsize);
-                _mm_storeu_si128((__m128i *)p, result);
-
-                p -= 8;
-                n -= 8;
-            } while (n > 0);
-#endif
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0) break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = deflate_read_buf(s->strm,
-                             s->window + s->strstart + s->lookahead,
-                             more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead >= MIN_MATCH) {
-            uInt str = s->strstart;
-            s->ins_h = s->window[str];
-            if (str >= 1)
-                UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
-#if MIN_MATCH != 3
-            Call UPDATE_HASH() MIN_MATCH-3 more times
-#endif
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        ulg curr = s->strstart + (ulg)(s->lookahead);
-        ulg init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            zmemzero(s->window + curr, (unsigned)init);
-            s->high_water = curr + init;
-        }
-        else if (s->high_water < (ulg)curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = (ulg)curr + WIN_INIT - s->high_water;
-            if (init > s->window_size - s->high_water)
-                init = s->window_size - s->high_water;
-            zmemzero(s->window + s->high_water, (unsigned)init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
-           "not enough room for search");
-}
-
-#endif  /* DEFLATE_FILL_WINDOW_SSE2 */
diff --git a/slide_hash_simd.h b/slide_hash_simd.h
new file mode 100644
index 0000000..3b2e463
--- /dev/null
+++ b/slide_hash_simd.h
@@ -0,0 +1,115 @@
+/* slide_hash_simd.h
+ *
+ * Copyright 2022 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#ifndef SLIDE_HASH_SIMD_H
+#define SLIDE_HASH_SIMD_H
+
+#include "deflate.h"
+
+#ifndef INLINE
+#if defined(_MSC_VER) && !defined(__clang__)
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+#endif
+
+#if defined(CPU_NO_SIMD)
+
+#error SIMD has been disabled for your build target
+
+#elif defined(DEFLATE_SLIDE_HASH_SSE2)
+
+#include <emmintrin.h>  /* SSE2 */
+
+#define Z_SLIDE_INIT_SIMD(wsize) _mm_set1_epi16((ush)(wsize))
+
+#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
+    for (const Posf* const end = table + size; table != end;) { \
+        __m128i vO = _mm_loadu_si128((__m128i *)(table + 0)); \
+        vO = _mm_subs_epu16(vO, vector_wsize); \
+        _mm_storeu_si128((__m128i *)(table + 0), vO); \
+        table += 8; \
+    }
+
+typedef __m128i z_vec128i_u16x8_t;
+
+#elif defined(DEFLATE_SLIDE_HASH_NEON)
+
+#include <arm_neon.h>  /* NEON */
+
+#define Z_SLIDE_INIT_SIMD(wsize) vdupq_n_u16((ush)(wsize))
+
+#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
+    for (const Posf* const end = table + size; table != end;) { \
+        uint16x8_t vO = vld1q_u16(table + 0); \
+        uint16x8_t v8 = vld1q_u16(table + 8); \
+        vO = vqsubq_u16(vO, vector_wsize); \
+        v8 = vqsubq_u16(v8, vector_wsize); \
+        vst1q_u16(table + 0, vO); \
+        vst1q_u16(table + 8, v8); \
+        table += 8 + 8; \
+    }
+
+typedef uint16x8_t z_vec128i_u16x8_t;
+
+#else
+
+#error slide_hash_simd is not defined for your build target
+
+#endif
+
+/* ===========================================================================
+ * Slide the hash table when sliding the window down (could be avoided with 32
+ * bit values at the expense of memory usage). We slide even when level == 0 to
+ * keep the hash table consistent if we switch back to level > 0 later.
+ */
+local INLINE void slide_hash_simd(
+    Posf *head, Posf *prev, const uInt w_size, const uInt hash_size) {
+    /*
+     * The SIMD implementation of the hash table slider assumes:
+     *
+     * 1. hash chain offset is 2 bytes. Should be true as Pos is "ush" type.
+     */
+    Assert(sizeof(Pos) == 2, "Pos type size error: should be 2 bytes");
+    Assert(sizeof(ush) == 2, "ush type size error: should be 2 bytes");
+
+    Assert(hash_size == (ush)hash_size, "Hash table size error");
+    Assert(w_size == (ush)w_size, "Prev table size error");
+
+    /*
+     * 2. The hash & prev table sizes are a multiple of 32 bytes (256 bits),
+     * since the NEON table slider moves two 128-bit items per loop (loop is
+     * unrolled on NEON for performance, see http://crbug.com/863257).
+     */
+    Assert(!((hash_size * sizeof(head[0])) & (32 - 1)),
+        "Hash table size error: should be a multiple of 32 bytes");
+    Assert(!((w_size * sizeof(prev[0])) & (32 - 1)),
+        "Prev table size error: should be a multiple of 32 bytes");
+
+    /*
+     * Duplicate (ush)w_size in each uint16_t component of a 128-bit vector.
+     */
+    const z_vec128i_u16x8_t vec_wsize = Z_SLIDE_INIT_SIMD(w_size);
+
+    /*
+     * Slide {head,prev} hash chain values: subtracts (ush)w_size from every
+     * value with a saturating SIMD subtract, to clamp the result to 0(NIL),
+     * to implement slide_hash() `(m >= wsize ? m - wsize : NIL);` code.
+     */
+    Z_SLIDE_HASH_SIMD(head, hash_size, vec_wsize);
+#ifndef FASTEST
+    Z_SLIDE_HASH_SIMD(prev, w_size, vec_wsize);
+#endif
+
+}
+
+#undef z_vec128i_u16x8_t
+#undef Z_SLIDE_HASH_SIMD
+#undef Z_SLIDE_INIT_SIMD
+
+#endif  /* SLIDE_HASH_SIMD_H */