// src/dsp/dsp.h - codecs/libwebp2 - Git at Google

 // Copyright 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // -----------------------------------------------------------------------------
 //
 //   Speed-critical functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #ifndef WP2_DSP_DSP_H_
 #define WP2_DSP_DSP_H_

 #ifdef HAVE_CONFIG_H
 #include "src/wp2/config.h"
 #endif

 #include "src/wp2/base.h"
 #include "src/wp2/format_constants.h"

 //------------------------------------------------------------------------------
 // CPU detection

 // We are inheriting constructors, so we need the following
 // minimal versions of the compilers.
 // Each check is done at compile time; for clang and gcc the version is packed
 // as (major << 8) | minor before being compared.
 #if defined(__clang__)
 // clang 3.3, cf https://clang.llvm.org/cxx_status.html
 static_assert(((__clang_major__ << 8) | __clang_minor__) >= ((3 << 8) | 3),
               "clang needs to be at version >= 3.3");
 #elif defined(__GNUC__)
 // gcc 4.8, cf https://gcc.gnu.org/projects/cxx-status.html
 static_assert(((__GNUC__ << 8) | __GNUC_MINOR__) >= ((4 << 8) | 8),
               "gcc needs to be at version >= 4.8");
 #elif defined(_MSC_VER)
 // MSVC 2015, cf https://msdn.microsoft.com/en-us/library/hh567368.aspx
 static_assert(_MSC_VER >= 1900, "Visual Studio needs to be at version >= 2015");
 // WP2_MSC_SSE is only defined for the x64 and x86 Visual C++ targets.
 #if defined(_M_X64) || defined(_M_IX86)
 #define WP2_MSC_SSE   // Visual C++ SSE4.2 targets
 #endif
 #endif

 // Fallback for compilers that do not provide __has_builtin(): every query then
 // evaluates to 0.
 #ifndef __has_builtin
 # define __has_builtin(x) 0
 #endif

 // WP2_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
 // Files containing intrinsics will need to be built targeting the instruction
 // set so should succeed on one of the earlier tests.
 //
 // WP2_USE_SSE is defined (and <smmintrin.h> included) when any of __SSE4_2__,
 // WP2_MSC_SSE or WP2_HAVE_SSE is defined.
 #if defined(__SSE4_2__) || defined(WP2_MSC_SSE) || defined(WP2_HAVE_SSE)
 #define WP2_USE_SSE    // incorporate all instructions up to SSE4.2 (inclusive)
 #include <smmintrin.h>

 // HAVE_AVX2 is dependent on HAVE_SSE
 // (this test is nested in the SSE one, so WP2_USE_AVX2 implies WP2_USE_SSE).
 #if defined(__AVX2__) || defined(WP2_HAVE_AVX2)
 #define WP2_USE_AVX2
 #include <immintrin.h>
 #endif  // AVX2

 #endif  // SSE

 // NEON detection. WP2_USE_NEON ends up defined for:
 //  - __ARM_NEON__, WP2_ANDROID_NEON, __aarch64__ or WP2_HAVE_NEON builds that
 //    are not Native Client ones,
 //  - Visual C++ ARM builds (which additionally get WP2_USE_INTRINSICS).
 #if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
 #define WP2_ANDROID_NEON  // Android targets that might support NEON
 #endif

 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
 #if (defined(__ARM_NEON__) || defined(WP2_ANDROID_NEON) || \
      defined(__aarch64__) || defined(WP2_HAVE_NEON)) && \
     !defined(__native_client__)
 #define WP2_USE_NEON
 #endif

 #if defined(_MSC_VER) && defined(_M_ARM)
 #define WP2_USE_NEON
 #define WP2_USE_INTRINSICS
 #endif

 // <arm_neon.h> is only pulled in when NEON was selected above.
 #if defined(WP2_USE_NEON)
 #include <arm_neon.h>
 #endif

 // MIPS detection.
 // WP2_USE_MIPS32 requires a non-64-bit MIPS target with an ISA revision in
 // [1, 6). WP2_USE_MIPS32_R2 additionally requires revision >= 2, and
 // WP2_USE_MIPS_DSP_R2 requires, on top of that, __mips_dspr2 or a DSP
 // revision >= 2.
 #if defined(__mips__) && !defined(__mips64) && \
     defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WP2_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WP2_USE_MIPS32_R2
 #if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
 #define WP2_USE_MIPS_DSP_R2
 #endif
 #endif
 #endif

 // WP2_USE_MSA requires __mips_msa and an ISA revision >= 5.
 #if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
 #define WP2_USE_MSA
 #endif

 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 // It expands to nothing by default. It becomes the 'no_sanitize_thread'
 // attribute only when __has_feature() exists and reports thread_sanitizer.
 #define WP2_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
 #if __has_feature(thread_sanitizer)
 #undef WP2_TSAN_IGNORE_FUNCTION
 #define WP2_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
 #endif
 #endif

 // Both macros below expand to nothing by default. They are only redefined as
 // 'no_sanitize' attributes for clang, when __has_attribute(no_sanitize) holds.
 #define WP2_UBSAN_IGNORE_UNDEF
 #define WP2_UBSAN_IGNORE_UNSIGNED_OVERFLOW
 #if defined(__clang__) && defined(__has_attribute)
 #if __has_attribute(no_sanitize)
 // This macro prevents the undefined behavior sanitizer from reporting
 // failures. This is only meant to silence unaligned loads on platforms that
 // are known to support them.
 #undef WP2_UBSAN_IGNORE_UNDEF
 #define WP2_UBSAN_IGNORE_UNDEF \
   __attribute__((no_sanitize("undefined")))

 // This macro prevents the undefined behavior sanitizer from reporting
 // failures related to unsigned integer overflows. This is only meant to
 // silence cases where this well defined behavior is expected.
 #undef WP2_UBSAN_IGNORE_UNSIGNED_OVERFLOW
 #define WP2_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
   __attribute__((no_sanitize("unsigned-integer-overflow")))
 #endif
 #endif

 // Endianness code from WebP.
 // Some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__).
 // WP2_WORDS_BIGENDIAN is left untouched if already defined; otherwise it is
 // defined when __BIG_ENDIAN__ or _M_PPC is defined, or when __BYTE_ORDER__
 // equals __ORDER_BIG_ENDIAN__.
 #if !defined(WP2_WORDS_BIGENDIAN) &&               \
     (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
      (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
 #define WP2_WORDS_BIGENDIAN
 #endif

 // Identifiers of the CPU features that can be tested for.
 // Values are implicit and consecutive, starting at 0: keep the order.
 typedef enum {
   kSSE,  // everything up to 4.2
   kAVX,
   kAVX2,
   kNEON,
   kMIPS32,
   kMIPSdspR2,
   kMSA,
   // some particular fine-grained CPU types
   kSSE2,
   kSSE3,
   kSSE4_1,
   kSSE4_2,
   // special feature for slow SSSE3 architectures
   kSlowSSSE3,
 } WP2CPUFeature;

 // Signature of the CPU query function: returns true if the CPU supports the
 // 'feature'.
 typedef bool (*WP2CPUInfo)(WP2CPUFeature feature);
 // Global pointer to the query function (defined elsewhere).
 WP2_EXTERN WP2CPUInfo WP2GetCPUInfo;
 // Signals that the WP2CPUInfo type and WP2GetCPUInfo are declared.
 #define WP2_CPUINFO_IS_DEFINED

 //------------------------------------------------------------------------------
 // Init stub generator

 // Defines an init function stub to ensure each module exposes a symbol,
 // avoiding a compiler warning.
 // Expands to a declaration of 'func' followed by an empty definition of it,
 // marked with WP2_TSAN_IGNORE_FUNCTION.
 #define WP2_DSP_INIT_STUB(func) \
   extern void func(); \
   WP2_TSAN_IGNORE_FUNCTION void func() {}

 //------------------------------------------------------------------------------
 // Encoding

 // Fixed-point precision (number of fractional bits) used for quantization.
 static constexpr uint32_t WP2QBits = 16;

 // Returns (v * iq + bias) >> WP2QBits, truncated to 16 bits. The intermediate
 // computation is carried out on 64 bits.
 static inline int16_t WP2Quantize16b(uint32_t v, uint32_t iq, uint32_t bias) {
   const uint64_t scaled = static_cast<uint64_t>(v) * iq + bias;
   return static_cast<int16_t>(scaled >> WP2QBits);
 }
 // All quant / iquant / bias matrices have the same stride. It must be at least
 // WP2::kMaxBlockSizePix, which is checked at compile time below.
 static constexpr uint32_t WP2QStride = 32u;
 static_assert(WP2QStride >= WP2::kMaxBlockSizePix, "WP2QMaxW stride too short");

 // Quantize: coeffs[] = (res[] * iq[] + bias) >> WP2QBits (with sign handling).
 // iq[], res[] and coeffs[] are assumed to be pointing to at
 // least 8 elements (uint32_t, int32_t or int16_t) of memory underneath.
 // NOTE(review): 'W' and 'H' are presumably the block dimensions — confirm.
 extern void (*WP2Quantize)(const uint32_t iq[], uint32_t bias,
                            const int32_t res[], int16_t coeffs[], uint32_t W,
                            uint32_t H);
 // Performs unclamped out[] = in[] * dequants[] for i in [0..len), and pads the
 // rest of out[] with 0s. It's assumed that in[] is padded with 0s for i >= len.
 // It is the caller's responsibility to ensure that in[] * dequants[] will not
 // overflow 16b storage. No clamping is to be expected.
 extern void (*WP2Dequantize)(const int16_t in[], const int16_t dequants[],
                              int16_t out[],
                              uint32_t W, uint32_t H, uint32_t len);

 // NOTE(review): presumably sets up WP2Quantize / WP2Dequantize — confirm.
 void WP2QuantizeInit();

 // must be called before using any of the above
 void WP2EncDspInit();

 //------------------------------------------------------------------------------
 // Decoding

 // Must be called before anything using the above.
 // NOTE(review): nothing decoder-specific is declared above in this section;
 // 'the above' presumably refers to the DSP functions in general — confirm.
 void WP2DecDspInit();

 //------------------------------------------------------------------------------
 // Common utils

 // Signature of the sample format converters: reads from 'src', writes to 'dst'.
 // NOTE(review): 'width' is presumably a number of pixels — confirm.
 typedef void (*WP2ArgbConverterF)(const void* src, uint32_t width, void* dst);
 // Functions to convert from...
 // (each table holds WP2_FORMAT_NUM entries, presumably indexed by
 // WP2SampleFormat — confirm.)
 // any format -> Argb
 extern WP2ArgbConverterF WP2ArgbConvertFrom[WP2_FORMAT_NUM];
 // any format -> ARGB
 extern WP2ArgbConverterF WP2ARGBConvertFrom[WP2_FORMAT_NUM];
 // Argb -> any format
 extern WP2ArgbConverterF WP2ArgbConvertTo[WP2_FORMAT_NUM];
 // ARGB -> any format
 extern WP2ArgbConverterF WP2ARGBConvertTo[WP2_FORMAT_NUM];
 // Returns the function for converting between the two formats. At least one of
 // the two formats must be Argb or ARGB, returns null otherwise.
 WP2ArgbConverterF WP2ConversionFunction(WP2SampleFormat from,
                                         WP2SampleFormat to);

 // NOTE(review): presumably fills the four tables above — confirm.
 void WP2ArgbConverterInit();

 // Applies the matrix mtx[] to (y,u,v)[] + offset[].
 // mtx[] is assumed to be 12-bit fixed-point precision matrix.
 // The three output channels are written to separate planes dst0, dst1, dst2.
 extern void (*WP2YuvToCustom)(
   const int16_t* y, const int16_t* u, const int16_t* v,
   const int16_t offset[3], const int16_t mtx[9],
   int16_t* dst0, int16_t* dst1, int16_t* dst2, uint32_t width);

 // Same but outputs to ARGB (alternated channels).
 // The 'Ayuv' variant takes an additional alpha plane 'a'; the 'Yuv' variant
 // is for opaque input.
 typedef void (*WP2AyuvToArgbFunc)(
   const int16_t* y, const int16_t* u, const int16_t* v, const int16_t* a,
   const int16_t avg[3], const int16_t mtx[9], void* argb, uint32_t width);
 typedef void (*WP2YuvToArgbFunc)(
   const int16_t* y, const int16_t* u, const int16_t* v,
   const int16_t avg[3], const int16_t mtx[9], void* argb, uint32_t width);

 // 8-bit output
 extern WP2AyuvToArgbFunc WP2AyuvToArgb32;  // Convert to pre-multiplied Argb
 extern WP2AyuvToArgbFunc WP2AyuvToARGB32;  // Convert to  un-multiplied ARGB
 extern WP2AyuvToArgbFunc WP2AyuvToXRGB32;  // Convert to  un-multiplied XRGB
 extern WP2YuvToArgbFunc WP2YuvToArgb32;    // Convert to Argb (opaque input)
 // 10-bit output
 extern WP2AyuvToArgbFunc WP2AyuvToArgb38;  // Convert to pre-multiplied Argb
 extern WP2YuvToArgbFunc WP2YuvToArgb38;    // Convert to Argb (opaque input)

 // NOTE(review): presumably sets up the converters above — confirm.
 void WP2CSPConverterInit();

 //------------------------------------------------------------------------------
 // ANS

 // ANS helpers: cumulative distribution update and symbol lookup.
 namespace WP2 {

 #define APROBA_MAX_SYMBOL 32u  // for now, hardcoded for 4, 8, 16 or 32.

 // Updates the discrete cumulative distribution function.
 // The number of elements in 'cdf_base', 'cdf_var' and 'cumul' must be a
 // multiple of 8 at least as big as 'n'.
 // 'cdf_base' and 'cdf_var' are read-only inputs; 'cumul' is not const.
 typedef void (*ANSUpdateCDFFunc)(uint32_t n, const uint16_t cdf_base[],
                                  const uint16_t cdf_var[], uint32_t mult,
                                  uint16_t cumul[]);
 extern ANSUpdateCDFFunc ANSUpdateCDF;

 // Returns the symbol in 'cumul' at 'proba' which is in [0, APROBA_MAX[.
 typedef uint32_t (*ANSGetSymbolFunc)(uint32_t max_symbol,
                                      const uint16_t cumul[], uint32_t proba);
 extern ANSGetSymbolFunc ANSGetSymbol;

 // NOTE(review): presumably sets up ANSUpdateCDF / ANSGetSymbol — confirm.
 void ANSInit();

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // Transforms

 // Available transform types. 'kNumTransforms' is not a transform: it is the
 // number of valid values and must stay last.
 enum WP2TransformType { kDct = 0, kAdst, kHadamard, kIdentity, kNumTransforms };
 // Printable names, indexed by WP2TransformType.
 const char* const WP2TransformNames[] = {"DCT", "ADST", "Hadamard", "Identity"};
 // Catches at compile time a transform added without its name (or vice versa),
 // which would otherwise lead to an out-of-bounds read when printing.
 static_assert(sizeof(WP2TransformNames) / sizeof(WP2TransformNames[0]) ==
                   kNumTransforms,
               "WP2TransformNames[] must have one entry per WP2TransformType");

 // Transposition of a 'w' x 'h' area from 'in' to 'out'.
 // The forward path works on 32b samples, the inverse path on 16b samples.
 typedef void (*WP2FwdTransposeF)(const int32_t* in, uint32_t w, uint32_t h,
                                  int32_t* out);
 typedef void (*WP2InvTransposeF)(const int16_t* in, uint32_t w, uint32_t h,
                                  int16_t* out);

 // In-place transforms on coeffs[] (16b for inverse, 32b for forward).
 typedef void (*WP2InvTransformF)(int16_t* coeffs);
 typedef void (*WP2FwdTransformF)(int32_t* coeffs);

 // Same, for the column-wise variants taking an extra 'w' parameter.
 typedef void (*WP2InvTransformColF)(int16_t* coeffs, uint32_t w);
 typedef void (*WP2FwdTransformColF)(int32_t* coeffs, uint32_t w);

 // indexed on size=2,4,8,16,32
 extern WP2FwdTransformF WP2FwdDct[5], WP2FwdAdst[5], WP2FwdHadamard[5];
 extern WP2InvTransformF WP2InvDct[5], WP2InvAdst[5], WP2InvHadamard[5];
 // Column-wise 2D transforms, indexed on log2(size) vertically / horizontally
 extern WP2InvTransformColF WP2InvDctCol[5][5];
 extern WP2FwdTransformColF WP2FwdDctCol[5][5];

 extern WP2InvTransposeF WP2Transpose16b;
 extern WP2FwdTransposeF WP2Transpose32b;

 // NOTE(review): presumably sets up the tables and pointers above — confirm.
 void WP2TransformInit();

 // Entry point for 1D transforms. Repeats for 'num_rows' consecutive rows.
 void WP2InvTransform(int16_t coeffs[], WP2TransformType type, uint32_t size,
                      uint32_t num_rows = 1);
 void WP2Transform(int32_t coeffs[], WP2TransformType type, uint32_t size,
                   uint32_t num_rows = 1);

 // 2D transforms, with separate transform types and sizes along x and y.
 // The inverse one works in-place on coeffs[], the forward one reads 16b src[]
 // and writes 32b dst[].
 // If 'reduced' is true, transforms at half the resolution.
 void WP2InvTransform2D(int16_t coeffs[], WP2TransformType tf_x,
                        WP2TransformType tf_y, uint32_t size_x, uint32_t size_y,
                        bool reduced = false);
 void WP2Transform2D(const int16_t src[], WP2TransformType tf_x,
                     WP2TransformType tf_y, uint32_t size_x, uint32_t size_y,
                     int32_t dst[], bool reduced = false);

 // Converts full transform coefficients array to half-transform ones.
 // coeffs[] is replaced by its top-left quadrant values divided by 4.
 // NOTE(review): the signature has 'src' and 'dst' and no 'coeffs'; presumably
 // dst[] receives the reduced version of src[] — confirm.
 void WP2ReduceCoeffs(const int32_t src[], uint32_t size_x, uint32_t size_y,
                      int32_t dst[]);

 // computes (int32)(256.f * sum_i_j{ in[i + 8 * j] * cos_x[i] * cos_y[j] })
 // Warning! Do not expect bit-exact same results amongst several
 // implementations, because of rounding order of floating-point computations.
 // E.g. the SSE version is different (off-by-one) from the C-version in ~1% of
 // cases.
 extern int32_t (*WP2SlowDct8x8)(const int32_t in[64],
                                 const float cos_x[8], const float cos_y[8]);

 // Convenience function for 1D in-place fwd-transform. Only for testing!
 // (16b overload of the 32b WP2Transform() entry point declared above.)
 void WP2Transform(int16_t coeffs[], WP2TransformType type, uint32_t size);

 //------------------------------------------------------------------------------
 // Deblocking filter

 // Deblocking filter constants and function pointers.
 namespace WP2 {

 static constexpr uint32_t kDblkMaxStrength = 63;
 static constexpr uint32_t kDblkMaxSharpness = 7;
 // max number of filtered pixels on each side of the edge
 static constexpr uint32_t kDblkMaxHalf = 8;
 // Twice kDblkMaxHalf, i.e. both sides of the edge.
 static constexpr int kDblCacheStep = 2 * kDblkMaxHalf;   // ('step' of cache)

 // Copy a width x '2*height' area to and from cache, transposed.
 extern void (*FilterCopyIn)(const int16_t* src, uint32_t src_step,
                             int16_t* dst, int32_t width, int32_t height);
 extern void (*FilterCopyOut)(const int16_t* src, int16_t* dst,
                              uint32_t dst_step, int32_t width, int32_t height);

 // Applies the deblocking filter on a line of 2 * 'half' samples across
 // the edge.
 // The higher 'filter_strength' and the lower 'filter_sharpness' are, the
 // higher the variance threshold (under which the filter is applied) is.
 // The 'filter_strength' also influences the number of actually filtered
 // pixels and how much they are modified (for narrow filter only).
 // 'min' and 'max' are color boundaries.
 // The sharpness is not passed directly: 'threshold' is the value deduced from
 // it by DeblockThresholdFromSharpness().
 extern void (*DeblockLine)(uint32_t filter_strength, int32_t threshold,
                            uint8_t half, int32_t min, int32_t max, int16_t* q0);

 // Returns 1 if the line with half-size 'half' would be deblocked.
 extern uint8_t (*WouldDeblockLine)(int32_t threshold, uint32_t half,
                                    const int16_t q0[]);

 // Returns the number of pixels in [1:half] that are flat on each side of rows.
 // For half = 3,4, the samples q0[-4..3] must all be available for read.
 // For 4 < half <= 8, the samples q0[-8..7] must be readable.
 // The function processes 'size' lines.
 // NOTE(review): the function itself returns void; the lengths are presumably
 // stored in out[] — confirm.
 extern void (*MeasureFlatLengths)(uint32_t filter_sharpness, uint32_t half,
                                   const int16_t* q0, uint32_t step,
                                   uint8_t out[], uint32_t size);

 // Returns the sample threshold deduced from sharpness and precision bits (sign
 // included), to be passed to (Would)DeblockLine functions.
 int32_t DeblockThresholdFromSharpness(uint32_t filter_sharpness,
                                       uint32_t num_precision_bits);

 // To initialize the above.
 void DblkFilterInit();

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // Directional filter

 // Directional filter constants and function pointers.
 namespace WP2 {

 // There are 8 directions, 0 being lower-left/upper-right and 2 being
 // horizontal (with top-left origin).
 static constexpr uint32_t kDrctFltNumDirs = 8;

 // Filtering is applied on blocks of 8x8 pixels.
 static constexpr uint32_t kDrctFltSize = 8;

 // Kernel size in pixels (around the filtered pixel so 2+1+2 in total).
 static constexpr uint32_t kDrctFltTapDist = 2;

 // Tap position (x, y) for a given direction [0:7] and a given distance [0:1].
 // One row of two (x, y) pairs per direction, listed in direction order.
 static constexpr int8_t kDrctFltTapPos[kDrctFltNumDirs][kDrctFltTapDist][2] = {
     {{1, -1}, {2, -2}}, {{1, 0}, {2, -1}}, {{1, 0}, {2, 0}}, {{1, 0}, {2, 1}},
     {{1, 1}, {2, 2}},   {{0, 1}, {1, 2}},  {{0, 1}, {0, 2}}, {{0, 1}, {-1, 2}}};

 // NOTE(review): presumably sets up the Cdef* pointers below — confirm.
 void DrctFilterInit();

 // CDEF direction function signature. Section 7.15.2.
 // NOTE(review): the section number presumably refers to the AV1
 // specification — confirm.
 // |src| is a pointer to the source block. Pixel size is set by |bitdepth|
 // with |step| given in int16_t units. |direction| and |variance| are output
 // parameters and must not be nullptr.
 typedef void (*CdefDirectionFunc)(const int16_t* src, int32_t step,
                                   uint32_t bitdepth, uint32_t* const direction,
                                   uint32_t* const variance);
 extern CdefDirectionFunc CdefDirection4x4;
 extern CdefDirectionFunc CdefDirection8x8;

 // Fills the extended input buffer with unfiltered pixels or unknown values.
 // 'tmp' (with step 'tmp_step') is the only non-const buffer, hence the output.
 typedef void (*CdefPadFunc)(const int16_t* const src, int32_t src_step,
                             const int16_t* const left, int32_t left_step,
                             const int16_t* const top, int32_t top_step,
                             int32_t width, int32_t height, int32_t n_left,
                             int32_t n_right, int32_t n_top, int32_t n_bottom,
                             int16_t* tmp, int32_t tmp_step);
 extern CdefPadFunc CdefPad;

 // CDEF filtering function signature. Section 7.15.3.
 // |src| is a pointer to the padded input block.
 // |block_width|, |block_height| are the width/height in px of the input block.
 // |primary_strength|, |secondary_strength|, and |damping| are parameters.
 // |direction| is the filtering direction. |dst| is the output buffer.
 typedef void (*CdefFilteringFunc)(const int16_t* src, int32_t src_step,
                                   uint32_t bitdepth, int block_width,
                                   int block_height, int primary_strength,
                                   int secondary_strength, int damping,
                                   int direction, int16_t* const dst,
                                   int32_t dst_step);
 extern CdefFilteringFunc CdefFiltering;

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // Restoration filter

 // Restoration (Wiener) filter constants and function pointer.
 namespace WP2 {

 // Wiener kernel size in pixels (around the filtered pixel so 3+1+3 in total).
 // TODO(yguyon): Use a dynamic tap_dist (2 for chroma etc.)
 static constexpr uint32_t kWieFltTapDist = 3;
 static constexpr uint32_t kWieFltNumTaps = 2 * kWieFltTapDist + 1;
 static constexpr uint32_t kWieFltNumBitsTapWgts = 8;  // Including the sign.
 static constexpr uint32_t kWieFltNumBitsOverflow = 2;

 // Including the sign.
 // One entry per tap distance.
 static constexpr uint32_t kWieNumBitsPerTapWgt[kWieFltTapDist]{4, 5, 6};

 // Dimensions of the internal buffer for the Wiener filter.
 static constexpr uint32_t kWieFltWidth = kMaxTileSize;  // AV1 is 256*1.5 (why?)
 static constexpr uint32_t kWieFltHeight = 64;           // Like AV1.

 // Same dimensions, extended by the tap margin on each side.
 static constexpr uint32_t kWieFltBufWidth = kWieFltWidth + 2 * kWieFltTapDist;
 static constexpr uint32_t kWieFltBufHeight = kWieFltHeight + 2 * kWieFltTapDist;
 static constexpr uint32_t kWieFltBufSize = kWieFltBufHeight * kWieFltBufWidth;

 // NOTE(review): presumably sets up the WienerFilter pointer — confirm.
 void WienerFilterInit();

 // Fills 'full_tap_weights' from 'half_tap_weights' by symmetry and knowing it
 // must give a unit vector.
 void WienerHalfToFullWgts(const int32_t half_tap_weights[kWieFltTapDist],
                           int32_t full_tap_weights[kWieFltNumTaps]);

 // Applies the Wiener filter on  'dst' of dimensions 'width x height', which
 // must fit within the internal buffer (along with tap margins). Available
 // surrounding margins of size 'n_*' pixels should be at most 'kWieFltTapDist'.
 // 'left' can be nullptr or contain 'n_left x height' pixels. Same for 'right'.
 // 'top' can be nullptr or contain 'n_top x (n_left + height + n_right)' pixels.
 // Same for 'bottom'. The 'strength_map' must have the same dimension as 'dst'.
 // 'num_precision_bits' includes the sign.
 // Returns a WP2Status.
 typedef WP2Status (*WienerFilterF)(
     uint32_t width, uint32_t height, const int16_t* const left,
     size_t left_step, const int16_t* const right, size_t right_step,
     const int16_t* const top, size_t top_step, const int16_t* const bottom,
     size_t bottom_step, uint32_t n_left, uint32_t n_right, uint32_t n_top,
     uint32_t n_bottom, const int32_t tap_weights_h[kWieFltNumTaps],
     const int32_t tap_weights_v[kWieFltNumTaps],
     const uint8_t* const strength_map, size_t strength_step,
     uint32_t num_precision_bits, size_t dst_step, int16_t* const dst);

 extern WienerFilterF WienerFilter;

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // Grain

 // Grain generation.
 namespace WP2 {

 // NOTE(review): presumably sets up AddGrain4x4 / GenerateGrain4x4 — confirm.
 void GrainFilterInit();

 // Forward declaration: only pointers to PseudoRNG are needed here.
 class PseudoRNG;
 // Both signatures take the random generator 'rng' and the 'amp' / 'freq'
 // parameters. AddGrainF operates on 'samples' (with 'step'), GenerateGrainF
 // writes to dst[].
 typedef void (*AddGrainF)(int16_t* samples, size_t step, PseudoRNG* const rng,
                           uint32_t amp, uint32_t freq);
 typedef void (*GenerateGrainF)(PseudoRNG* const rng, uint32_t amp,
                                uint32_t freq, int16_t dst[]);

 extern AddGrainF AddGrain4x4;
 extern GenerateGrainF GenerateGrain4x4;

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // Predictions

 namespace WP2L {
 // Predicts one pixel using the 4 pixel context and a sub-angle.
 // This is used in lossless but re-uses a lot of private lossy code.
 // The context is passed through 'left' and 'top'; the result goes to 'dst'.
 extern void SubAnglePredictor(uint8_t angle_idx, const int16_t* const left,
                               const int16_t* const top, int16_t* dst);
 }  // namespace WP2L

 namespace WP2 {

 // Predictor signatures: predict a 'bw' x 'bh' block into 'dst' from the
 // context 'ctx'.
 // 'step' is in *dst units, not bytes.
 typedef void (*LPredictF)(const int16_t* ctx, uint32_t bw, uint32_t bh,
                           int16_t min_value, int16_t max_value, int16_t* dst,
                           size_t step);
 typedef void (*AnglePredictorF)(uint8_t angle, const int16_t* ctx, uint32_t bw,
                                 uint32_t bh, int16_t* dst, size_t step);

 // generic predictors (dc, smooth, ...)
 extern LPredictF BasePredictors[BPRED_LAST];

 // Simple version of the generic directional predictors: interpolations are
 // closer to how AV1 does it.
 // Note that the block dimensions are passed as log2 values here.
 extern void SimpleAnglePredictor(uint8_t angle_idx, const int16_t ctx[],
                                  uint32_t log2_bw, uint32_t log2_bh,
                                  int16_t* dst, size_t step);

 // NOTE(review): presumably interpolates 'len' samples of 'src' with the
 // fractional position 'frac' — confirm.
 extern void (*AnglePredInterpolate)(const int16_t* src, int32_t frac,
                                     int16_t* dst, uint32_t len);

 // Number of angle subdivisions within a 22.5 degrees span.
 constexpr uint32_t kDirectionalMaxDelta = (2 * kDirectionalMaxAngleDeltaYA + 1);
 // Number of indexed angles: 10 base angles {23.0, 45.0, ..., 225}, each with
 // 7 deltas.
 constexpr uint32_t kNumDirectionalAngles = kAnglePredNum * kDirectionalMaxDelta;
 // angle_idx goes from 0 (12.86 degrees) to 69 (234.64 degrees).
 // Some notable values:
 constexpr uint32_t kAngle_45 = 10;
 constexpr uint32_t kAngle_67 = 17;
 constexpr uint32_t kAngle_90 = 24;
 constexpr uint32_t kAngle_113 = 31;
 constexpr uint32_t kAngle_135 = 38;
 constexpr uint32_t kAngle_157 = 45;
 constexpr uint32_t kAngle_180 = 52;
 constexpr uint32_t kAngle_225 = 66;
 // Consecutive multiples of 45 degrees must be 2 * kDirectionalMaxDelta indices
 // apart.
 static_assert(kAngle_90 - kAngle_45 == 2 * kDirectionalMaxDelta &&
                   kAngle_135 - kAngle_90 == 2 * kDirectionalMaxDelta &&
                   kAngle_180 - kAngle_135 == 2 * kDirectionalMaxDelta &&
                   kAngle_225 - kAngle_180 == 2 * kDirectionalMaxDelta,
               "bad kAngle");
 // Returns the type of context associated with 'angle_idx':
 //   above kAngle_180          -> kContextExtendLeft
 //   in [kAngle_90, kAngle_180] -> kContextSmall
 //   below kAngle_90           -> kContextExtendRight
 // Kept as a single return expression, as required for C++11 constexpr.
 static constexpr ContextType GetContextType(uint32_t angle_idx) {
   return (angle_idx > kAngle_180)   ? kContextExtendLeft
          : (angle_idx >= kAngle_90) ? kContextSmall
                                     : kContextExtendRight;
 }

 // Precalculation of 'fuse' weight. 'strength' in [0..1].
 // The table is a flat LargeWeightTableDim x LargeWeightTableDim float array.
 static constexpr uint32_t LargeWeightTableDim = 16u;
 typedef float LargeWeightTable[LargeWeightTableDim *  LargeWeightTableDim];
 extern void PrecomputeLargeWeightTable(float strength, LargeWeightTable table);

 // "fuse" weights computation. 'w,h,x,y' in pixels.
 // The weights are written to weights[], which has kMaxContextSize entries.
 extern uint32_t ComputeFuseWeights(uint32_t w, uint32_t h, int32_t x, int32_t y,
                                    const LargeWeightTable table,
                                    float weights[kMaxContextSize]);

 // Generic "fuse" prediction. 'bw,bh' in pixels.
 extern void BaseFusePredictor(const LargeWeightTable table, const int16_t ctx[],
                               uint32_t bw, uint32_t bh, int16_t* dst,
                               size_t step, int16_t min_value,
                               int16_t max_value);

 // Generic "Paeth" prediction.
 extern void BasePaethPredictor(const int16_t* ctx, uint32_t bw, uint32_t bh,
                                int16_t min_value, int16_t max_value,
                                int16_t* dst, size_t step);

 // Subtract prediction 'pred' from 'src'. 'len' is assumed to be multiple of 4.
 // The 'src - pred' difference value is assumed to fit in int16_t.
 extern void (*SubtractRow)(const int16_t src[], const int16_t pred[],
                            int16_t dst[], uint32_t len);

 // Add prediction 'res' to 'src', clamp and store in 'dst'.
 // 'len' is assumed to be multiple of 4.
 // 'res[]' and 'res[] + src[]' is assumed to fit in int16_t.
 // 'min' and 'max' are the clamping bounds.
 extern void (*AddRow)(const int16_t src[], const int16_t res[],
                       int32_t min, int32_t max, int16_t dst[], uint32_t len);

 // Equivalent version for blocks instead of row (same restrictions)
 // Each buffer comes with its own step; 'height' is the number of rows.
 typedef void (*SubtractBlockFunc)(const int16_t src[], uint32_t src_step,
                                   const int16_t pred[], uint32_t pred_step,
                                   int16_t dst[], uint32_t dst_step,
                                   uint32_t height);
 typedef void (*AddBlockFunc)(const int16_t src[], uint32_t src_step,
                              const int16_t res[], uint32_t res_step,
                              int32_t min, int32_t max,
                              int16_t dst[], uint32_t dst_step,
                              uint32_t height);
 // in-place 16b version that performs dst[...] += res[...]
 typedef void (*AddBlockEqFunc)(const int16_t res[], uint32_t res_step,
                                int16_t dst[], uint32_t dst_step,
                                int32_t min, int32_t max, uint32_t height);

 // the array is indexed by log2(width / 4): 0=4, 1=8, 2=16, 3=32
 extern SubtractBlockFunc SubtractBlock[4];
 extern AddBlockFunc AddBlock[4];
 extern AddBlockEqFunc AddBlockEq[4];

 // Chroma-From-Luma and Alpha-From-Luma
 // precision of the linear a/b constants.
 // 'a' is in 1:2:12 format (hence, int16). The 'a' range is [-4.f, 4.f)
 // 'b' is in 1:11:12 format (int32). The 'b' range is [-1024.f, 1024.f]
 // Number of fractional bits of the fixed-point 'a' and 'b' constants.
 constexpr uint32_t kCflFracBits = 12;

 // Linear prediction over 'len' samples (assumed to be a multiple of 4):
 //   dst[] = clamp((a * src[] + b) >> kCflFracBits, min, max)
 extern void (*CflPredict)(const int16_t* src, int16_t a, int32_t b,
                           int16_t* dst, int16_t min, int16_t max, uint32_t len);

 // Same formula applied to the single sample 'v': returns
 // (a * v + b) >> kCflFracBits, clamped to [min, max].
 static inline int16_t CflScale(int16_t v, int16_t a, int32_t b,
                                int16_t min, int16_t max) {
   const int32_t scaled = ((int32_t)a * v + b) >> kCflFracBits;
   if (scaled < min) return min;
   if (scaled > max) return max;
   return (int16_t)scaled;
 }

 // Will initialize the above functions.
 // NOTE(review): presumably has to be called before any of them is used —
 // confirm.
 void PredictionInit();

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // Scoring

 // Block min/max helpers used for scoring.
 namespace WP2 {

 // Returns the min/max values of 4x4 blocks located at 'src'.
 // NOTE(review): presumably one min[] / max[] entry is written for each of the
 // 'num_blocks' blocks — confirm.
 extern void (*GetBlockMinMax)(const int16_t* src, uint32_t step,
                               uint32_t num_blocks,
                               int16_t min[], int16_t max[]);

 // returns the Min/Max values over a 5x5 block. It is assumed that the whole
 // 8x5 block memory is readable.
 extern void (*GetBlockMinMax_5x5)(const int16_t* src, uint32_t step,
                                   int16_t* const min, int16_t* const max);

 // Variant taking the block dimensions 'w' and 'h' as parameters.
 extern void GetBlockMinMaxGeneric(const int16_t* src, uint32_t step,
                                   uint32_t w, uint32_t h,
                                   int16_t* const min, int16_t* const max);

 // NOTE(review): presumably sets up the function pointers above — confirm.
 extern void ScoreDspInit();

 }  // namespace WP2

 //------------------------------------------------------------------------------
 // SSIM / PSNR

 // total size of the kernel: 2 * kWP2SSIMKernel + 1
 static constexpr uint32_t kWP2SSIMKernel = 3;
 static constexpr uint32_t kWP2SSIMWeightSum = 16 * 16;   // sum{kWeight}^2

 // Some implementations (like SSE2) will process more than the kernel's size of
 // data in the non-clipped fast version. This value specifies how many
 // RGBA samples must be readable when calling WP2SSIMGetFunc(), all
 // implementations considered.
 static constexpr uint32_t kWP2SSIMMargin = 8;
 // The margin must at least cover the full kernel.
 static_assert(kWP2SSIMMargin >= 2 * kWP2SSIMKernel + 1,
               "invalid kWP2SSIMMargin value");

 // Accumulator for the weighted statistical moments of two signals x and y.
 // Every field starts at zero.
 struct WP2DistoStats {
   // sum(w_i) : sum of weights
   uint32_t w = 0;
   // sum(w_i * x_i)
   int32_t xm = 0;
   // sum(w_i * y_i)
   int32_t ym = 0;
   // sum(w_i * x_i * x_i)
   uint64_t xxm = 0;
   // sum(w_i * y_i * y_i)
   uint64_t yym = 0;
   // sum(w_i * x_i * y_i)
   int64_t xym = 0;
 };

 // Computes the final SSIM value from the accumulated 'stats'.
 extern double WP2SSIMCalculation(uint32_t bit_depth,
                                  const WP2DistoStats& stats);
 // This version only computes the (contrast * saturation) part of the SSIM
 extern double WP2CsSSIMCalculation(uint32_t bit_depth,
                                    const WP2DistoStats& stats);

 // Returns the accumulated 'stats' of a (kWP2SSIMKernel*2+1)-sided pixel square.
 // 'src1' and 'src2' are absolute pointers to the top-left corner.
 // The result is written through 'stats'.
 typedef void (*WP2SSIMGetFunc)(const void* src1, size_t step1,
                                const void* src2, size_t step2,
                                WP2DistoStats* const stats);
 // Same as above but handles the canvas edges, where the square does not fit.
 // 'center_x,y' are the coordinates of the center of the square and
 // 'width,height' are the dimensions of the whole canvas.
 // Unlike the version above, the stats are returned by value.
 typedef WP2DistoStats (*WP2SSIMGetClippedFunc)(const void* src1, size_t step1,
                                                const void* src2, size_t step2,
                                                uint32_t center_x,
                                                uint32_t center_y,
                                                uint32_t width, uint32_t height);

 extern WP2SSIMGetFunc WP2SSIMGet4x8u;  // 4 channels, unsigned 8-bit depth.
 extern WP2SSIMGetFunc WP2SSIMGet8u;    // Single channel, unsigned 8-bit depth.
 extern WP2SSIMGetFunc WP2SSIMGet10s;   // Single channel, signed 11b or fewer.
 extern WP2SSIMGetFunc WP2SSIMGet12s;   // Single channel, signed 12b or more.
 extern WP2SSIMGetClippedFunc WP2SSIMGetClipped4x8u;  // Same but clipped.
 extern WP2SSIMGetClippedFunc WP2SSIMGetClipped8u;
 extern WP2SSIMGetClippedFunc WP2SSIMGetClipped10s;
 extern WP2SSIMGetClippedFunc WP2SSIMGetClipped12s;

 // NOTE(review): presumably sets up the WP2SSIMGet* pointers above — confirm.
 void WP2SSIMInit();

 //------------------------------------------------------------------------------
 // alpha-related functions

 // returns true if src[0..len) contains 'value'
 // (8b and 16b sample versions)
 extern bool (*WP2HasValue8b)(const uint8_t* src, size_t len, uint8_t value);
 extern bool (*WP2HasValue16b)(const int16_t* src, size_t len, int16_t value);

 // returns true if src[0..len) contains other values than 'value'
 extern bool (*WP2HasOtherValue8b)(
     const uint8_t* src, size_t len, uint8_t value);
 extern bool (*WP2HasOtherValue16b)(
     const int16_t* src, size_t len, int16_t value);

 // Returns true if src[4 * i + 0] is different from 'value'.
 // NOTE(review): presumably true if this holds for any i in [0..len) —
 // confirm.
 extern bool (*WP2HasOtherValue8b32b)(
     const uint8_t* src, size_t len, uint8_t value);

 // NOTE(review): presumably sets up the function pointers above — confirm.
 void WP2AlphaInit();

 //------------------------------------------------------------------------------

 // Returns the sum of squared error for a continuous row of samples.
 extern uint64_t (*WP2SumSquaredError8u)(const uint8_t* src1,
                                         const uint8_t* src2,
                                         uint32_t len);
 // the int16_t in src1[] and src2[] are assumed in range [-1024, 1023]
 extern uint64_t (*WP2SumSquaredError16s)(const int16_t* src1,
                                          const int16_t* src2,
                                          uint32_t len);
 // Accumulate 32b samples channels by channels. result[] is not reset to 0.
 extern void (*WP2SumSquaredError4x8u)(const uint8_t* src1,
                                       const uint8_t* src2,
                                       uint32_t len, uint64_t result[4]);
 // Accumulate 24b samples channels by channels. result[] is not reset to 0.
 extern void (*WP2SumSquaredError3x8u)(const uint8_t* src1, const uint8_t* src2,
                                       uint32_t len, uint64_t result[3]);
 // Accumulate 64b samples channels by channels. result[] is not reset to 0.
 extern void (*WP2SumSquaredError4x16u)(const uint16_t* src1,
                                        const uint16_t* src2, uint32_t len,
                                        uint64_t result[4]);
 // returns the SSE for a w x h block.
 // 16b input is assumed to be in range [-1024,1023].
 extern uint64_t (*WP2SumSquaredErrorBlock)(const int16_t* src1, uint32_t step1,
                                            const int16_t* src2, uint32_t step2,
                                            uint32_t w, uint32_t h);
 // Same, but after first downsampling the sources by half (2x2 average filter).
 extern uint64_t (*WP2SumSquaredErrorHalfBlock)(
     const int16_t* src1, uint32_t step1, const int16_t* src2, uint32_t step2,
     uint32_t w, uint32_t h);

 // NOTE(review): presumably sets up the WP2SumSquaredError* pointers above —
 // confirm.
 void WP2PSNRInit();

 //------------------------------------------------------------------------------
 // Rasterization / Preview

 // Gradient rasterization helpers.
 namespace WP2 {

 constexpr uint32_t kRasterPrec = 16;   // gradient's fixed-point precision
 typedef int32_t grad_t;    // type for holding values in fixed-precision

 // compute cur[] at position 'dX' and update grads[]
 extern void (*RasterAdvance)(grad_t grads[3 * 4], grad_t dX, grad_t cur[4]);

 // compute squared error on span
 // The result goes to loss[], one entry for each of the 4 values.
 extern void (*RasterLoss)(const uint8_t* src, uint32_t size,
                           const grad_t value[4], const grad_t gradient[4],
                           uint32_t loss[4]);
 // draw gradient on span
 extern void (*RasterDraw)(const grad_t value[4], const grad_t gradient[4],
                           uint8_t* dst, uint32_t size);

 // NOTE(review): presumably sets up the Raster* pointers above — confirm.
 void RasterInit();

 }  // namespace WP2

 #endif  /* WP2_DSP_DSP_H_ */