src/dsp/arm/obmc_neon.cc - codecs/libgav1 - Git at Google

 // Copyright 2019 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/dsp.h"
 #include "src/dsp/obmc.h"

 #if LIBGAV1_ENABLE_NEON

 #include <arm_neon.h>

 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>

 #include "src/dsp/arm/common_neon.h"
 #include "src/utils/common.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 #include "src/dsp/obmc.inc"

 // Copies two bytes from |src| into the low 16 bits of a 64-bit vector. The
 // value is widened through vcreate_u16(), so the remaining lanes are zero.
 inline uint8x8_t Load2(const uint8_t* src) {
   uint16_t packed;
   memcpy(&packed, src, sizeof(packed));
   return vreinterpret_u8_u16(vcreate_u16(packed));
 }

 template <int lane>
 inline void StoreLane2(uint8_t* dst, uint8x8_t src) {
   const uint16_t out_val = vget_lane_u16(vreinterpret_u16_u8(src), lane);
   memcpy(dst, &out_val, 2);
 }

 inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
                            const uint8x8_t pred_mask,
                            const uint8x8_t obmc_pred_mask) {
   const uint8x8_t pred_val = LoadLo4(pred, vdup_n_u8(0));
   const uint8x8_t obmc_pred_val = LoadLo4(obmc_pred, vdup_n_u8(0));
   const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
   const uint8x8_t result =
       vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
   StoreLo4(pred, result);
 }

 template <bool from_left>
 inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
                                  const ptrdiff_t prediction_stride,
                                  const int height,
                                  const uint8_t* const obmc_prediction,
                                  const ptrdiff_t obmc_prediction_stride) {
   uint8_t* pred = prediction;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8_t* obmc_pred = obmc_prediction;
   uint8x8_t pred_mask;
   uint8x8_t obmc_pred_mask;
   int compute_height;
   const int mask_offset = height - 2;
   if (from_left) {
     pred_mask = Load2(kObmcMask);
     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     compute_height = height;
   } else {
     // Weights for the last line are all 64, which is a no-op.
     compute_height = height - 1;
   }
   int y = 0;
   do {
     if (!from_left) {
       pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]);
       obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     }
     const uint8x8_t pred_val = Load2(pred);
     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
     const uint8x8_t obmc_pred_val = Load2(obmc_pred);
     const uint8x8_t result =
         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
     StoreLane2<0>(pred, result);

     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
   } while (++y != compute_height);
 }

 inline void OverlapBlendFromLeft4xH_NEON(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;

   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8x8_t pred_mask = LoadLo4(kObmcMask + 2, vdup_n_u8(0));
   // 64 - mask
   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   int y = 0;
   do {
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     y += 2;
   } while (y != height);
 }

 inline void OverlapBlendFromLeft8xH_NEON(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
   // 64 - mask
   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   int y = 0;
   do {
     const uint8x8_t pred_val = vld1_u8(pred);
     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
     const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
     const uint8x8_t result =
         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);

     vst1_u8(pred, result);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
   } while (++y != height);
 }

 void OverlapBlendFromLeft_NEON(void* const prediction,
                                const ptrdiff_t prediction_stride,
                                const int width, const int height,
                                const void* const obmc_prediction,
                                const ptrdiff_t obmc_prediction_stride) {
   auto* pred = static_cast<uint8_t*>(prediction);
   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);

   if (width == 2) {
     OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred,
                                obmc_prediction_stride);
     return;
   }
   if (width == 4) {
     OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
                                  obmc_prediction_stride);
     return;
   }
   if (width == 8) {
     OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
                                  obmc_prediction_stride);
     return;
   }
   const uint8x16_t mask_inverter = vdupq_n_u8(64);
   const uint8_t* mask = kObmcMask + width - 2;
   int x = 0;
   do {
     pred = static_cast<uint8_t*>(prediction) + x;
     obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
     const uint8x16_t pred_mask = vld1q_u8(mask + x);
     // 64 - mask
     const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
     int y = 0;
     do {
       const uint8x16_t pred_val = vld1q_u8(pred);
       const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
       const uint16x8_t weighted_pred_lo =
           vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
       const uint8x8_t result_lo =
           vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
                                 vget_low_u8(obmc_pred_val)),
                        6);
       const uint16x8_t weighted_pred_hi =
           vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
       const uint8x8_t result_hi =
           vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
                                 vget_high_u8(obmc_pred_val)),
                        6);
       vst1q_u8(pred, vcombine_u8(result_lo, result_hi));

       pred += prediction_stride;
       obmc_pred += obmc_prediction_stride;
     } while (++y < height);
     x += 16;
   } while (x < width);
 }

 inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
                                         const ptrdiff_t prediction_stride,
                                         const uint8_t* const obmc_prediction,
                                         const ptrdiff_t obmc_prediction_stride,
                                         const int height) {
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
   pred += prediction_stride;
   obmc_pred += obmc_prediction_stride;

   if (height == 2) {
     return;
   }

   pred_mask = vdup_n_u8(kObmcMask[3]);
   obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
   pred += prediction_stride;
   obmc_pred += obmc_prediction_stride;

   pred_mask = vdup_n_u8(kObmcMask[4]);
   obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
 }

 inline void OverlapBlendFromTop4xH_NEON(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   if (height < 8) {
     OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction,
                                 obmc_prediction_stride, height);
     return;
   }
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const uint8_t* mask = kObmcMask + height - 2;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   int y = 0;
   // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
   // lines are unchanged as the corresponding mask value is 64.
   do {
     uint8x8_t pred_mask = vdup_n_u8(mask[y]);
     uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     pred_mask = vdup_n_u8(mask[y + 1]);
     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     pred_mask = vdup_n_u8(mask[y + 2]);
     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     pred_mask = vdup_n_u8(mask[y + 3]);
     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     pred_mask = vdup_n_u8(mask[y + 4]);
     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     pred_mask = vdup_n_u8(mask[y + 5]);
     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;

     // Increment for the right mask index.
     y += 6;
   } while (y < height - 4);
 }

 inline void OverlapBlendFromTop8xH_NEON(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8_t* mask = kObmcMask + height - 2;
   const int compute_height = height - (height >> 2);
   int y = 0;
   do {
     const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
     // 64 - mask
     const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     const uint8x8_t pred_val = vld1_u8(pred);
     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
     const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
     const uint8x8_t result =
         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);

     vst1_u8(pred, result);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
   } while (++y != compute_height);
 }

 void OverlapBlendFromTop_NEON(void* const prediction,
                               const ptrdiff_t prediction_stride,
                               const int width, const int height,
                               const void* const obmc_prediction,
                               const ptrdiff_t obmc_prediction_stride) {
   auto* pred = static_cast<uint8_t*>(prediction);
   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);

   if (width == 2) {
     OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred,
                                 obmc_prediction_stride);
     return;
   }
   if (width == 4) {
     OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
                                 obmc_prediction_stride);
     return;
   }

   if (width == 8) {
     OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
                                 obmc_prediction_stride);
     return;
   }

   const uint8_t* mask = kObmcMask + height - 2;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   // Stop when mask value becomes 64. This is inferred for 4xH.
   const int compute_height = height - (height >> 2);
   int y = 0;
   do {
     const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
     // 64 - mask
     const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
     int x = 0;
     do {
       const uint8x16_t pred_val = vld1q_u8(pred + x);
       const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
       const uint16x8_t weighted_pred_lo =
           vmull_u8(pred_mask, vget_low_u8(pred_val));
       const uint8x8_t result_lo =
           vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
                                 vget_low_u8(obmc_pred_val)),
                        6);
       const uint16x8_t weighted_pred_hi =
           vmull_u8(pred_mask, vget_high_u8(pred_val));
       const uint8x8_t result_hi =
           vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
                                 vget_high_u8(obmc_pred_val)),
                        6);
       vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));

       x += 16;
     } while (x < width);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
   } while (++y < compute_height);
 }

 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
 }

 }  // namespace

 // Registers the NEON OBMC implementations; only the 8bpp table is populated.
 void ObmcInit_NEON() {
   Init8bpp();
 }

 }  // namespace dsp
 }  // namespace libgav1

 #else   // !LIBGAV1_ENABLE_NEON

 namespace libgav1 {
 namespace dsp {

 // Built when LIBGAV1_ENABLE_NEON is off: nothing to register, so this is a
 // no-op that keeps the symbol defined.
 void ObmcInit_NEON() {
 }

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON
	// Copyright 2019 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/dsp.h"
	#include "src/dsp/obmc.h"

	#if LIBGAV1_ENABLE_NEON

	#include <arm_neon.h>

	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>

	#include "src/dsp/arm/common_neon.h"
	#include "src/utils/common.h"

	namespace libgav1 {
	namespace dsp {
	namespace {

	#include "src/dsp/obmc.inc"

	// Copies two bytes from |src| into the low 16 bits of a 64-bit vector. The
	// value is widened through vcreate_u16(), so the remaining lanes are zero.
	inline uint8x8_t Load2(const uint8_t* src) {
	  uint16_t packed;
	  memcpy(&packed, src, sizeof(packed));
	  return vreinterpret_u8_u16(vcreate_u16(packed));
	}

	template <int lane>
	inline void StoreLane2(uint8_t* dst, uint8x8_t src) {
	const uint16_t out_val = vget_lane_u16(vreinterpret_u16_u8(src), lane);
	memcpy(dst, &out_val, 2);
	}

	inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
	const uint8x8_t pred_mask,
	const uint8x8_t obmc_pred_mask) {
	const uint8x8_t pred_val = LoadLo4(pred, vdup_n_u8(0));
	const uint8x8_t obmc_pred_val = LoadLo4(obmc_pred, vdup_n_u8(0));
	const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
	const uint8x8_t result =
	vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
	StoreLo4(pred, result);
	}

	template <bool from_left>
	inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
	const ptrdiff_t prediction_stride,
	const int height,
	const uint8_t* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	uint8_t* pred = prediction;
	const uint8x8_t mask_inverter = vdup_n_u8(64);
	const uint8_t* obmc_pred = obmc_prediction;
	uint8x8_t pred_mask;
	uint8x8_t obmc_pred_mask;
	int compute_height;
	const int mask_offset = height - 2;
	if (from_left) {
	pred_mask = Load2(kObmcMask);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	compute_height = height;
	} else {
	// Weights for the last line are all 64, which is a no-op.
	compute_height = height - 1;
	}
	int y = 0;
	do {
	if (!from_left) {
	pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	}
	const uint8x8_t pred_val = Load2(pred);
	const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
	const uint8x8_t obmc_pred_val = Load2(obmc_pred);
	const uint8x8_t result =
	vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
	StoreLane2<0>(pred, result);

	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;
	} while (++y != compute_height);
	}

	inline void OverlapBlendFromLeft4xH_NEON(
	uint8_t* const prediction, const ptrdiff_t prediction_stride,
	const int height, const uint8_t* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	uint8_t* pred = prediction;
	const uint8_t* obmc_pred = obmc_prediction;

	const uint8x8_t mask_inverter = vdup_n_u8(64);
	const uint8x8_t pred_mask = LoadLo4(kObmcMask + 2, vdup_n_u8(0));
	// 64 - mask
	const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	int y = 0;
	do {
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	y += 2;
	} while (y != height);
	}

	inline void OverlapBlendFromLeft8xH_NEON(
	uint8_t* const prediction, const ptrdiff_t prediction_stride,
	const int height, const uint8_t* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	uint8_t* pred = prediction;
	const uint8_t* obmc_pred = obmc_prediction;
	const uint8x8_t mask_inverter = vdup_n_u8(64);
	const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
	// 64 - mask
	const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	int y = 0;
	do {
	const uint8x8_t pred_val = vld1_u8(pred);
	const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
	const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
	const uint8x8_t result =
	vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);

	vst1_u8(pred, result);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;
	} while (++y != height);
	}

	void OverlapBlendFromLeft_NEON(void* const prediction,
	const ptrdiff_t prediction_stride,
	const int width, const int height,
	const void* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	auto* pred = static_cast<uint8_t*>(prediction);
	const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);

	if (width == 2) {
	OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred,
	obmc_prediction_stride);
	return;
	}
	if (width == 4) {
	OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
	obmc_prediction_stride);
	return;
	}
	if (width == 8) {
	OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
	obmc_prediction_stride);
	return;
	}
	const uint8x16_t mask_inverter = vdupq_n_u8(64);
	const uint8_t* mask = kObmcMask + width - 2;
	int x = 0;
	do {
	pred = static_cast<uint8_t*>(prediction) + x;
	obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
	const uint8x16_t pred_mask = vld1q_u8(mask + x);
	// 64 - mask
	const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
	int y = 0;
	do {
	const uint8x16_t pred_val = vld1q_u8(pred);
	const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
	const uint16x8_t weighted_pred_lo =
	vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
	const uint8x8_t result_lo =
	vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
	vget_low_u8(obmc_pred_val)),
	6);
	const uint16x8_t weighted_pred_hi =
	vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
	const uint8x8_t result_hi =
	vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
	vget_high_u8(obmc_pred_val)),
	6);
	vst1q_u8(pred, vcombine_u8(result_lo, result_hi));

	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;
	} while (++y < height);
	x += 16;
	} while (x < width);
	}

	inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
	const ptrdiff_t prediction_stride,
	const uint8_t* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride,
	const int height) {
	uint8_t* pred = prediction;
	const uint8_t* obmc_pred = obmc_prediction;
	uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
	const uint8x8_t mask_inverter = vdup_n_u8(64);
	uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	if (height == 2) {
	return;
	}

	pred_mask = vdup_n_u8(kObmcMask[3]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	pred_mask = vdup_n_u8(kObmcMask[4]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	}

	inline void OverlapBlendFromTop4xH_NEON(
	uint8_t* const prediction, const ptrdiff_t prediction_stride,
	const int height, const uint8_t* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	if (height < 8) {
	OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction,
	obmc_prediction_stride, height);
	return;
	}
	uint8_t* pred = prediction;
	const uint8_t* obmc_pred = obmc_prediction;
	const uint8_t* mask = kObmcMask + height - 2;
	const uint8x8_t mask_inverter = vdup_n_u8(64);
	int y = 0;
	// Compute 6 lines for height 8, or 12 lines for height 16. The remaining
	// lines are unchanged as the corresponding mask value is 64.
	do {
	uint8x8_t pred_mask = vdup_n_u8(mask[y]);
	uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	pred_mask = vdup_n_u8(mask[y + 1]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	pred_mask = vdup_n_u8(mask[y + 2]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	pred_mask = vdup_n_u8(mask[y + 3]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	pred_mask = vdup_n_u8(mask[y + 4]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	pred_mask = vdup_n_u8(mask[y + 5]);
	obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;

	// Increment for the right mask index.
	y += 6;
	} while (y < height - 4);
	}

	inline void OverlapBlendFromTop8xH_NEON(
	uint8_t* const prediction, const ptrdiff_t prediction_stride,
	const int height, const uint8_t* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	uint8_t* pred = prediction;
	const uint8_t* obmc_pred = obmc_prediction;
	const uint8x8_t mask_inverter = vdup_n_u8(64);
	const uint8_t* mask = kObmcMask + height - 2;
	const int compute_height = height - (height >> 2);
	int y = 0;
	do {
	const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
	// 64 - mask
	const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	const uint8x8_t pred_val = vld1_u8(pred);
	const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
	const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
	const uint8x8_t result =
	vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);

	vst1_u8(pred, result);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;
	} while (++y != compute_height);
	}

	void OverlapBlendFromTop_NEON(void* const prediction,
	const ptrdiff_t prediction_stride,
	const int width, const int height,
	const void* const obmc_prediction,
	const ptrdiff_t obmc_prediction_stride) {
	auto* pred = static_cast<uint8_t*>(prediction);
	const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);

	if (width == 2) {
	OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred,
	obmc_prediction_stride);
	return;
	}
	if (width == 4) {
	OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
	obmc_prediction_stride);
	return;
	}

	if (width == 8) {
	OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
	obmc_prediction_stride);
	return;
	}

	const uint8_t* mask = kObmcMask + height - 2;
	const uint8x8_t mask_inverter = vdup_n_u8(64);
	// Stop when mask value becomes 64. This is inferred for 4xH.
	const int compute_height = height - (height >> 2);
	int y = 0;
	do {
	const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
	// 64 - mask
	const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
	int x = 0;
	do {
	const uint8x16_t pred_val = vld1q_u8(pred + x);
	const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
	const uint16x8_t weighted_pred_lo =
	vmull_u8(pred_mask, vget_low_u8(pred_val));
	const uint8x8_t result_lo =
	vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
	vget_low_u8(obmc_pred_val)),
	6);
	const uint16x8_t weighted_pred_hi =
	vmull_u8(pred_mask, vget_high_u8(pred_val));
	const uint8x8_t result_hi =
	vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
	vget_high_u8(obmc_pred_val)),
	6);
	vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));

	x += 16;
	} while (x < width);
	pred += prediction_stride;
	obmc_pred += obmc_prediction_stride;
	} while (++y < compute_height);
	}

	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
	assert(dsp != nullptr);
	dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
	dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
	}

	} // namespace

	// Registers the NEON OBMC implementations; only the 8bpp table is populated.
	void ObmcInit_NEON() {
	  Init8bpp();
	}

	} // namespace dsp
	} // namespace libgav1

	#else // !LIBGAV1_ENABLE_NEON

	namespace libgav1 {
	namespace dsp {

	// Built when LIBGAV1_ENABLE_NEON is off: nothing to register, so this is a
	// no-op that keeps the symbol defined.
	void ObmcInit_NEON() {
	}

	} // namespace dsp
	} // namespace libgav1
	#endif // LIBGAV1_ENABLE_NEON