/* blob: 24932b585d28d570ac2eed95b52f0cf33bf8c441 */
/******************************************************************************
*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* isvcd_pred_residual_recon_neonintr.c
*
* @brief
* Contains definition of functions for h264 inverse quantization inverse
* transformation and resd comp
*
* @author
* Kishore
*
* @par List of Functions:
* - isvcd_pred_residual_recon_16x16_neonintr()
* - isvcd_pred_residual_recon_8x8_neonintr()
* - isvcd_pred_residual_recon_4x4_neonintr()
* - isvcd_pred_residual_recon_chroma_4x4_neonintr()
* - isvcd_pred_residual_recon_chroma_8x8_neonintr()
* - isvcd_residual_luma_4x4_neonintr()
* - isvcd_residual_luma_8x8_neonintr()
* - isvcd_residual_luma_16x16_neonintr()
* - isvcd_residual_chroma_cb_cr_8x8_neonintr()
*
* @remarks
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <string.h>
#include <arm_neon.h>
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_pred_residual_recon.h"
/*****************************************************************************/
/* */
/* Function Name : isvcd_pred_residual_recon_4x4_neonintr */
/* */
/* Description : this function computes the recon data from the */
/* pred and residual buffer */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
WORD32 isvcd_pred_residual_recon_4x4_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                              WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    /* Reconstructs a 4x4 block as out = clip_u8(pred + residual) and returns
       1 if any of the 16 residual samples is non-zero, else 0.
       NOTE(review): each residual load reads a full 8-lane row; only the low
       4 lanes belong to this block, matching the 4-byte lane store below. */
    int16x8_t resd_r0, resd_r1, resd_r2, resd_r3;
    int16x8_t sum_r0, sum_r1, sum_r2, sum_r3;
    int16x4_t nz_or;
    WORD32 i4_nnz;

    resd_r0 = vld1q_s16((int16_t *) pi2_rsd);
    resd_r1 = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd_r2 = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd_r3 = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    /* OR the low (first four samples) halves of all four rows together:
       the result is non-zero iff at least one residual sample is non-zero */
    nz_or = vorr_s16(vorr_s16(vget_low_s16(resd_r0), vget_low_s16(resd_r1)),
                     vorr_s16(vget_low_s16(resd_r2), vget_low_s16(resd_r3)));
    i4_nnz = (vget_lane_s64(vreinterpret_s64_s16(nz_or), 0) != 0);

    /* Widen the prediction rows to s16, add the residual, then narrow back
       with unsigned saturation (clips to [0, 255]) */
    sum_r0 = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t *) pu1_pred))), resd_r0);
    sum_r1 = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t *) pu1_pred + (pred_strd * 1)))), resd_r1);
    sum_r2 = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t *) pu1_pred + (pred_strd * 2)))), resd_r2);
    sum_r3 = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3)))), resd_r3);

    /* Store only the first four reconstructed pixels of each row */
    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(vqmovun_s16(sum_r0)), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd), vreinterpret_u32_u8(vqmovun_s16(sum_r1)), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 2), vreinterpret_u32_u8(vqmovun_s16(sum_r2)),
                  0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 3), vreinterpret_u32_u8(vqmovun_s16(sum_r3)),
                  0);
    return i4_nnz;
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_pred_residual_recon_8x8_neonintr */
/* */
/* Description : this function computes the recon data from the */
/* pred and residual buffer */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
WORD32 isvcd_pred_residual_recon_8x8_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                              WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    /* Reconstructs an 8x8 block as out = clip_u8(pred + residual).
       Returns an nnz bitmap with one bit per 4x4 sub-block:
       bit 0 = top-left, bit 1 = top-right, bit 4 = bottom-left,
       bit 5 = bottom-right (the shift pattern used by the callers). */
    int16x8_t resd_rows[8];
    int16x4_t nz_or;
    WORD32 i4_row;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    for(i4_row = 0; i4_row < 8; i4_row++)
    {
        resd_rows[i4_row] = vld1q_s16((int16_t *) pi2_rsd + (i4_row * rsd_strd));
    }

    /* A 4x4 sub-block has coefficients iff the OR of its sixteen residual
       samples is non-zero; each sub-block is four row-halves of 4 samples */
    nz_or = vorr_s16(vorr_s16(vget_low_s16(resd_rows[0]), vget_low_s16(resd_rows[1])),
                     vorr_s16(vget_low_s16(resd_rows[2]), vget_low_s16(resd_rows[3])));
    nnz_b0 = (vget_lane_s64(vreinterpret_s64_s16(nz_or), 0) != 0);

    nz_or = vorr_s16(vorr_s16(vget_high_s16(resd_rows[0]), vget_high_s16(resd_rows[1])),
                     vorr_s16(vget_high_s16(resd_rows[2]), vget_high_s16(resd_rows[3])));
    nnz_b1 = (vget_lane_s64(vreinterpret_s64_s16(nz_or), 0) != 0);

    nz_or = vorr_s16(vorr_s16(vget_low_s16(resd_rows[4]), vget_low_s16(resd_rows[5])),
                     vorr_s16(vget_low_s16(resd_rows[6]), vget_low_s16(resd_rows[7])));
    nnz_b2 = (vget_lane_s64(vreinterpret_s64_s16(nz_or), 0) != 0);

    nz_or = vorr_s16(vorr_s16(vget_high_s16(resd_rows[4]), vget_high_s16(resd_rows[5])),
                     vorr_s16(vget_high_s16(resd_rows[6]), vget_high_s16(resd_rows[7])));
    nnz_b3 = (vget_lane_s64(vreinterpret_s64_s16(nz_or), 0) != 0);

    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    /* Widen pred to s16, add residual, narrow with unsigned saturation */
    for(i4_row = 0; i4_row < 8; i4_row++)
    {
        uint8x8_t pred_u8 = vld1_u8((uint8_t *) pu1_pred + (i4_row * pred_strd));
        int16x8_t sum = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(pred_u8)), resd_rows[i4_row]);
        vst1_u8((uint8_t *) (pu1_out + (i4_row * out_strd)), vqmovun_s16(sum));
    }
    return nnz;
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_pred_residual_recon_16x16_neonintr */
/* */
/* Description : this function computes the recon data from the */
/* pred and residual buffer */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
WORD32 isvcd_pred_residual_recon_16x16_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                                WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    /* Reconstructs a 16x16 block as out = clip_u8(pred + residual).
       Returns a 16-bit nnz map with one bit per 4x4 sub-block. The bit
       index of each sub-block follows the 8x8-grouped numbering used by
       the unrolled kernels: quadrant TL uses bits {0,1,4,5}, TR {2,3,6,7},
       BL {8,9,12,13}, BR {10,11,14,15}; inside a quadrant the order is
       TL, TR, BL, BR. */
    static const WORD32 ai4_nnz_shift[4][4] = {{0, 1, 4, 5},
                                               {2, 3, 6, 7},
                                               {8, 9, 12, 13},
                                               {10, 11, 14, 15}};
    WORD32 i4_nnz = 0;
    WORD32 i4_quad;

    for(i4_quad = 0; i4_quad < 4; i4_quad++)
    {
        WORD32 i4_x = (i4_quad & 1) << 3;  /* column offset of quadrant: 0 or 8 */
        WORD32 i4_y = (i4_quad >> 1) << 3; /* row offset of quadrant: 0 or 8 */
        UWORD8 *pu1_pred_ptr = pu1_pred + (i4_y * pred_strd) + i4_x;
        WORD16 *pi2_rsd_ptr = pi2_rsd + (i4_y * rsd_strd) + i4_x;
        UWORD8 *pu1_out_ptr = pu1_out + (i4_y * out_strd) + i4_x;
        int16x8_t resd_rows[8];
        int16x4_t nz_or;
        WORD32 i4_row, i4_blk;

        for(i4_row = 0; i4_row < 8; i4_row++)
        {
            resd_rows[i4_row] = vld1q_s16((int16_t *) pi2_rsd_ptr + (i4_row * rsd_strd));
        }

        /* Per 4x4 sub-block nnz: the OR of its sixteen residual samples is
           non-zero iff at least one coefficient is non-zero. Sub-block i4_blk
           covers rows i4_r..i4_r+3 and the low (even i4_blk) or high (odd
           i4_blk) half of each residual row. */
        for(i4_blk = 0; i4_blk < 4; i4_blk++)
        {
            WORD32 i4_r = (i4_blk >> 1) << 2; /* first row of the 4x4: 0 or 4 */
            if(i4_blk & 1)
            {
                nz_or = vorr_s16(
                    vorr_s16(vget_high_s16(resd_rows[i4_r]), vget_high_s16(resd_rows[i4_r + 1])),
                    vorr_s16(vget_high_s16(resd_rows[i4_r + 2]),
                             vget_high_s16(resd_rows[i4_r + 3])));
            }
            else
            {
                nz_or = vorr_s16(
                    vorr_s16(vget_low_s16(resd_rows[i4_r]), vget_low_s16(resd_rows[i4_r + 1])),
                    vorr_s16(vget_low_s16(resd_rows[i4_r + 2]),
                             vget_low_s16(resd_rows[i4_r + 3])));
            }
            if(vget_lane_s64(vreinterpret_s64_s16(nz_or), 0) != 0)
            {
                i4_nnz |= (1 << ai4_nnz_shift[i4_quad][i4_blk]);
            }
        }

        /* Widen pred to s16, add residual, narrow with unsigned saturation */
        for(i4_row = 0; i4_row < 8; i4_row++)
        {
            uint8x8_t pred_u8 = vld1_u8((uint8_t *) pu1_pred_ptr + (i4_row * pred_strd));
            int16x8_t sum = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(pred_u8)), resd_rows[i4_row]);
            vst1_u8((uint8_t *) (pu1_out_ptr + (i4_row * out_strd)), vqmovun_s16(sum));
        }
    }
    return i4_nnz;
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_pred_residual_recon_chroma_8x8_neonintr */
/* */
/* Description : this function computes the recon data from the */
/* pred and residual buffer */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
void isvcd_pred_residual_recon_chroma_8x8_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd,
                                                   UWORD8 *pu1_out, WORD32 pred_strd,
                                                   WORD32 rsd_strd, WORD32 out_strd)
{
    /* Reconstructs one interleaved chroma plane of an 8x8 block:
       out = clip_u8(pred + residual), written only to the even byte lanes of the
       interleaved CbCr output; the odd lanes (other chroma plane) are preserved.
       The 8x8 block spans 16 interleaved bytes per row, processed as two halves
       of 8 bytes (4 chroma pairs) each. */

    /* Byte pattern ff,00,ff,00,... (from LE u16 0x00ff): vbsl keeps the
       reconstructed value at even byte positions and the pre-existing output
       byte (the other interleaved plane) at odd positions */
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    uint8x8_t pred_in, recon_u8, out_u8;
    int16x8_t pred_s16, resd_in;
    WORD32 i4_col_off, i4_row;

    /* Two column halves: bytes [0..7] and [8..15] of each interleaved row */
    for(i4_col_off = 0; i4_col_off < 16; i4_col_off += 8)
    {
        UWORD8 *pu1_pred_ptr = pu1_pred + i4_col_off;
        WORD16 *pi2_rsd_ptr = pi2_rsd + i4_col_off; /* residual stride is in WORD16 units */
        UWORD8 *pu1_out_ptr = pu1_out + i4_col_off;

        for(i4_row = 0; i4_row < 8; i4_row++)
        {
            /* widen 8 pred bytes to s16, add residual, saturate back to u8 */
            pred_in = vld1_u8((uint8_t *) pu1_pred_ptr + (i4_row * pred_strd));
            pred_s16 = vreinterpretq_s16_u16(vmovl_u8(pred_in));
            resd_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (i4_row * rsd_strd));
            pred_s16 = vaddq_s16(pred_s16, resd_in);
            recon_u8 = vqmovun_s16(pred_s16);

            /* read-modify-write so the interleaved sibling plane is untouched */
            out_u8 = vld1_u8((uint8_t *) pu1_out_ptr + (i4_row * out_strd));
            out_u8 = vbsl_u8(chroma_mask_8x8, recon_u8, out_u8);
            vst1_u8((uint8_t *) (pu1_out_ptr + (i4_row * out_strd)), out_u8);
        }
    }
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_pred_residual_recon_chroma_4x4_neonintr */
/* */
/* Description : this function computes the recon data from the */
/* pred and residual buffer */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
void isvcd_pred_residual_recon_chroma_4x4_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd,
                                                   UWORD8 *pu1_out, WORD32 pred_strd,
                                                   WORD32 rsd_strd, WORD32 out_strd)
{
    /* Reconstructs one interleaved chroma plane of a 4x4 block:
       out = clip_u8(pred + residual), written only to the even byte lanes of
       the interleaved CbCr output; odd lanes (other plane) are preserved.
       Each row covers 8 interleaved bytes (4 chroma pairs). */

    /* Byte pattern ff,00,ff,00,... (from LE u16 0x00ff): vbsl keeps the
       reconstruction at even bytes, the existing output at odd bytes */
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    uint8x8_t pred_in, recon_u8, out_u8;
    int16x8_t pred_s16, resd_in;
    WORD32 i4_row;

    for(i4_row = 0; i4_row < 4; i4_row++)
    {
        /* widen 8 pred bytes to s16, add residual, saturate back to u8 */
        pred_in = vld1_u8((uint8_t *) pu1_pred + (i4_row * pred_strd));
        pred_s16 = vreinterpretq_s16_u16(vmovl_u8(pred_in));
        resd_in = vld1q_s16((int16_t *) pi2_rsd + (i4_row * rsd_strd));
        pred_s16 = vaddq_s16(pred_s16, resd_in);
        recon_u8 = vqmovun_s16(pred_s16);

        /* read-modify-write so the interleaved sibling plane is untouched */
        out_u8 = vld1_u8((uint8_t *) pu1_out + (i4_row * out_strd));
        out_u8 = vbsl_u8(chroma_mask_8x8, recon_u8, out_u8);
        vst1_u8((uint8_t *) (pu1_out + (i4_row * out_strd)), out_u8);
    }
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_residual_luma_4x4_neonintr */
/* */
/* Description : this function computes the nnz from resd */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
WORD32 isvcd_residual_luma_4x4_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    /* Returns 1 if any of the 16 residual coefficients of the 4x4 block is
       non-zero, else 0. */
    int16x4_t row0, row1, row2, row3;
    int16x8_t abs_sum;
    uint64x2_t abs_u64;

    row0 = vld1_s16((int16_t *) pi2_rsd);
    row1 = vld1_s16((int16_t *) pi2_rsd + rsd_strd);
    row2 = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    row3 = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    /* Saturating add of absolute values: a lane is non-zero iff at least one
       of the corresponding coefficients is non-zero (saturation prevents
       wrap-around back to zero) */
    abs_sum = vqaddq_s16(vabsq_s16(vcombine_s16(row0, row1)),
                         vabsq_s16(vcombine_s16(row2, row3)));

    /* Any non-zero lane => non-zero 64-bit half */
    abs_u64 = vreinterpretq_u64_s16(abs_sum);
    return ((vgetq_lane_u64(abs_u64, 0) | vgetq_lane_u64(abs_u64, 1)) != 0);
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_residual_luma_8x8_neonintr */
/* */
/* Description : this function computes the nnz from resd */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
WORD32 isvcd_residual_luma_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    /* Computes a non-zero flag per 4x4 sub-block of the 8x8 residual and packs
       them as: b0 (top-left) at bit 0, b1 (top-right) at bit 1, b2
       (bottom-left) at bit 4, b3 (bottom-right) at bit 5. */
    int16x4_t lo[8], hi[8];
    int16x8_t sum_b0, sum_b1, sum_b2, sum_b3;
    uint64x2_t u64_b0, u64_b1, u64_b2, u64_b3;
    WORD32 nnz_b0, nnz_b1, nnz_b2, nnz_b3;
    WORD32 i;

    /* Split every residual row into its left (lo) and right (hi) 4 coeffs */
    for(i = 0; i < 8; i++)
    {
        int16x8_t row = vld1q_s16((int16_t *) pi2_rsd + (i * rsd_strd));
        lo[i] = vget_low_s16(row);
        hi[i] = vget_high_s16(row);
    }

    /* Saturating |a|+|b| per 4x4 sub-block: a lane is non-zero iff either
       source coefficient is non-zero (saturation avoids wrap to zero) */
    sum_b0 = vqaddq_s16(vabsq_s16(vcombine_s16(lo[0], lo[1])),
                        vabsq_s16(vcombine_s16(lo[2], lo[3])));
    sum_b1 = vqaddq_s16(vabsq_s16(vcombine_s16(hi[0], hi[1])),
                        vabsq_s16(vcombine_s16(hi[2], hi[3])));
    sum_b2 = vqaddq_s16(vabsq_s16(vcombine_s16(lo[4], lo[5])),
                        vabsq_s16(vcombine_s16(lo[6], lo[7])));
    sum_b3 = vqaddq_s16(vabsq_s16(vcombine_s16(hi[4], hi[5])),
                        vabsq_s16(vcombine_s16(hi[6], hi[7])));

    /* Any non-zero lane => non-zero 64-bit half */
    u64_b0 = vreinterpretq_u64_s16(sum_b0);
    u64_b1 = vreinterpretq_u64_s16(sum_b1);
    u64_b2 = vreinterpretq_u64_s16(sum_b2);
    u64_b3 = vreinterpretq_u64_s16(sum_b3);
    nnz_b0 = ((vgetq_lane_u64(u64_b0, 0) | vgetq_lane_u64(u64_b0, 1)) != 0);
    nnz_b1 = ((vgetq_lane_u64(u64_b1, 0) | vgetq_lane_u64(u64_b1, 1)) != 0);
    nnz_b2 = ((vgetq_lane_u64(u64_b2, 0) | vgetq_lane_u64(u64_b2, 1)) != 0);
    nnz_b3 = ((vgetq_lane_u64(u64_b3, 0) | vgetq_lane_u64(u64_b3, 1)) != 0);

    return (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_residual_luma_16x16_neonintr */
/* */
/* Description : this function computes the nnz from resd */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
/* Non-zero flag of one 4x4 sub-block given its four 4-coefficient rows.
   Saturating |a|+|b|: a lane is non-zero iff at least one contributing
   coefficient is non-zero (saturation prevents wrap-around back to zero). */
static WORD32 isvcd_nnz_4x4_blk_neonintr(int16x4_t r0, int16x4_t r1, int16x4_t r2, int16x4_t r3)
{
    int16x8_t abs_sum = vqaddq_s16(vabsq_s16(vcombine_s16(r0, r1)),
                                   vabsq_s16(vcombine_s16(r2, r3)));
    uint64x2_t abs_u64 = vreinterpretq_u64_s16(abs_sum);
    /* any non-zero lane => non-zero 64-bit half */
    return ((vgetq_lane_u64(abs_u64, 0) | vgetq_lane_u64(abs_u64, 1)) != 0);
}

/* nnz bits of one 8x8 quadrant packed as b0 | (b1 << 1) | (b2 << 4) | (b3 << 5),
   where b0..b3 are the top-left, top-right, bottom-left, bottom-right 4x4
   sub-blocks of the quadrant. pi2_rsd points to the quadrant origin; rsd_strd
   is the residual stride in WORD16 units. */
static WORD32 isvcd_nnz_quad_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x4_t lo[8], hi[8];
    WORD32 i, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    /* split every residual row into its left (lo) and right (hi) 4 coeffs */
    for(i = 0; i < 8; i++)
    {
        int16x8_t row = vld1q_s16((int16_t *) pi2_rsd + (i * rsd_strd));
        lo[i] = vget_low_s16(row);
        hi[i] = vget_high_s16(row);
    }
    nnz_b0 = isvcd_nnz_4x4_blk_neonintr(lo[0], lo[1], lo[2], lo[3]);
    nnz_b1 = isvcd_nnz_4x4_blk_neonintr(hi[0], hi[1], hi[2], hi[3]);
    nnz_b2 = isvcd_nnz_4x4_blk_neonintr(lo[4], lo[5], lo[6], lo[7]);
    nnz_b3 = isvcd_nnz_4x4_blk_neonintr(hi[4], hi[5], hi[6], hi[7]);
    return (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));
}

WORD32 isvcd_residual_luma_16x16_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    /* Builds the 16-bit nnz mask of a 16x16 residual, one bit per 4x4
       sub-block. Each 8x8 quadrant contributes 4 bits; the per-quadrant bit
       positions {0,1,4,5} are shifted to the quadrant's base:
       top-left -> bits 0,1,4,5; top-right -> 2,3,6,7;
       bottom-left -> 8,9,12,13; bottom-right -> 10,11,14,15. */
    WORD32 nnz;
    nnz = isvcd_nnz_quad_8x8_neonintr(pi2_rsd, rsd_strd);
    nnz |= isvcd_nnz_quad_8x8_neonintr(pi2_rsd + 8, rsd_strd) << 2;
    nnz |= isvcd_nnz_quad_8x8_neonintr(pi2_rsd + (rsd_strd * 8), rsd_strd) << 8;
    nnz |= isvcd_nnz_quad_8x8_neonintr(pi2_rsd + (rsd_strd * 8) + 8, rsd_strd) << 10;
    return nnz;
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_residual_chroma_cb_cr_8x8_neonintr */
/* */
/* Description : this function computes the nnz from resd */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : nnz */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/
WORD32 isvcd_residual_chroma_cb_cr_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
int16x8x2_t resd0_in, resd1_in, resd2_in, resd3_in;
int16x8x2_t resd4_in, resd5_in, resd6_in, resd7_in;
int64x2_t resd0_cr_64x2, resd1_cr_64x2, resd2_cr_64x2, resd3_cr_64x2, resd4_cr_64x2,
resd5_cr_64x2, resd6_cr_64x2, resd7_cr_64x2;
int16x8_t resd_b0_r01_cr;
int16x8_t resd_b0_r23_cr;
int16x8_t resd_b1_r01_cr;
int16x8_t resd_b1_r23_cr;
int16x8_t resd_b2_r45_cr;
int16x8_t resd_b2_r67_cr;
int16x8_t resd_b3_r45_cr;
int16x8_t resd_b3_r67_cr;
int64x2_t resd0_cb_64x2, resd1_cb_64x2, resd2_cb_64x2, resd3_cb_64x2, resd4_cb_64x2,
resd5_cb_64x2, resd6_cb_64x2, resd7_cb_64x2;
int16x8_t resd_b0_r01_cb;
int16x8_t resd_b0_r23_cb;
int16x8_t resd_b1_r01_cb;
int16x8_t resd_b1_r23_cb;
int16x8_t resd_b2_r45_cb;
int16x8_t resd_b2_r67_cb;
int16x8_t resd_b3_r45_cb;
int16x8_t resd_b3_r67_cb;
WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
int16x8_t dup_val_1, dup_val_2, dup_abs;
resd0_in = vld2q_s16((int16_t *) pi2_rsd);
resd1_in = vld2q_s16((int16_t *) pi2_rsd + rsd_strd);
resd2_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
resd3_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
resd4_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
resd5_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
resd6_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
resd7_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));
resd0_cb_64x2 = vreinterpretq_s64_s16(resd0_in.val[0]);
resd1_cb_64x2 = vreinterpretq_s64_s16(resd1_in.val[0]);
resd2_cb_64x2 = vreinterpretq_s64_s16(resd2_in.val[0]);
resd3_cb_64x2 = vreinterpretq_s64_s16(resd3_in.val[0]);
resd4_cb_64x2 = vreinterpretq_s64_s16(resd4_in.val[0]);
resd5_cb_64x2 = vreinterpretq_s64_s16(resd5_in.val[0]);
resd6_cb_64x2 = vreinterpretq_s64_s16(resd6_in.val[0]);
resd7_cb_64x2 = vreinterpretq_s64_s16(resd7_in.val[0]);
resd_b0_r01_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd0_cb_64x2), vget_low_s64(resd1_cb_64x2)));
resd_b0_r23_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd2_cb_64x2), vget_low_s64(resd3_cb_64x2)));
resd_b1_r01_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd0_cb_64x2), vget_high_s64(resd1_cb_64x2)));
resd_b1_r23_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd2_cb_64x2), vget_high_s64(resd3_cb_64x2)));
resd_b2_r45_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd4_cb_64x2), vget_low_s64(resd5_cb_64x2)));
resd_b2_r67_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd6_cb_64x2), vget_low_s64(resd7_cb_64x2)));
resd_b3_r45_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd4_cb_64x2), vget_high_s64(resd5_cb_64x2)));
resd_b3_r67_cb = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd6_cb_64x2), vget_high_s64(resd7_cb_64x2)));
resd0_cr_64x2 = vreinterpretq_s64_s16(resd0_in.val[1]);
resd1_cr_64x2 = vreinterpretq_s64_s16(resd1_in.val[1]);
resd2_cr_64x2 = vreinterpretq_s64_s16(resd2_in.val[1]);
resd3_cr_64x2 = vreinterpretq_s64_s16(resd3_in.val[1]);
resd4_cr_64x2 = vreinterpretq_s64_s16(resd4_in.val[1]);
resd5_cr_64x2 = vreinterpretq_s64_s16(resd5_in.val[1]);
resd6_cr_64x2 = vreinterpretq_s64_s16(resd6_in.val[1]);
resd7_cr_64x2 = vreinterpretq_s64_s16(resd7_in.val[1]);
resd_b0_r01_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd0_cr_64x2), vget_low_s64(resd1_cr_64x2)));
resd_b0_r23_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd2_cr_64x2), vget_low_s64(resd3_cr_64x2)));
resd_b1_r01_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd0_cr_64x2), vget_high_s64(resd1_cr_64x2)));
resd_b1_r23_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd2_cr_64x2), vget_high_s64(resd3_cr_64x2)));
resd_b2_r45_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd4_cr_64x2), vget_low_s64(resd5_cr_64x2)));
resd_b2_r67_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_low_s64(resd6_cr_64x2), vget_low_s64(resd7_cr_64x2)));
resd_b3_r45_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd4_cr_64x2), vget_high_s64(resd5_cr_64x2)));
resd_b3_r67_cr = vreinterpretq_s16_s64(
vcombine_s64(vget_high_s64(resd6_cr_64x2), vget_high_s64(resd7_cr_64x2)));
dup_val_1 = vabsq_s16(resd_b0_r01_cr);
dup_val_2 = vabsq_s16(resd_b0_r23_cr);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
dup_val_1 = vabsq_s16(resd_b1_r01_cr);
dup_val_2 = vabsq_s16(resd_b1_r23_cr);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
dup_val_1 = vabsq_s16(resd_b2_r45_cr);
dup_val_2 = vabsq_s16(resd_b2_r67_cr);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
dup_val_1 = vabsq_s16(resd_b3_r45_cr);
dup_val_2 = vabsq_s16(resd_b3_r67_cr);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
nnz = ((nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 2) | (nnz_b3 << 3)) << 4);
dup_val_1 = vabsq_s16(resd_b0_r01_cb);
dup_val_2 = vabsq_s16(resd_b0_r23_cb);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
dup_val_1 = vabsq_s16(resd_b1_r01_cb);
dup_val_2 = vabsq_s16(resd_b1_r23_cb);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
dup_val_1 = vabsq_s16(resd_b2_r45_cb);
dup_val_2 = vabsq_s16(resd_b2_r67_cb);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
dup_val_1 = vabsq_s16(resd_b3_r45_cb);
dup_val_2 = vabsq_s16(resd_b3_r67_cb);
dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
dup_abs[6] || dup_abs[7];
nnz |= ((nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 2) | (nnz_b3 << 3)));
return nnz;
}