| /****************************************************************************** |
| * |
| * Copyright (C) 2022 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| /** |
| ******************************************************************************* |
| * @file |
| * isvcd_residual_resamp_neonintr.c |
| * |
| * @brief |
| * Contains routines that resample for SVC resampling |
| * |
| * @author |
| * Kishore |
| * |
| * @par List of Functions: |
| * - isvcd_pred_residual_recon_4x4_neonintr() |
| * - isvcd_pred_residual_recon_8x8_neonintr() |
| * - isvcd_pred_residual_recon_16x16_neonintr() |
| * - isvcd_pred_residual_recon_chroma_4x4_neonintr() |
| * - isvcd_pred_residual_recon_chroma_8x8_neonintr() |
| * - isvcd_residual_luma_4x4_neonintr() |
| * - isvcd_residual_luma_8x8_neonintr() |
| * - isvcd_residual_luma_16x16_neonintr() |
| * - isvcd_residual_chroma_cb_cr_8x8_neonintr() |
| * |
| * @remarks |
| * |
| ******************************************************************************* |
| */ |
| |
| /*! |
| ************************************************************************** |
| * \file isvcd_residual_resamp_neonintr.c |
| * |
| * \brief |
| * Contains routines that resample for SVC resampling |
| * |
| * Detailed_description |
| * |
| * \date |
| * |
| * |
| * \author : kishore |
| ************************************************************************** |
| */ |
| #include <assert.h> |
| #include <string.h> |
| #include <arm_neon.h> |
| |
| #include "ih264_typedefs.h" |
| #include "ih264_macros.h" |
| #include "ih264_platform_macros.h" |
| #include "isvcd_structs.h" |
| #include "ih264_debug.h" |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_residual_luma_dyadic_neonintr */ |
| /* */ |
/*  Description   : this function does the upsampling of luma residuals for  */
| /* Dyadic cases */ |
| /* */ |
/*  Inputs        : pv_residual_samp_ctxt : Residual upsampling context      */
/*                  pi2_inp_data : input 16 bit residual data pointer        */
/*                  i4_inp_data_stride : input buffer stride                 */
/*                  pi2_out_res : output 16 bit residual buffer pointer      */
/*                  i4_out_res_stride : output buffer stride                 */
/*                  ps_ref_mb_mode : reference mb mode pointer of base layer */
/*                  u2_mb_x, u2_mb_y : current MB co-ordinates               */
/*                  i4_ref_nnz : reference layer nnz bits for this MB        */
/*                  i4_ref_tx_size : reference layer 8x8 transform size flag */
| /* Globals : none */ |
| /* Processing : it does the upsampling with fixed phase values and */ |
| /* reference layer transform size */ |
| /* Outputs : Upsampled residuals for luma */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 26 05 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
| |
void isvcd_residual_luma_dyadic_neonintr(void *pv_residual_samp_ctxt, WORD16 *pi2_inp_data,
                                         WORD32 i4_inp_data_stride, WORD16 *pi2_out_res,
                                         WORD32 i4_out_res_stride, mem_element_t *ps_ref_mb_mode,
                                         UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_ref_nnz,
                                         WORD32 i4_ref_tx_size)
{
    /* Dyadic (2x) upsampling of an 8x8 base-layer luma residual region into */
    /* a 16x16 enhancement-layer MB.  Two paths are taken:                   */
    /*  - reference 8x8 transform with non-zero coeffs: interpolate the full */
    /*    8x8 region in one pass (separable filter, horizontal first into an */
    /*    intermediate buffer, then vertical into pi2_out_res);              */
    /*  - otherwise: four 4x4 sub-blocks, each processed only when its nnz   */
    /*    bit is set (uncoded sub-blocks leave the output untouched).        */
    /* Both passes use the same kernel: interior output pairs are weighted   */
    /* (3,1)/(1,3) combinations of adjacent input samples, borders replicate */
    /* the edge sample; the intermediate values carry a scale factor of 4    */
    /* which the final rounding shifts remove.                               */
    WORD16 *pi2_refarray_buffer; /* scratch buffer for horizontally filtered samples */
    WORD32 i4_blk_ctr;
    residual_sampling_ctxt_t *ps_ctxt;
    UNUSED(ps_ref_mb_mode);
    UNUSED(u2_mb_x);
    UNUSED(u2_mb_y);

    ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
    pi2_refarray_buffer = ps_ctxt->pi2_refarray_buffer;

    /* based on transform size the counter and interpolation width and */
    /* height are initialised as follows                               */
    if((i4_ref_tx_size) && (0 != i4_ref_nnz))
    {
        WORD16 *pi2_ref_data_byte;
        WORD32 *pi4_ref_array;
        WORD32 i4_i, i4_j;
        /* ----------- Horizontal Interpolation ---------------- */
        int16x8_t i2_coeff_add_16x8_r0;
        int16x8_t i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1;
        int16x8_t i2_coeff_16x8_sl_r0_0, i2_coeff_16x8_sl_r0_1;
        int16x8_t result_16x8_r0_0, result_16x8_r0_1;
        int16x8_t final_result_16x8_r0_0, final_result_16x8_r0_1;

        int16x8_t i2_coeff_add_16x8_r1;
        int16x8_t i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1;
        int16x8_t i2_coeff_16x8_sl_r1_0, i2_coeff_16x8_sl_r1_1;
        int16x8_t result_16x8_r1_0, result_16x8_r1_1;
        int16x8_t final_result_16x8_r1_0, final_result_16x8_r1_1;
        int16x8x2_t result_16x8x2_t_0;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        /* The 16-bit scratch buffer is reused as a 16x16 array of WORD32  */
        /* intermediates (the vertical pass reads it back as WORD32).      */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

        for(i4_i = 0; i4_i < BLOCK_HEIGHT; i4_i += 2)
        {
            /* two input rows (r0, r1) are filtered per iteration;         */
            /* c[i] and c[i+1] are loaded as overlapping 8-lane vectors    */
            i2_coeff_16x8_r0_0 = vld1q_s16(pi2_ref_data_byte);
            i2_coeff_16x8_r0_1 = vld1q_s16((pi2_ref_data_byte + 1));

            i2_coeff_16x8_r1_0 = vld1q_s16(pi2_ref_data_byte + i4_inp_data_stride);
            i2_coeff_16x8_r1_1 = vld1q_s16((pi2_ref_data_byte + i4_inp_data_stride + 1));

            /* even output = 2*c[i]   + (c[i] + c[i+1]) = 3*c[i]   + c[i+1] */
            /* odd  output = 2*c[i+1] + (c[i] + c[i+1]) = c[i] + 3*c[i+1]   */
            /* i.e. quarter/three-quarter phases, scaled by 4               */
            i2_coeff_add_16x8_r0 = vaddq_s16(i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1);
            i2_coeff_16x8_sl_r0_0 = vshlq_n_s16(i2_coeff_16x8_r0_0, 1);
            i2_coeff_16x8_sl_r0_1 = vshlq_n_s16(i2_coeff_16x8_r0_1, 1);

            i2_coeff_add_16x8_r1 = vaddq_s16(i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1);
            i2_coeff_16x8_sl_r1_0 = vshlq_n_s16(i2_coeff_16x8_r1_0, 1);
            i2_coeff_16x8_sl_r1_1 = vshlq_n_s16(i2_coeff_16x8_r1_1, 1);

            result_16x8_r0_0 = vaddq_s16(i2_coeff_16x8_sl_r0_0, i2_coeff_add_16x8_r0);
            result_16x8_r0_1 = vaddq_s16(i2_coeff_16x8_sl_r0_1, i2_coeff_add_16x8_r0);

            result_16x8_r1_0 = vaddq_s16(i2_coeff_16x8_sl_r1_0, i2_coeff_add_16x8_r1);
            result_16x8_r1_1 = vaddq_s16(i2_coeff_16x8_sl_r1_1, i2_coeff_add_16x8_r1);

            /* interleave even/odd phase results into raster order */
            result_16x8x2_t_0 = vzipq_s16(result_16x8_r0_0, result_16x8_r0_1);
            final_result_16x8_r0_0 = result_16x8x2_t_0.val[0];
            final_result_16x8_r0_1 = result_16x8x2_t_0.val[1];

            result_16x8x2_t_0 = vzipq_s16(result_16x8_r1_0, result_16x8_r1_1);
            final_result_16x8_r1_0 = result_16x8x2_t_0.val[0];
            final_result_16x8_r1_1 = result_16x8x2_t_0.val[1];

            /* widen to 32 bit and store interior samples at offsets 1..14; */
            /* border samples [0] and [15] replicate the edge residual with */
            /* weight 4 (<< 2) to keep the common scale factor              */
            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8_r0_0)));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8_r0_0)));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8_r0_1)));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8_r0_1)));
            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            /* advance to row r1 so the border loads below read row r1 */
            pi2_ref_data_byte += i4_inp_data_stride;

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8_r1_0)));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8_r1_0)));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8_r1_1)));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8_r1_1)));

            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
        }
        /* ----------- Vertical Interpolation ---------------- */
        /* intermediates are 4x the interpolated value: the first and last   */
        /* output rows divide by 4 with rounding (>> 2); interior row pairs  */
        /* use (3,1)/(1,3) weights and divide by 16 with rounding (>> 4).    */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

        {
            WORD32 *pi4_ref_array_temp;
            WORD16 *pi2_out;
            int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r1_3,
                i4_horz_samp_32x4_r1_4;
            int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2, i4_horz_samp_32x4_r2_3,
                i4_horz_samp_32x4_r2_4;

            int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2, i4_horz_res_32x4_r1_3,
                i4_horz_res_32x4_r1_4;
            int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2, i4_horz_res_32x4_r2_3,
                i4_horz_res_32x4_r2_4;
            int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2, i4_horz_res_32x4_r3_3,
                i4_horz_res_32x4_r3_4;
            int32x4_t horz_add_32x4_r2_1, horz_add_32x4_r2_2, horz_add_32x4_r2_3,
                horz_add_32x4_r2_4;

            int16x8_t comb_horz_16x8_1, comb_horz_16x8_2, comb_horz_16x8_3, comb_horz_16x8_4;
            pi4_ref_array_temp = pi4_ref_array;
            pi2_out = pi2_out_res;

            /* one full 16-sample row of intermediates per load group */
            i4_horz_samp_32x4_r1_1 = vld1q_s32(pi4_ref_array_temp);
            i4_horz_samp_32x4_r1_2 = vld1q_s32(pi4_ref_array_temp + 4);
            i4_horz_samp_32x4_r1_3 = vld1q_s32(pi4_ref_array_temp + 8);
            i4_horz_samp_32x4_r1_4 = vld1q_s32(pi4_ref_array_temp + 12);

            /* populate the first inter sample (top border row, scale / 4) */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            pi2_out += i4_out_res_stride;

            /* 7 input-row pairs -> 14 interior output rows */
            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                pi4_ref_array_temp += MB_WIDTH;
                i4_horz_samp_32x4_r2_1 = vld1q_s32(pi4_ref_array_temp);
                i4_horz_samp_32x4_r2_2 = vld1q_s32(pi4_ref_array_temp + 4);
                i4_horz_samp_32x4_r2_3 = vld1q_s32(pi4_ref_array_temp + 8);
                i4_horz_samp_32x4_r2_4 = vld1q_s32(pi4_ref_array_temp + 12);

                horz_add_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r2_1);
                horz_add_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r2_2);
                horz_add_32x4_r2_3 = vaddq_s32(i4_horz_samp_32x4_r1_3, i4_horz_samp_32x4_r2_3);
                horz_add_32x4_r2_4 = vaddq_s32(i4_horz_samp_32x4_r1_4, i4_horz_samp_32x4_r2_4);

                /* row A: 3*upper + lower */
                i4_horz_res_32x4_r2_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r2_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r2_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r2_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_4, 1), horz_add_32x4_r2_4);

                /* row B: upper + 3*lower */
                i4_horz_res_32x4_r3_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r3_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r3_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_4, 1), horz_add_32x4_r2_4);

                /* remove the combined 16x scale with rounding */
                i4_horz_res_32x4_r2_1 = vrshrq_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_32x4_r2_2 = vrshrq_n_s32(i4_horz_res_32x4_r2_2, 4);
                i4_horz_res_32x4_r2_3 = vrshrq_n_s32(i4_horz_res_32x4_r2_3, 4);
                i4_horz_res_32x4_r2_4 = vrshrq_n_s32(i4_horz_res_32x4_r2_4, 4);

                i4_horz_res_32x4_r3_1 = vrshrq_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_32x4_r3_2 = vrshrq_n_s32(i4_horz_res_32x4_r3_2, 4);
                i4_horz_res_32x4_r3_3 = vrshrq_n_s32(i4_horz_res_32x4_r3_3, 4);
                i4_horz_res_32x4_r3_4 = vrshrq_n_s32(i4_horz_res_32x4_r3_4, 4);

                comb_horz_16x8_1 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_1),
                                                vmovn_s32(i4_horz_res_32x4_r2_2));
                comb_horz_16x8_2 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_3),
                                                vmovn_s32(i4_horz_res_32x4_r2_4));

                comb_horz_16x8_3 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_1),
                                                vmovn_s32(i4_horz_res_32x4_r3_2));
                comb_horz_16x8_4 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_3),
                                                vmovn_s32(i4_horz_res_32x4_r3_4));

                /* populate 2 samples based on current coeffs */
                vst1q_s16(pi2_out, comb_horz_16x8_1);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_2);
                pi2_out += i4_out_res_stride;

                vst1q_s16(pi2_out, comb_horz_16x8_3);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_4);
                pi2_out += i4_out_res_stride;

                /* store the coeff 2 to coeff 1 */
                /* (used in next iteration) */
                i4_horz_samp_32x4_r1_1 = i4_horz_samp_32x4_r2_1;
                i4_horz_samp_32x4_r1_2 = i4_horz_samp_32x4_r2_2;
                i4_horz_samp_32x4_r1_3 = i4_horz_samp_32x4_r2_3;
                i4_horz_samp_32x4_r1_4 = i4_horz_samp_32x4_r2_4;
            }

            /* populate the first inter sample (bottom border row, scale / 4) */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            /* horizontal loop updates */
            pi4_ref_array++;
            pi2_out_res++;
        }
    }
    else
    {
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                         */
        /* (four 4x4 sub-blocks; each produces an 8x8 output region)          */
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* if reference layer is not coded then no processing */
            if(0 != (i4_ref_nnz & 0x1))
            {
                int16x8_t i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_1;
                int16x8_t i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_1;
                int16x8_t i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_1;
                int16x8_t i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_1;
                int16x8_t i2_add_16x8_r0_0;
                int16x8_t i2_add_16x8_r1_0;
                int16x8_t i2_add_16x8_r2_0;
                int16x8_t i2_add_16x8_r3_0;
                int16x8_t i2_res_16x8_r0_0, i2_res_16x8_r0_1;
                int16x8_t i2_res_16x8_r1_0, i2_res_16x8_r1_1;
                int16x8_t i2_res_16x8_r2_0, i2_res_16x8_r2_1;
                int16x8_t i2_res_16x8_r3_0, i2_res_16x8_r3_1;

                /* ------------- Horizontal interpolation (4 -> 8) -------- */
                /* load the four input rows of the 4x4 sub-block            */
                i2_coeff1_16x8_r0_0 = vld1q_s16(pi2_inp_data);
                i2_coeff1_16x8_r1_0 = vld1q_s16(pi2_inp_data + i4_inp_data_stride);
                i2_coeff1_16x8_r2_0 = vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1));
                i2_coeff1_16x8_r3_0 =
                    vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1) + i4_inp_data_stride);

                /* c[i+1] neighbours, obtained by a 1-lane rotate */
                i2_coeff1_16x8_r0_1 = vextq_s16(i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_1 = vextq_s16(i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_1 = vextq_s16(i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_1 = vextq_s16(i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_0, 1);

                i2_add_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_coeff1_16x8_r0_0);
                i2_add_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_coeff1_16x8_r1_0);
                i2_add_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_coeff1_16x8_r2_0);
                i2_add_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_coeff1_16x8_r3_0);

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                i2_coeff1_16x8_r0_1 = vshlq_n_s16(i2_coeff1_16x8_r0_1, 1);
                i2_coeff1_16x8_r1_1 = vshlq_n_s16(i2_coeff1_16x8_r1_1, 1);
                i2_coeff1_16x8_r2_1 = vshlq_n_s16(i2_coeff1_16x8_r2_1, 1);
                i2_coeff1_16x8_r3_1 = vshlq_n_s16(i2_coeff1_16x8_r3_1, 1);

                /* even = 3*c[i] + c[i+1], odd = c[i] + 3*c[i+1] (scaled by 4) */
                i2_res_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_0, i2_add_16x8_r0_0);
                i2_res_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_0, i2_add_16x8_r1_0);
                i2_res_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_0, i2_add_16x8_r2_0);
                i2_res_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_0, i2_add_16x8_r3_0);

                i2_res_16x8_r0_1 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_add_16x8_r0_0);
                i2_res_16x8_r1_1 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_add_16x8_r1_0);
                i2_res_16x8_r2_1 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_add_16x8_r2_0);
                i2_res_16x8_r3_1 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_add_16x8_r3_0);

                /* interleave even/odd phases into raster order (low halves) */
                i2_res_16x8_r0_0 = vzipq_s16(i2_res_16x8_r0_0, i2_res_16x8_r0_1).val[0];
                i2_res_16x8_r1_0 = vzipq_s16(i2_res_16x8_r1_0, i2_res_16x8_r1_1).val[0];
                i2_res_16x8_r2_0 = vzipq_s16(i2_res_16x8_r2_0, i2_res_16x8_r2_1).val[0];
                i2_res_16x8_r3_0 = vzipq_s16(i2_res_16x8_r3_0, i2_res_16x8_r3_1).val[0];

                /* border values at 4x scale (total shift of 2 from the two   */
                /* vshlq_n_s16 calls on this register)                        */
                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                /* store each 8-wide row: interior at +1, borders from       */
                /* lane 0 (left, 4*c0) and lane 3 (right, 4*c3)              */
                vst1q_s16(pi2_refarray_buffer + 1, i2_res_16x8_r0_0);
                vst1q_lane_s16(pi2_refarray_buffer, i2_coeff1_16x8_r0_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 7, i2_coeff1_16x8_r0_0, 3);

                vst1q_s16(pi2_refarray_buffer + 9, i2_res_16x8_r1_0);
                vst1q_lane_s16(pi2_refarray_buffer + 8, i2_coeff1_16x8_r1_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 15, i2_coeff1_16x8_r1_0, 3);

                vst1q_s16(pi2_refarray_buffer + 17, i2_res_16x8_r2_0);
                vst1q_lane_s16(pi2_refarray_buffer + 16, i2_coeff1_16x8_r2_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 23, i2_coeff1_16x8_r2_0, 3);

                vst1q_s16(pi2_refarray_buffer + 25, i2_res_16x8_r3_0);
                vst1q_lane_s16(pi2_refarray_buffer + 24, i2_coeff1_16x8_r3_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 31, i2_coeff1_16x8_r3_0, 3);

                /* ------------- Vertical interpolation (4 -> 8) ---------- */
                {
                    int16x4_t i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r0_2;
                    int16x4_t i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r1_2;
                    int16x4_t i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r2_2;
                    int16x4_t i4_horz_samp_16x4_r3_1, i4_horz_samp_16x4_r3_2;

                    int32x4_t i4_horz_samp_32x4_r0_1, i4_horz_samp_32x4_r0_2;
                    int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2;
                    int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2;
                    int32x4_t i4_horz_samp_32x4_r3_1, i4_horz_samp_32x4_r3_2;

                    int32x4_t i4_horz_add_32x4_r1_1, i4_horz_add_32x4_r1_2;
                    int32x4_t i4_horz_add_32x4_r2_1, i4_horz_add_32x4_r2_2;
                    int32x4_t i4_horz_add_32x4_r3_1, i4_horz_add_32x4_r3_2;

                    int16x4_t i4_horz_res_16x4_r0_1, i4_horz_res_16x4_r0_2;
                    int16x4_t i4_horz_res_16x4_r1_1, i4_horz_res_16x4_r1_2;
                    int16x4_t i4_horz_res_16x4_r2_1, i4_horz_res_16x4_r2_2;
                    int16x4_t i4_horz_res_16x4_r3_1, i4_horz_res_16x4_r3_2;
                    int16x4_t i4_horz_res_16x4_r4_1, i4_horz_res_16x4_r4_2;
                    int16x4_t i4_horz_res_16x4_r5_1, i4_horz_res_16x4_r5_2;
                    int16x4_t i4_horz_res_16x4_r6_1, i4_horz_res_16x4_r6_2;
                    int16x4_t i4_horz_res_16x4_r7_1, i4_horz_res_16x4_r7_2;

                    int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2;
                    int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2;
                    int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2;
                    int32x4_t i4_horz_res_32x4_r4_1, i4_horz_res_32x4_r4_2;
                    int32x4_t i4_horz_res_32x4_r5_1, i4_horz_res_32x4_r5_2;
                    int32x4_t i4_horz_res_32x4_r6_1, i4_horz_res_32x4_r6_2;

                    /* reload the 4 horizontally filtered rows (8 samples each) */
                    i4_horz_samp_16x4_r0_1 = vld1_s16(pi2_refarray_buffer);
                    i4_horz_samp_16x4_r0_2 = vld1_s16(pi2_refarray_buffer + 4);

                    i4_horz_samp_16x4_r1_1 = vld1_s16(pi2_refarray_buffer + 8);
                    i4_horz_samp_16x4_r1_2 = vld1_s16(pi2_refarray_buffer + 12);

                    i4_horz_samp_16x4_r2_1 = vld1_s16(pi2_refarray_buffer + 16);
                    i4_horz_samp_16x4_r2_2 = vld1_s16(pi2_refarray_buffer + 20);

                    i4_horz_samp_16x4_r3_1 = vld1_s16(pi2_refarray_buffer + 24);
                    i4_horz_samp_16x4_r3_2 = vld1_s16(pi2_refarray_buffer + 28);

                    /* top border output row: nearest row, scale / 4 with rounding */
                    i4_horz_res_16x4_r0_1 = vrshr_n_s16(i4_horz_samp_16x4_r0_1, 2);
                    i4_horz_res_16x4_r0_2 = vrshr_n_s16(i4_horz_samp_16x4_r0_2, 2);

                    /* pairwise sums of adjacent rows, widened to 32 bit */
                    i4_horz_add_32x4_r1_1 =
                        vaddl_s16(i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r1_1);
                    i4_horz_add_32x4_r1_2 =
                        vaddl_s16(i4_horz_samp_16x4_r0_2, i4_horz_samp_16x4_r1_2);

                    i4_horz_add_32x4_r2_1 =
                        vaddl_s16(i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r2_1);
                    i4_horz_add_32x4_r2_2 =
                        vaddl_s16(i4_horz_samp_16x4_r1_2, i4_horz_samp_16x4_r2_2);

                    i4_horz_add_32x4_r3_1 =
                        vaddl_s16(i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r3_1);
                    i4_horz_add_32x4_r3_2 =
                        vaddl_s16(i4_horz_samp_16x4_r2_2, i4_horz_samp_16x4_r3_2);

                    i4_horz_samp_32x4_r0_1 = vshll_n_s16(i4_horz_samp_16x4_r0_1, 1);
                    i4_horz_samp_32x4_r0_2 = vshll_n_s16(i4_horz_samp_16x4_r0_2, 1);

                    i4_horz_samp_32x4_r1_1 = vshll_n_s16(i4_horz_samp_16x4_r1_1, 1);
                    i4_horz_samp_32x4_r1_2 = vshll_n_s16(i4_horz_samp_16x4_r1_2, 1);

                    i4_horz_samp_32x4_r2_1 = vshll_n_s16(i4_horz_samp_16x4_r2_1, 1);
                    i4_horz_samp_32x4_r2_2 = vshll_n_s16(i4_horz_samp_16x4_r2_2, 1);

                    i4_horz_samp_32x4_r3_1 = vshll_n_s16(i4_horz_samp_16x4_r3_1, 1);
                    i4_horz_samp_32x4_r3_2 = vshll_n_s16(i4_horz_samp_16x4_r3_2, 1);

                    /* interior rows: (3,1)/(1,3) weights of adjacent rows */
                    i4_horz_res_32x4_r1_1 =
                        vaddq_s32(i4_horz_samp_32x4_r0_1, i4_horz_add_32x4_r1_1);
                    i4_horz_res_32x4_r1_2 =
                        vaddq_s32(i4_horz_samp_32x4_r0_2, i4_horz_add_32x4_r1_2);

                    i4_horz_res_32x4_r2_1 =
                        vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r1_1);
                    i4_horz_res_32x4_r2_2 =
                        vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r1_2);

                    i4_horz_res_32x4_r3_1 =
                        vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r2_1);
                    i4_horz_res_32x4_r3_2 =
                        vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r2_2);

                    i4_horz_res_32x4_r4_1 =
                        vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r2_1);
                    i4_horz_res_32x4_r4_2 =
                        vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r2_2);

                    i4_horz_res_32x4_r5_1 =
                        vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r3_1);
                    i4_horz_res_32x4_r5_2 =
                        vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r3_2);

                    i4_horz_res_32x4_r6_1 =
                        vaddq_s32(i4_horz_samp_32x4_r3_1, i4_horz_add_32x4_r3_1);
                    i4_horz_res_32x4_r6_2 =
                        vaddq_s32(i4_horz_samp_32x4_r3_2, i4_horz_add_32x4_r3_2);

                    /* narrow back to 16 bit, removing the 16x scale with     */
                    /* rounding and saturation                                */
                    i4_horz_res_16x4_r1_1 = vqrshrn_n_s32(i4_horz_res_32x4_r1_1, 4);
                    i4_horz_res_16x4_r1_2 = vqrshrn_n_s32(i4_horz_res_32x4_r1_2, 4);

                    i4_horz_res_16x4_r2_1 = vqrshrn_n_s32(i4_horz_res_32x4_r2_1, 4);
                    i4_horz_res_16x4_r2_2 = vqrshrn_n_s32(i4_horz_res_32x4_r2_2, 4);

                    i4_horz_res_16x4_r3_1 = vqrshrn_n_s32(i4_horz_res_32x4_r3_1, 4);
                    i4_horz_res_16x4_r3_2 = vqrshrn_n_s32(i4_horz_res_32x4_r3_2, 4);

                    i4_horz_res_16x4_r4_1 = vqrshrn_n_s32(i4_horz_res_32x4_r4_1, 4);
                    i4_horz_res_16x4_r4_2 = vqrshrn_n_s32(i4_horz_res_32x4_r4_2, 4);

                    i4_horz_res_16x4_r5_1 = vqrshrn_n_s32(i4_horz_res_32x4_r5_1, 4);
                    i4_horz_res_16x4_r5_2 = vqrshrn_n_s32(i4_horz_res_32x4_r5_2, 4);

                    i4_horz_res_16x4_r6_1 = vqrshrn_n_s32(i4_horz_res_32x4_r6_1, 4);
                    i4_horz_res_16x4_r6_2 = vqrshrn_n_s32(i4_horz_res_32x4_r6_2, 4);

                    /* bottom border output row: nearest row, scale / 4 */
                    i4_horz_res_16x4_r7_1 = vrshr_n_s16(i4_horz_samp_16x4_r3_1, 2);
                    i4_horz_res_16x4_r7_2 = vrshr_n_s16(i4_horz_samp_16x4_r3_2, 2);

                    /* write the 8x8 output region */
                    vst1_s16(pi2_out_res, i4_horz_res_16x4_r0_1);
                    vst1_s16(pi2_out_res + 4, i4_horz_res_16x4_r0_2);

                    vst1_s16(pi2_out_res + i4_out_res_stride, i4_horz_res_16x4_r1_1);
                    vst1_s16(pi2_out_res + i4_out_res_stride + 4, i4_horz_res_16x4_r1_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride << 1), i4_horz_res_16x4_r2_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride << 1) + 4, i4_horz_res_16x4_r2_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 3), i4_horz_res_16x4_r3_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 3) + 4, i4_horz_res_16x4_r3_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride << 2), i4_horz_res_16x4_r4_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride << 2) + 4, i4_horz_res_16x4_r4_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 5), i4_horz_res_16x4_r5_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 5) + 4, i4_horz_res_16x4_r5_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 6), i4_horz_res_16x4_r6_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 6) + 4, i4_horz_res_16x4_r6_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 7), i4_horz_res_16x4_r7_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 7) + 4, i4_horz_res_16x4_r7_2);

                    pi2_out_res += BLOCK_WIDTH;
                }
            }
            else
            {
                /* sub-block not coded: skip over its 8-wide output region */
                pi2_out_res += BLOCK_WIDTH;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                /* finished the top pair of 4x4 blocks: rewind x, step down  */
                /* one 4-row input band / 8-row output band                  */
                pi2_inp_data -= SUB_BLOCK_WIDTH;
                pi2_inp_data += (i4_inp_data_stride * SUB_BLOCK_HEIGHT);
                pi2_out_res -= MB_WIDTH;
                pi2_out_res += (i4_out_res_stride * BLOCK_HEIGHT);
                /* extra shift skips nnz bits not belonging to this 8x8      */
                /* (NOTE(review): assumes the nnz packing places the lower   */
                /* 4x4 row at bits 4..5 — confirm against the caller)        */
                i4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += SUB_BLOCK_WIDTH;
            }

            /* one nnz bit consumed per 4x4 sub-block */
            i4_ref_nnz >>= 1;
        } /* end of loop over all the blocks */
    }

    return;
}
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_interpolate_residual_neonintr */ |
| /* */ |
/*  Description   : this function does the upsampling of residuals.          */
| /* */ |
| /* Inputs : pv_residual_samp_ctxt : Residual upsampling context */ |
| /* pu1_inp_data : input 8 bit data pointer */ |
| /* i4_inp_data_stride : input buffer stride */ |
| /* pi2_out_res : output 16 bit buffer pointer */ |
| /* i4_out_res_stride : Output buffer stride */ |
| /* pu1_inp_bitmap : input packed sign bit data pointer */ |
| /* i4_inp_bitmap_stride : sign bit buffer stride */ |
| /* ps_ref_mb_mode : reference mb mode pointer of base layer */ |
| /* ps_coord : mb co-ordinate pointer */ |
| /* Globals : none */ |
| /* Processing : it does the upsampling with fixed phase values and */ |
| /* reference layer transform size */ |
| /* Outputs : Upsampled residuals. */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 26 05 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
| |
| void isvcd_interpolate_residual_neonintr(void *pv_residual_samp_ctxt, WORD16 *pi2_out, |
| WORD32 i4_out_stride, WORD32 i4_refarray_wd, |
| UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_chroma_flag) |
| { |
| residual_sampling_ctxt_t *ps_ctxt; |
| residual_samp_map_ctxt_t *ps_map_ctxt; |
| res_lyr_ctxt *ps_lyr_ctxt; |
| ref_pixel_map_t *ps_x_pos_phase; |
| ref_pixel_map_t *ps_y_pos_phase; |
| |
| WORD32 i4_x, i4_y; |
| WORD32 i4_frm_mb_x, i4_frm_mb_y; |
| WORD32 i4_temp_array_ht; |
| WORD32 i4_mb_wd; |
| WORD32 i4_mb_ht; |
| WORD16 *pi2_ref_array; |
| UWORD8 *pu1_ref_x_ptr_incr, *pu1_ref_y_ptr_incr; |
| |
| UWORD8 arr_y_ref_pos[16] = {0}; |
| UWORD8 arr_x_ref_pos[16] = {0}; |
| UWORD8 arr_x_ref_pos_low[16] = {0}; |
| UWORD8 arr_x_phase[16] = {0}; |
| UWORD8 arr_y_phase[16] = {0}; |
| UWORD8 *pi1_y_ref_pos; |
| UWORD8 *pi1_x_ref_pos; |
| UWORD8 *pi1_x_ref_pos_low; |
| UWORD8 *pi1_y_phase; |
| UWORD8 *pi1_x_phase; |
| |
| ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt; |
| ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id]; |
| pi2_ref_array = ps_ctxt->pi2_refarray_buffer; |
| pu1_ref_x_ptr_incr = ps_ctxt->pu1_ref_x_ptr_incr; |
| pu1_ref_y_ptr_incr = ps_ctxt->pu1_ref_y_ptr_incr; |
| |
| if(1 == i4_chroma_flag) |
| ps_map_ctxt = &ps_lyr_ctxt->s_chroma_map_ctxt; |
| else |
| ps_map_ctxt = &ps_lyr_ctxt->s_luma_map_ctxt; |
| |
| i4_mb_wd = MB_WIDTH >> i4_chroma_flag; |
| i4_mb_ht = MB_HEIGHT >> i4_chroma_flag; |
| |
| ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase; |
| ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase; |
| |
| i4_temp_array_ht = i4_mb_ht; |
| i4_frm_mb_y = u2_mb_y * i4_mb_ht; |
| i4_frm_mb_x = u2_mb_x * i4_mb_wd; |
| |
| /* --------------------------------------------------------------------- */ |
| /* Loop for interpolation */ |
| /* --------------------------------------------------------------------- */ |
| if(i4_chroma_flag == 0) |
| { |
| int16x8_t ref_arr_16x8_r0_0, ref_arr_16x8_r0_1; |
| int16x8_t ref_arr_16x8_r1_0, ref_arr_16x8_r1_1; |
| uint8x16_t x_ref_pos_mask_r0_0, x_ref_rnd_mask_r0_0; |
| uint8x16_t u1_incr_8x16_r0_0, x_ref_pos_mask_temp_r0_0, u1_incr_not_8x16_r0_0, |
| u1_y_incr_8x16_r0_0, phs_mask_8x16_0; |
| uint8x16_t u1_incr_8x16_r1_0, x_ref_pos_mask_temp_r1_0, u1_incr_not_8x16_r1_0; |
| int16x8_t ref_arr_temp0_16x8_r0_0, res_16x8_r0_0, vert_res_16x8_r0_0; |
| int16x8_t ref_arr_temp0_16x8_r1_0, res_16x8_r1_0, vert_res_16x8_r1_0; |
| int16x8_t ref_arr_temp1_16x8_r0_0; |
| int16x8_t ref_arr_temp1_16x8_r1_0; |
| |
| uint8x16_t x_ref_pos_mask_temp_r0_1, u1_incr_not_8x16_r0_1; |
| uint8x16_t x_ref_pos_mask_temp_r1_1, u1_incr_not_8x16_r1_1; |
| int16x8_t ref_arr_temp0_16x8_r0_1, res_16x8_r0_1, vert_res_16x8_r0_1; |
| int16x8_t ref_arr_temp0_16x8_r1_1, res_16x8_r1_1, vert_res_16x8_r1_1; |
| int16x8_t ref_arr_temp1_16x8_r0_1; |
| int16x8_t ref_arr_temp1_16x8_r1_1; |
| |
| uint16x8_t u1_y_incr_16x8_r0_0, u1_y_incr_16x8_r0_1; |
| |
| uint8x16_t u1_incr_not_8x16_r0_0_even, u1_incr_not_8x16_r1_0_even, |
| x_ref_pos_mask_temp_r0_0_even, x_ref_pos_mask_temp_r1_0_even; |
| uint8x16_t u1_incr_not_8x16_r0_0_odd, u1_incr_not_8x16_r1_0_odd, |
| x_ref_pos_mask_temp_r0_0_odd, x_ref_pos_mask_temp_r1_0_odd; |
| uint8x16x2_t u1_incr_not_8x16_2, x_ref_pos_mask_temp; |
| int16x8_t prev_res_16x8_r0_0; |
| int16x8_t prev_res_16x8_r1_0; |
| int16x8_t prev_res_16x8_r0_1; |
| int16x8_t prev_res_16x8_r1_1; |
| uint8x8x2_t u1_incr_8x8x2_t; |
| uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1; |
| uint16x8_t u1_prev_y_incr_16x8_r0_0; |
| uint16x8_t u1_prev_y_incr_16x8_r0_1; |
| |
| WORD32 zero_r0_r1 = 0; |
| |
| int32x4_t res_32x4_l_0, res_32x4_h_0; |
| int32x4_t res_32x4_l_1, res_32x4_h_1; |
| int16x8_t res_16x8_l, res_16x8_h; |
| uint16x8_t phs_mask_16x8_0, phs_mask_16x8_1; |
| int16x8_t const_16_16x8, phs_mask_16min_16x8_0; |
| int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs; |
| uint8x16_t phs_mask_div8_8x16_0, mid_indx_8x16; |
| int16x8_t phs_mask_16min_16x8_1; |
| uint16x8_t ones = vdupq_n_u16(0xFFFF); |
| uint8x16_t const_ones = vdupq_n_u8(1); |
| uint8x8x2_t u1_temp_8x8x2_t; |
| uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1; |
| |
| WORD16 *pi2_ref_array_temp; |
| UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp; |
| WORD32 i4_y_phase; |
| WORD32 out_stride_temp; |
| WORD32 strt_indx_h = 0; |
| |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| arr_y_phase[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase; |
| arr_y_ref_pos[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos); |
| } |
| pi1_y_ref_pos = arr_y_ref_pos; |
| pi1_y_phase = arr_y_phase; |
| |
| strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos); |
| for(i4_x = 0; i4_x < i4_mb_wd; i4_x++) |
| { |
| arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos; |
| arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase; |
| } |
| |
| pi1_x_ref_pos = arr_x_ref_pos; |
| pi1_x_phase = arr_x_phase; |
| |
| phs_mask_8x16_0 = vld1q_u8((pi1_x_phase)); |
| phs_mask_16x8_0 = vmovl_u8(vget_low_u8(phs_mask_8x16_0)); |
| phs_mask_16x8_1 = vmovl_u8(vld1_u8((pi1_x_phase + 8))); |
| x_ref_pos_mask_r0_0 = vld1q_u8((pi1_x_ref_pos)); |
| const_16_16x8 = vdupq_n_s16(16); |
| phs_mask_div8_8x16_0 = vshrq_n_u8(phs_mask_8x16_0, 3); |
| |
| phs_mask_16min_16x8_0 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_0)); |
| phs_mask_16min_16x8_1 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_1)); |
| |
| x_ref_rnd_mask_r0_0 = vaddq_u8(x_ref_pos_mask_r0_0, phs_mask_div8_8x16_0); |
| mid_indx_8x16 = vdupq_n_u8((strt_indx_h << 1)); |
| |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1])) |
| { |
| if(!zero_r0_r1) |
| { |
| res_16x8_l = vdupq_n_s16(0); |
| res_16x8_h = vdupq_n_s16(0); |
| |
| out_stride_temp = (i4_y * i4_out_stride); |
| vst1q_s16((pi2_out + out_stride_temp), res_16x8_l); |
| vst1q_s16((pi2_out + out_stride_temp + 8), res_16x8_h); |
| continue; |
| } |
| |
| res_16x8_r0_0 = prev_res_16x8_r0_0; |
| res_16x8_r1_0 = prev_res_16x8_r1_0; |
| res_16x8_r0_1 = prev_res_16x8_r0_1; |
| res_16x8_r1_1 = prev_res_16x8_r1_1; |
| |
| u1_y_incr_16x8_r0_0 = u1_prev_y_incr_16x8_r0_0; |
| u1_y_incr_16x8_r0_1 = u1_prev_y_incr_16x8_r0_1; |
| } |
| else |
| { |
| pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd); |
| pu1_ref_x_ptr_incr_temp = |
| pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd); |
| ref_arr_16x8_r0_0 = vld1q_s16((pi2_ref_array_temp)); |
| ref_arr_16x8_r1_0 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd)); |
| ref_arr_16x8_r0_1 = vld1q_s16((pi2_ref_array_temp + strt_indx_h)); |
| ref_arr_16x8_r1_1 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd + strt_indx_h)); |
| |
| dup_val_1 = vabsq_s16(ref_arr_16x8_r0_0); |
| dup_val_2 = vabsq_s16(ref_arr_16x8_r1_0); |
| dup_val_3 = vabsq_s16(ref_arr_16x8_r0_1); |
| dup_val_4 = vabsq_s16(ref_arr_16x8_r1_1); |
| dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2); |
| dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4); |
| dup_abs = vqaddq_s16(dup_val_1, dup_val_5); |
| zero_r0_r1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || |
| dup_abs[5] || dup_abs[6] || dup_abs[7]; |
| if(zero_r0_r1) |
| { |
| u1_incr_8x16_r0_0 = vld1q_u8((pu1_ref_x_ptr_incr_temp)); |
| u1_incr_8x16_r1_0 = vld1q_u8((pu1_ref_x_ptr_incr_temp + i4_refarray_wd)); |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_incr_8x16_r0_0); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_incr_8x16_r0_0); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_incr_8x16_r1_0); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_incr_8x16_r1_0); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x16_r1_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| u1_incr_not_8x16_r0_0 = vbicq_u8(phs_mask_div8_8x16_0, u1_incr_8x16_r0_0); |
| u1_incr_not_8x16_r1_0 = vbicq_u8(phs_mask_div8_8x16_0, u1_incr_8x16_r1_0); |
| |
| u1_incr_not_8x16_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, x_ref_pos_mask_r0_0); |
| u1_incr_not_8x16_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, x_ref_pos_mask_r0_0); |
| |
| x_ref_pos_mask_temp_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, u1_incr_8x16_r0_0); |
| x_ref_pos_mask_temp_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, u1_incr_8x16_r1_0); |
| |
| u1_incr_not_8x16_r0_0_even = vshlq_n_u8(u1_incr_not_8x16_r0_0, 1); |
| u1_incr_not_8x16_r1_0_even = vshlq_n_u8(u1_incr_not_8x16_r1_0, 1); |
| x_ref_pos_mask_temp_r0_0_even = vshlq_n_u8(x_ref_pos_mask_temp_r0_0, 1); |
| x_ref_pos_mask_temp_r1_0_even = vshlq_n_u8(x_ref_pos_mask_temp_r1_0, 1); |
| |
| u1_incr_not_8x16_r0_0_odd = vaddq_u8(u1_incr_not_8x16_r0_0_even, const_ones); |
| u1_incr_not_8x16_r1_0_odd = vaddq_u8(u1_incr_not_8x16_r1_0_even, const_ones); |
| x_ref_pos_mask_temp_r0_0_odd = |
| vaddq_u8(x_ref_pos_mask_temp_r0_0_even, const_ones); |
| x_ref_pos_mask_temp_r1_0_odd = |
| vaddq_u8(x_ref_pos_mask_temp_r1_0_even, const_ones); |
| |
| u1_incr_not_8x16_2 = |
| vzipq_u8(u1_incr_not_8x16_r0_0_even, u1_incr_not_8x16_r0_0_odd); |
| |
| u1_incr_not_8x16_r0_0 = u1_incr_not_8x16_2.val[0]; |
| u1_incr_not_8x16_r0_1 = u1_incr_not_8x16_2.val[1]; |
| |
| u1_incr_not_8x16_2 = |
| vzipq_u8(u1_incr_not_8x16_r1_0_even, u1_incr_not_8x16_r1_0_odd); |
| u1_incr_not_8x16_r1_0 = u1_incr_not_8x16_2.val[0]; |
| u1_incr_not_8x16_r1_1 = u1_incr_not_8x16_2.val[1]; |
| |
| x_ref_pos_mask_temp = |
| vzipq_u8(x_ref_pos_mask_temp_r0_0_even, x_ref_pos_mask_temp_r0_0_odd); |
| x_ref_pos_mask_temp_r0_0 = x_ref_pos_mask_temp.val[0]; |
| x_ref_pos_mask_temp_r0_1 = x_ref_pos_mask_temp.val[1]; |
| |
| x_ref_pos_mask_temp = |
| vzipq_u8(x_ref_pos_mask_temp_r1_0_even, x_ref_pos_mask_temp_r1_0_odd); |
| x_ref_pos_mask_temp_r1_0 = x_ref_pos_mask_temp.val[0]; |
| x_ref_pos_mask_temp_r1_1 = x_ref_pos_mask_temp.val[1]; |
| u1_incr_not_8x16_r0_1 = vsubq_u8(u1_incr_not_8x16_r0_1, mid_indx_8x16); |
| u1_incr_not_8x16_r1_1 = vsubq_u8(u1_incr_not_8x16_r1_1, mid_indx_8x16); |
| x_ref_pos_mask_temp_r0_1 = vsubq_u8(x_ref_pos_mask_temp_r0_1, mid_indx_8x16); |
| x_ref_pos_mask_temp_r1_1 = vsubq_u8(x_ref_pos_mask_temp_r1_1, mid_indx_8x16); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_0)); |
| ref_arr_temp0_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_0)); |
| ref_arr_temp0_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_0)); |
| ref_arr_temp1_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_0)); |
| ref_arr_temp1_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1))); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_1)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_1)); |
| ref_arr_temp0_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1))); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_1)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_1)); |
| ref_arr_temp0_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_1)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_1)); |
| ref_arr_temp1_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_1)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_1)); |
| ref_arr_temp1_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| res_16x8_r0_0 = vmulq_s16(ref_arr_temp0_16x8_r0_0, phs_mask_16min_16x8_0); |
| res_16x8_r1_0 = vmulq_s16(ref_arr_temp0_16x8_r1_0, phs_mask_16min_16x8_0); |
| res_16x8_r0_0 = vmlaq_s16(res_16x8_r0_0, ref_arr_temp1_16x8_r0_0, |
| vreinterpretq_s16_u16(phs_mask_16x8_0)); |
| res_16x8_r1_0 = vmlaq_s16(res_16x8_r1_0, ref_arr_temp1_16x8_r1_0, |
| vreinterpretq_s16_u16(phs_mask_16x8_0)); |
| res_16x8_r0_1 = vmulq_s16(ref_arr_temp0_16x8_r0_1, phs_mask_16min_16x8_1); |
| res_16x8_r1_1 = vmulq_s16(ref_arr_temp0_16x8_r1_1, phs_mask_16min_16x8_1); |
| res_16x8_r0_1 = vmlaq_s16(res_16x8_r0_1, ref_arr_temp1_16x8_r0_1, |
| vreinterpretq_s16_u16(phs_mask_16x8_1)); |
| res_16x8_r1_1 = vmlaq_s16(res_16x8_r1_1, ref_arr_temp1_16x8_r1_1, |
| vreinterpretq_s16_u16(phs_mask_16x8_1)); |
| |
| pu1_ref_y_ptr_incr_temp = |
| pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd); |
| u1_y_incr_8x16_r0_0 = vld1q_u8((pu1_ref_y_ptr_incr_temp)); |
| |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_y_incr_8x16_r0_0); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_y_incr_8x16_r0_0); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_rnd_mask_r0_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_rnd_mask_r0_0)); |
| u1_y_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| u1_y_incr_16x8_r0_0 = vmovl_u8(vget_low_u8(u1_y_incr_8x16_r0_0)); |
| u1_y_incr_16x8_r0_1 = vmovl_u8(vget_high_u8(u1_y_incr_8x16_r0_0)); |
| u1_y_incr_16x8_r0_0 = vtstq_u16(u1_y_incr_16x8_r0_0, ones); |
| u1_y_incr_16x8_r0_1 = vtstq_u16(u1_y_incr_16x8_r0_1, ones); |
| |
| prev_res_16x8_r0_0 = res_16x8_r0_0; |
| prev_res_16x8_r1_0 = res_16x8_r1_0; |
| prev_res_16x8_r0_1 = res_16x8_r0_1; |
| prev_res_16x8_r1_1 = res_16x8_r1_1; |
| |
| u1_prev_y_incr_16x8_r0_0 = u1_y_incr_16x8_r0_0; |
| u1_prev_y_incr_16x8_r0_1 = u1_y_incr_16x8_r0_1; |
| } |
| } |
| |
| if(!zero_r0_r1) |
| { |
| res_16x8_l = vdupq_n_s16(0); |
| res_16x8_h = vdupq_n_s16(0); |
| } |
| else |
| { |
| i4_y_phase = pi1_y_phase[i4_y]; |
| |
| if((i4_y_phase) >> 3) |
| { |
| vert_res_16x8_r0_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r1_0); |
| vert_res_16x8_r1_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r1_0); |
| vert_res_16x8_r0_1 = |
| vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r0_1, res_16x8_r1_1); |
| vert_res_16x8_r1_1 = |
| vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r1_1, res_16x8_r1_1); |
| } |
| else |
| { |
| vert_res_16x8_r0_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r0_0); |
| vert_res_16x8_r1_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r0_0); |
| vert_res_16x8_r0_1 = |
| vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r0_1, res_16x8_r0_1); |
| vert_res_16x8_r1_1 = |
| vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r1_1, res_16x8_r0_1); |
| } |
| |
| res_32x4_l_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_0), 16 - i4_y_phase); |
| res_32x4_l_0 = |
| vmlal_n_s16(res_32x4_l_0, vget_low_s16(vert_res_16x8_r1_0), i4_y_phase); |
| |
| res_32x4_l_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_0), 16 - i4_y_phase); |
| res_32x4_l_1 = |
| vmlal_n_s16(res_32x4_l_1, vget_high_s16(vert_res_16x8_r1_0), i4_y_phase); |
| res_32x4_h_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_1), 16 - i4_y_phase); |
| res_32x4_h_0 = |
| vmlal_n_s16(res_32x4_h_0, vget_low_s16(vert_res_16x8_r1_1), i4_y_phase); |
| res_32x4_h_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_1), 16 - i4_y_phase); |
| res_32x4_h_1 = |
| vmlal_n_s16(res_32x4_h_1, vget_high_s16(vert_res_16x8_r1_1), i4_y_phase); |
| |
| res_32x4_l_0 = vrshrq_n_s32(res_32x4_l_0, 8); |
| res_32x4_l_1 = vrshrq_n_s32(res_32x4_l_1, 8); |
| res_32x4_h_0 = vrshrq_n_s32(res_32x4_h_0, 8); |
| res_32x4_h_1 = vrshrq_n_s32(res_32x4_h_1, 8); |
| |
| res_16x8_l = vcombine_s16(vmovn_s32(res_32x4_l_0), vmovn_s32(res_32x4_l_1)); |
| res_16x8_h = vcombine_s16(vmovn_s32(res_32x4_h_0), vmovn_s32(res_32x4_h_1)); |
| } |
| |
| out_stride_temp = (i4_y * i4_out_stride); |
| vst1q_s16((pi2_out + out_stride_temp), res_16x8_l); |
| vst1q_s16((pi2_out + out_stride_temp + 8), res_16x8_h); |
| } |
| } |
| else |
| { |
| int16x8_t ref_arr_16x8_r0_0; |
| int16x8_t ref_arr_16x8_r1_0; |
| uint8x16_t x_ref_pos_mask_r0_0, x_ref_rnd_mask_r0_0; |
| uint16x8_t u1_incr_16x8_r0_0, u1_incr_mask_16x8_r0_0, phs_mask_16x8_0, |
| u1_incr_not_16x8_r0_0, u1_y_incr_16x8_r0_0; |
| uint16x8_t u1_incr_16x8_r1_0, u1_incr_mask_16x8_r1_0, u1_incr_not_16x8_r1_0; |
| uint8x16_t u1_incr_8x16_r0_0, x_ref_pos_mask_temp_r0_0, u1_incr_mask_8x16_r0_0, |
| u1_incr_not_8x16_r0_0, u1_y_incr_8x16_r0_0; |
| uint8x16_t u1_incr_8x16_r1_0, x_ref_pos_mask_temp_r1_0, u1_incr_mask_8x16_r1_0, |
| u1_incr_not_8x16_r1_0; |
| int16x8_t ref_arr_temp0_16x8_r0_0, res_16x8_r0_0, vert_res_16x8_r0_0; |
| int16x8_t ref_arr_temp0_16x8_r1_0, res_16x8_r1_0, vert_res_16x8_r1_0; |
| int16x8_t ref_arr_temp1_16x8_r0_0; |
| int16x8_t ref_arr_temp1_16x8_r1_0; |
| |
| int32x4_t res_32x4_l_0; |
| int32x4_t res_32x4_l_1; |
| int16x8_t out_16x8_0, out_16x8_1; |
| uint16x8_t phs_mask_div8_16x8_0, phs_mask_div8_msb_16x8_0; |
| int16x8_t const_16_16x8, phs_mask_16min_16x8_0; |
| uint8x8x2_t u1_temp_8x8x2_t; |
| uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1; |
| |
| uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff)); |
| uint16x8_t ones = vdupq_n_u16(0xFFFF); |
| |
| WORD16 *pi2_ref_array_temp; |
| UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp; |
| WORD32 i4_y_phase; |
| uint8x8x2_t u1_incr_8x8x2_t; |
| uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1; |
| int16x8_t prev_res_16x8_r0_0; |
| int16x8_t prev_res_16x8_r1_0; |
| int16x8_t dup_val_1, dup_val_2, dup_abs; |
| uint16x8_t u1_prev_y_incr_16x8_r0_0; |
| |
| WORD32 out_stride_temp; |
| WORD32 zero_r0_r1 = 0; |
| WORD32 i4_x2 = 0; |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase; |
| arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos); |
| } |
| pi1_y_ref_pos = arr_y_ref_pos; |
| pi1_y_phase = arr_y_phase; |
| |
| for(i4_x = 0; i4_x < i4_mb_wd; i4_x++) |
| { |
| arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos; |
| arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase; |
| i4_x2 = i4_x << 1; |
| arr_x_ref_pos_low[i4_x2] = (arr_x_ref_pos[i4_x]) << 1; |
| arr_x_ref_pos_low[i4_x2 + 1] = arr_x_ref_pos_low[i4_x2] + 1; |
| } |
| |
| pi1_x_ref_pos_low = arr_x_ref_pos_low; |
| pi1_x_phase = arr_x_phase; |
| |
| phs_mask_16x8_0 = vmovl_u8(vld1_u8((pi1_x_phase))); |
| x_ref_pos_mask_r0_0 = vld1q_u8((pi1_x_ref_pos_low)); |
| const_16_16x8 = vdupq_n_s16(16); |
| phs_mask_div8_16x8_0 = vshrq_n_u16(phs_mask_16x8_0, 3); |
| phs_mask_div8_msb_16x8_0 = vsliq_n_u16(phs_mask_div8_16x8_0, phs_mask_div8_16x8_0, 8); |
| |
| phs_mask_16min_16x8_0 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_0)); |
| |
| x_ref_rnd_mask_r0_0 = vaddq_u8( |
| x_ref_pos_mask_r0_0, vreinterpretq_u8_u16(vshlq_n_u16(phs_mask_div8_msb_16x8_0, 1))); |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1])) |
| { |
| if(!zero_r0_r1) |
| { |
| res_32x4_l_0 = vdupq_n_s32(0); |
| res_32x4_l_1 = vdupq_n_s32(0); |
| |
| out_stride_temp = (i4_y * i4_out_stride); |
| |
| out_16x8_0 = vld1q_s16(pi2_out + out_stride_temp); |
| out_16x8_1 = vld1q_s16(pi2_out + out_stride_temp + 8); |
| out_16x8_0 = vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_0), |
| out_16x8_0); |
| out_16x8_1 = vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_1), |
| out_16x8_1); |
| vst1q_s16((pi2_out + out_stride_temp), out_16x8_0); |
| vst1q_s16((pi2_out + out_stride_temp + 8), out_16x8_1); |
| continue; |
| } |
| |
| res_16x8_r0_0 = prev_res_16x8_r0_0; |
| res_16x8_r1_0 = prev_res_16x8_r1_0; |
| |
| u1_y_incr_16x8_r0_0 = u1_prev_y_incr_16x8_r0_0; |
| } |
| else |
| { |
| pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd); |
| pu1_ref_x_ptr_incr_temp = |
| pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd); |
| ref_arr_16x8_r0_0 = vld1q_s16((pi2_ref_array_temp)); |
| ref_arr_16x8_r1_0 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd)); |
| |
| dup_val_1 = vabsq_s16(ref_arr_16x8_r0_0); |
| dup_val_2 = vabsq_s16(ref_arr_16x8_r1_0); |
| dup_abs = vqaddq_s16(dup_val_1, dup_val_2); |
| zero_r0_r1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || |
| dup_abs[5] || dup_abs[6] || dup_abs[7]; |
| if(zero_r0_r1) |
| { |
| u1_incr_16x8_r0_0 = (vmovl_u8(vld1_u8((pu1_ref_x_ptr_incr_temp)))); |
| u1_incr_16x8_r1_0 = |
| (vmovl_u8(vld1_u8((pu1_ref_x_ptr_incr_temp + i4_refarray_wd)))); |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_incr_16x8_r0_0)); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(vreinterpretq_u8_u16(u1_incr_16x8_r0_0)); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_incr_16x8_r1_0)); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(vreinterpretq_u8_u16(u1_incr_16x8_r1_0)); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0)); |
| u1_incr_8x16_r1_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| |
| u1_incr_16x8_r0_0 = vreinterpretq_u16_u8(u1_incr_8x16_r0_0); |
| u1_incr_16x8_r1_0 = vreinterpretq_u16_u8(u1_incr_8x16_r1_0); |
| u1_incr_mask_16x8_r0_0 = vsliq_n_u16(u1_incr_16x8_r0_0, u1_incr_16x8_r0_0, 8); |
| u1_incr_mask_16x8_r1_0 = vsliq_n_u16(u1_incr_16x8_r1_0, u1_incr_16x8_r1_0, 8); |
| |
| u1_incr_not_16x8_r0_0 = |
| vbicq_u16(phs_mask_div8_msb_16x8_0, u1_incr_mask_16x8_r0_0); |
| u1_incr_not_16x8_r1_0 = |
| vbicq_u16(phs_mask_div8_msb_16x8_0, u1_incr_mask_16x8_r1_0); |
| |
| u1_incr_mask_8x16_r0_0 = |
| vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_mask_16x8_r0_0, 1)); |
| u1_incr_mask_8x16_r1_0 = |
| vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_mask_16x8_r1_0, 1)); |
| u1_incr_not_8x16_r0_0 = |
| vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_not_16x8_r0_0, 1)); |
| u1_incr_not_8x16_r1_0 = |
| vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_not_16x8_r1_0, 1)); |
| |
| u1_incr_not_8x16_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, x_ref_pos_mask_r0_0); |
| u1_incr_not_8x16_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, x_ref_pos_mask_r0_0); |
| |
| x_ref_pos_mask_temp_r0_0 = |
| vaddq_u8(u1_incr_not_8x16_r0_0, u1_incr_mask_8x16_r0_0); |
| x_ref_pos_mask_temp_r1_0 = |
| vaddq_u8(u1_incr_not_8x16_r1_0, u1_incr_mask_8x16_r1_0); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_0)); |
| ref_arr_temp0_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_0)); |
| ref_arr_temp0_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_0)); |
| ref_arr_temp1_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_0)); |
| ref_arr_temp1_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| res_16x8_r0_0 = vmulq_s16(ref_arr_temp0_16x8_r0_0, phs_mask_16min_16x8_0); |
| res_16x8_r1_0 = vmulq_s16(ref_arr_temp0_16x8_r1_0, phs_mask_16min_16x8_0); |
| res_16x8_r0_0 = vmlaq_s16(res_16x8_r0_0, ref_arr_temp1_16x8_r0_0, |
| vreinterpretq_s16_u16(phs_mask_16x8_0)); |
| res_16x8_r1_0 = vmlaq_s16(res_16x8_r1_0, ref_arr_temp1_16x8_r1_0, |
| vreinterpretq_s16_u16(phs_mask_16x8_0)); |
| |
| pu1_ref_y_ptr_incr_temp = |
| pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd); |
| u1_y_incr_16x8_r0_0 = vmovl_u8(vld1_u8((pu1_ref_y_ptr_incr_temp))); |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_y_incr_16x8_r0_0)); |
| u1_incr_8x8x2_t.val[1] = |
| vget_high_u8(vreinterpretq_u8_u16(u1_y_incr_16x8_r0_0)); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_rnd_mask_r0_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_rnd_mask_r0_0)); |
| u1_y_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| |
| u1_y_incr_16x8_r0_0 = vreinterpretq_u16_u8(u1_y_incr_8x16_r0_0); |
| |
| u1_y_incr_16x8_r0_0 = vtstq_u16(u1_y_incr_16x8_r0_0, ones); |
| |
| prev_res_16x8_r0_0 = res_16x8_r0_0; |
| prev_res_16x8_r1_0 = res_16x8_r1_0; |
| |
| u1_prev_y_incr_16x8_r0_0 = u1_y_incr_16x8_r0_0; |
| } |
| } |
| |
| if(!zero_r0_r1) |
| { |
| res_32x4_l_0 = vdupq_n_s32(0); |
| res_32x4_l_1 = vdupq_n_s32(0); |
| } |
| else |
| { |
| i4_y_phase = pi1_y_phase[i4_y]; |
| |
| if((i4_y_phase) >> 3) |
| { |
| vert_res_16x8_r0_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r1_0); |
| vert_res_16x8_r1_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r1_0); |
| } |
| else |
| { |
| vert_res_16x8_r0_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r0_0); |
| vert_res_16x8_r1_0 = |
| vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r0_0); |
| } |
| res_32x4_l_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_0), 16 - i4_y_phase); |
| res_32x4_l_0 = |
| vmlal_n_s16(res_32x4_l_0, vget_low_s16(vert_res_16x8_r1_0), i4_y_phase); |
| |
| res_32x4_l_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_0), 16 - i4_y_phase); |
| res_32x4_l_1 = |
| vmlal_n_s16(res_32x4_l_1, vget_high_s16(vert_res_16x8_r1_0), i4_y_phase); |
| |
| res_32x4_l_0 = vrshrq_n_s32(res_32x4_l_0, 8); |
| res_32x4_l_1 = vrshrq_n_s32(res_32x4_l_1, 8); |
| } |
| out_stride_temp = (i4_y * i4_out_stride); |
| |
| out_16x8_0 = vld1q_s16(pi2_out + out_stride_temp); |
| out_16x8_1 = vld1q_s16(pi2_out + out_stride_temp + 8); |
| out_16x8_0 = |
| vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_0), out_16x8_0); |
| out_16x8_1 = |
| vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_1), out_16x8_1); |
| vst1q_s16((pi2_out + out_stride_temp), out_16x8_0); |
| vst1q_s16((pi2_out + out_stride_temp + 8), out_16x8_1); |
| } |
| } |
| return; |
| } /* End of Interpolation Function */ |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_residual_reflayer_const_non_boundary_mb_neonintr */ |
| /* */ |
| /* Description : */ |
| /* */ |
| /* Inputs : */ |
| /* Globals : none */ |
| /* Processing : */ |
| /* */ |
| /* Outputs : none */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 25 11 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
| |
| void isvcd_residual_reflayer_const_non_boundary_mb_neonintr( |
| WORD16 *pi2_inp_data, WORD32 i4_inp_data_stride, WORD16 *pi2_ref_array, WORD32 i4_refarray_wd, |
| WORD32 i4_refarray_ht, WORD32 i4_ref_mb_type_q0, WORD32 i4_ref_mb_type_q1, |
| WORD32 i4_ref_mb_type_q2, WORD32 i4_ref_mb_type_q3, WORD32 i4_mb_quard1_part_x, |
| WORD32 i4_mb_quard1_part_y, WORD32 i4_chroma_flag) |
| { |
| WORD32 i4_y; |
| WORD16 *pi2_ref_data_byte; |
| WORD16 *pi2_ref_array_temp; |
| |
| if(i4_chroma_flag == 0) |
| { |
| WORD16 index_0[8] = {0, 1, 2, 3, 4, 5, 6, 7}; |
| int16x8_t ref_mb_type_16x8_q0, ref_mb_type_16x8_q1, ref_mb_type_16x8_q2, |
| ref_mb_type_16x8_q3, mb_quard1_part_x_16x8; |
| int16x8_t ref_mb_type_16x8_0, ref_mb_type_16x8_1; |
| int16x8_t ref_mb_type_16x8_low_0, ref_mb_type_16x8_low_1; |
| uint16x8_t mb_type_mask_16x8_0, mb_type_mask_16x8_1; |
| uint16x8_t mb_type_mask_16x8_low_0, mb_type_mask_16x8_low_1; |
| uint16x8_t mask_16x8_0; |
| |
| int16x8_t index_arr_0; |
| int16x8_t inp_data_16x8_0, inp_data_16x8_1; |
| int16x8_t res_16x8_0, res_16x8_1; |
| int16x8_t one_16x8 = vdupq_n_s16(1); |
| int16x8_t zero_16x8 = vdupq_n_s16(0); |
| |
| index_arr_0 = vld1q_s16(&index_0[0]); |
| |
| ref_mb_type_16x8_q0 = vdupq_n_s16(i4_ref_mb_type_q0); |
| ref_mb_type_16x8_q1 = vdupq_n_s16(i4_ref_mb_type_q1); |
| ref_mb_type_16x8_q2 = vdupq_n_s16(i4_ref_mb_type_q2); |
| ref_mb_type_16x8_q3 = vdupq_n_s16(i4_ref_mb_type_q3); |
| if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht)) |
| { |
| // Quard 0 |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_1 = ref_mb_type_16x8_q0; |
| mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8); |
| mb_type_mask_16x8_1 = mb_type_mask_16x8_0; |
| } |
| else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) && |
| (i4_mb_quard1_part_x < i4_refarray_wd)) |
| { |
| // Quard 0 & 1 |
| if(i4_mb_quard1_part_x == 8) |
| { |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_1 = ref_mb_type_16x8_q1; |
| } |
| else if(i4_mb_quard1_part_x < 8) |
| { |
| mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x)); |
| mask_16x8_0 = |
| vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0 |
| |
| ref_mb_type_16x8_0 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1); |
| ref_mb_type_16x8_1 = ref_mb_type_16x8_q1; |
| } |
| else |
| { |
| mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x - 8)); |
                mask_16x8_0 =
                    vcleq_s16(index_arr_0, mb_quard1_part_x_16x8);  // return 1 if a<=b, else 0
| |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_1 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1); |
| } |
| |
| mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8); |
| mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8); |
| } |
| else |
| { |
| if(i4_mb_quard1_part_x >= i4_refarray_wd) |
| { |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_1 = ref_mb_type_16x8_q0; |
| |
| ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2; |
| ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q2; |
| } |
| else |
| { |
| // Quard 0, 1, 2, 3 |
| if(i4_mb_quard1_part_x == 8) |
| { |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_1 = ref_mb_type_16x8_q1; |
| |
| ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2; |
| ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q3; |
| } |
| else if(i4_mb_quard1_part_x < 8) |
| { |
| mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x)); |
| mask_16x8_0 = |
| vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0 |
| |
| ref_mb_type_16x8_0 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1); |
| ref_mb_type_16x8_1 = ref_mb_type_16x8_q1; |
| |
| ref_mb_type_16x8_low_0 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3); |
| ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q3; |
| } |
| else |
| { |
| mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x - 8)); |
| mask_16x8_0 = |
| vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0 |
| |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_1 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1); |
| |
| ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2; |
| ref_mb_type_16x8_low_1 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3); |
| } |
| mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8); |
| mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8); |
| |
| mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8); |
| mb_type_mask_16x8_low_1 = vceqq_s16(ref_mb_type_16x8_low_1, one_16x8); |
| } |
| } |
| |
| if(i4_mb_quard1_part_y < i4_refarray_ht - 1) |
| { |
| for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++) |
| { |
| pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride); |
| inp_data_16x8_0 = vld1q_s16((pi2_ref_data_byte)); |
| inp_data_16x8_1 = vld1q_s16((pi2_ref_data_byte + 8)); |
| if(i4_y < i4_mb_quard1_part_y) |
| { |
| res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8_0, zero_16x8); |
| res_16x8_1 = vbslq_s16(mb_type_mask_16x8_1, inp_data_16x8_1, zero_16x8); |
| } |
| else |
| { |
| res_16x8_0 = vbslq_s16(mb_type_mask_16x8_low_0, inp_data_16x8_0, zero_16x8); |
| res_16x8_1 = vbslq_s16(mb_type_mask_16x8_low_1, inp_data_16x8_1, zero_16x8); |
| } |
| pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd); |
| vst1q_s16((pi2_ref_array_temp), res_16x8_0); |
| vst1q_s16((pi2_ref_array_temp + 8), res_16x8_1); |
| } |
| } |
| else |
| { |
| for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++) |
| { |
| pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride); |
| inp_data_16x8_0 = vld1q_s16((pi2_ref_data_byte)); |
| inp_data_16x8_1 = vld1q_s16((pi2_ref_data_byte + 8)); |
| |
| res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8_0, zero_16x8); |
| res_16x8_1 = vbslq_s16(mb_type_mask_16x8_1, inp_data_16x8_1, zero_16x8); |
| |
| pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd); |
| vst1q_s16((pi2_ref_array_temp), res_16x8_0); |
| vst1q_s16((pi2_ref_array_temp + 8), res_16x8_1); |
| } |
| } |
| } |
| else |
| { |
| WORD16 index_0[8] = {0, 1, 2, 3, 4, 5, 6, 7}; |
| int16x8_t ref_mb_type_16x8_q0, ref_mb_type_16x8_q1, ref_mb_type_16x8_q2, |
| ref_mb_type_16x8_q3, mb_quard1_part_x_16x8; |
| int16x8_t ref_mb_type_16x8_0; |
| int16x8_t ref_mb_type_16x8_low_0; |
| uint16x8_t mb_type_mask_16x8_0; |
| uint16x8_t mb_type_mask_16x8_low_0; |
| uint16x8_t mask_16x8_0; |
| |
| int16x8_t index_arr_0; |
| int16x8x2_t inp_data_16x8x2; |
| int16x8_t inp_data_16x8; |
| int16x8_t res_16x8_0; |
| int16x8_t one_16x8 = vdupq_n_s16(1); |
| int16x8_t zero_16x8 = vdupq_n_s16(0); |
| index_arr_0 = vld1q_s16(&index_0[0]); |
| |
| ref_mb_type_16x8_q0 = vdupq_n_s16(i4_ref_mb_type_q0); |
| ref_mb_type_16x8_q1 = vdupq_n_s16(i4_ref_mb_type_q1); |
| ref_mb_type_16x8_q2 = vdupq_n_s16(i4_ref_mb_type_q2); |
| ref_mb_type_16x8_q3 = vdupq_n_s16(i4_ref_mb_type_q3); |
| if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht)) |
| { |
// Quadrant 0
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8); |
| } |
| else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) && |
| (i4_mb_quard1_part_x < i4_refarray_wd)) |
| { |
// Quadrants 0 & 1
| mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x)); |
| mask_16x8_0 = vcltq_s16(index_arr_0, |
| mb_quard1_part_x_16x8); // return 1 if a<b, else 0 |
| |
| ref_mb_type_16x8_0 = vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1); |
| mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8); |
| } |
| else |
| { |
| if(i4_mb_quard1_part_x >= i4_refarray_wd) |
| { |
| ref_mb_type_16x8_0 = ref_mb_type_16x8_q0; |
| ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2; |
| } |
| else |
| { |
| mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x)); |
| mask_16x8_0 = |
| vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0 |
| |
| ref_mb_type_16x8_0 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1); |
| ref_mb_type_16x8_low_0 = |
| vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3); |
| |
| mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8); |
| mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8); |
| } |
| } |
| |
| if(i4_mb_quard1_part_y < i4_refarray_ht - 1) |
| { |
| for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++) |
| { |
| pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride); |
| inp_data_16x8x2 = vld2q_s16((pi2_ref_data_byte)); |
| inp_data_16x8 = inp_data_16x8x2.val[0]; |
| |
| if(i4_y < i4_mb_quard1_part_y) |
| { |
| res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8, zero_16x8); |
| } |
| else |
| { |
| res_16x8_0 = vbslq_s16(mb_type_mask_16x8_low_0, inp_data_16x8, zero_16x8); |
| } |
| pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd); |
| vst1q_s16((pi2_ref_array_temp), res_16x8_0); |
| } |
| } |
| else |
| { |
| for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++) |
| { |
| pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride); |
| inp_data_16x8x2 = vld2q_s16((pi2_ref_data_byte)); |
| inp_data_16x8 = inp_data_16x8x2.val[0]; |
| |
| res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8, zero_16x8); |
| |
| pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd); |
| vst1q_s16((pi2_ref_array_temp), res_16x8_0); |
| } |
| } |
| } |
| } |