| /****************************************************************************** |
| * |
| * Copyright (C) 2022 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| /** |
| ******************************************************************************* |
| * @file |
| * isvcd_intra_resamp_neonintr.c |
| * |
| * @brief |
| * Contains routines that resample for SVC resampling |
| * |
| * @author |
| * Kishore |
| * |
| * @par List of Functions: |
| * - isvcd_interpolate_base_luma_dyadic_neonintr() |
| * - isvcd_interpolate_intra_base_neonintr() |
| * - isvcd_horz_interpol_chroma_dyadic_1_neonintr() |
| * - isvcd_horz_interpol_chroma_dyadic_2_neonintr() |
| * - isvcd_vert_interpol_chroma_dyadic_1_neonintr() |
| * - isvcd_vert_interpol_chroma_dyadic_2_neonintr() |
| * - isvcd_vert_interpol_chroma_dyadic_3_neonintr() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| #include <assert.h> |
| #include <string.h> |
| #include <arm_neon.h> |
| |
| #include "ih264_typedefs.h" |
| #include "ih264_macros.h" |
| #include "ih264_platform_macros.h" |
| #include "isvcd_structs.h" |
| #include "ih264_debug.h" |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_interpolate_base_luma_dyadic_neonintr */ |
| /* */ |
| /* Description : This function takes the reference array buffer & performs*/ |
| /* intra resampling for dyadic scaling ratios */ |
| /* Inputs : pu1_inp_buf : ptr to the 12x12 reference sample buffer */ |
| /* pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the */ |
| /* vertically interpolated data */ |
| /* pu1_out_buf : output buffer pointer */ |
| /* i4_out_stride : output buffer stride */ |
| /* Globals : none */ |
| /* Processing : it does the interpolation in vertical direction followed */ |
| /* by horizontal direction */ |
| /* Outputs : resampled pixels */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
/*         21 05 2021   Dolan          creation                              */
| /* */ |
| /*****************************************************************************/ |
void isvcd_interpolate_base_luma_dyadic_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                 UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD16 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    /* Registers for the vertical (first) filtering pass */
    int16x4_t i4_rslt_vert_16x4_1, i4_rslt_vert_16x4_2;
    uint8x8_t i4_samp_vert_8x8_0, i4_samp_vert_8x8_1, i4_samp_vert_8x8_2, i4_samp_vert_8x8_3;
    int16x8_t i4_rslt_vert_16x8_0, i4_rslt_vert_16x8_2;
    /* Horizontal interpolation */
    /* 512 = rounding offset for the final (vert * horz) >> 10 normalization below */
    int32x4_t const_512_32x4 = vdupq_n_s32(512);
    int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
    uint16x4_t i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp, i4_rslt_horz_r0_2_tmp,
        i4_rslt_horz_r1_2_tmp;
    uint16x8_t rslt_16x8_t_1, rslt_16x8_t_2;
    int32x4x2_t i4_rslt_horz_32x4x2_t;
    int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1, i4_samp_horz_16x4_2, i4_samp_horz_16x4_3,
        i4_samp_horz_16x4_4;
    int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8;
    /* Same 4-tap filter coefficients as i4_coeff_0..3, duplicated as int16_t for the
     * vmull_n_s16/vmlal_n_s16 horizontal pass */
    int16_t i4_coeff_c0 = -3;
    int16_t i4_coeff_c1 = 28;
    int16_t i4_coeff_c2 = 8;
    int16_t i4_coeff_c3 = -1;
    int32x4_t i4_rslt_horz_r0_1_tmp32, i4_rslt_horz_r1_1_tmp32, i4_rslt_horz_r0_2_tmp32,
        i4_rslt_horz_r1_2_tmp32;

    /* Filter coefficient values for phase 4 */
    /* The phase-12 variant of the filter is the same taps applied in reverse order
     * (c3, c2, c1, c0) — see the alternating accumulations below. */
    i4_coeff_0 = -3;
    i4_coeff_1 = 28;
    i4_coeff_2 = 8;
    i4_coeff_3 = -1;
    i4_filt_stride = 12; /* row stride of the 12x16 intermediate buffer */
    i4_src_stride = DYADIC_REF_W_Y; /* row stride of the reference sample buffer */

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First pass covers columns 0..7 of the 12-wide reference array (8 lanes); the
     * remaining 4 columns are handled by the 64-bit-load/4-lane pass further down. */
    // First 64 bits
    /* Prime the 4-row sliding filter window */
    i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;

    /* Output row 0: coefficients applied in reverse order (c3,c2,c1,c0) */
    i4_rslt_vert_16x8_0 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);

    vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_0);
    pi2_tmp += i4_filt_stride;

    /* Rows 1..14: slide the window down one source row per iteration and emit two
     * output rows — one with taps (c0,c1,c2,c3), one with taps (c3,c2,c1,c0). */
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);

        i4_rslt_vert_16x8_0 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);

        i4_rslt_vert_16x8_2 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
        i4_rslt_vert_16x8_2 = vmlaq_n_s16(
            i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
        i4_rslt_vert_16x8_2 = vmlaq_n_s16(
            i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
        i4_rslt_vert_16x8_2 = vmlaq_n_s16(
            i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);

        /* Storing the results */
        vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
        pi2_tmp += i4_filt_stride;
        vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_2));
        pi2_tmp += i4_filt_stride;
        pu1_inp += i4_src_stride;
    } /*End of Loop over y*/

    /* y = 15, y_phase = 4 */
    i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
    i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
    i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);

    i4_rslt_vert_16x8_0 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);

    vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
    /* End of loop over x */

    /* Second vertical pass: columns 8..11 of the reference array. Only the low 4
     * lanes of each 8-byte load are used (vget_low_u16 below). */
    // Remaining 32 bits
    pu1_inp = pu1_inp_buf + 8;
    pi2_tmp = pi2_tmp_filt_buf + 8;

    i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;

    /* Output row 0, taps (c3,c2,c1,c0) — mirrors the 8-lane pass above */
    i4_rslt_vert_16x4_1 =
        vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0);

    vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
    pi2_tmp += i4_filt_stride;

    /* Rows 1..14: same sliding-window / two-rows-per-iteration scheme as above */
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);

        i4_rslt_vert_16x4_1 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_1);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_2);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_3);

        i4_rslt_vert_16x4_2 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
        i4_rslt_vert_16x4_2 = vmla_n_s16(
            i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_2);
        i4_rslt_vert_16x4_2 = vmla_n_s16(
            i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_1);
        i4_rslt_vert_16x4_2 = vmla_n_s16(
            i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_0);

        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
        pi2_tmp += i4_filt_stride;
        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_2));
        pi2_tmp += i4_filt_stride;
        pu1_inp += i4_src_stride;
    }

    /* Last output row (y = 15), taps (c0,c1,c2,c3) */
    i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
    i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
    i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);

    i4_rslt_vert_16x4_1 =
        vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3);

    vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
    /* Reinitializing the ptrs */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

    /* Horizontal interpolation */
    /* For each of the 16 intermediate rows, 12 WORD16 samples are expanded to 16
     * output pixels: the r0_* accumulators hold the even-index outputs, the r1_*
     * accumulators the odd-index outputs; vzipq_s32 interleaves them back into
     * pixel order before rounding. */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        /* Nine overlapping 4-sample windows starting at offsets 0..8 */
        i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp);
        i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1);
        i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2);
        i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3);
        i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4);
        i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5);
        i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6);
        i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7);
        i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8);

        /* Even outputs 0,2,4,6: taps applied in reverse order (c3,c2,c1,c0) */
        i4_rslt_horz_r0_1 = vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_c3); // a0c3 a1c3 a2c3 a3c3
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1,
                                        i4_coeff_c2); // += a1c2 a2c2 a3c2 a4c2
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_c1);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_c0);

        /* Odd outputs 1,3,5,7: taps in forward order (c0,c1,c2,c3) */
        i4_rslt_horz_r1_1 = vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_c0); // a1c0 a2c0 a3c0 a4c0
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2,
                                        i4_coeff_c1); // += a2c1 a3c1 a4c1 a5c1
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_c2);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_c3);

        /* Even outputs 8,10,12,14 */
        i4_rslt_horz_r0_2 = vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_c3); // a4c3 a5c3 a6c3 a7c3
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5,
                                        i4_coeff_c2); // += a5c2 a6c2 a7c2 a8c2
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_c1);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_c0);

        /* Odd outputs 9,11,13,15 */
        i4_rslt_horz_r1_2 = vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_c0); // a5c0 a6c0 a7c0 a8c0
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6,
                                        i4_coeff_c1); // += a6c1 a7c1 a8c1 a9c1
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_c2);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_c3);

        /* Interleave even/odd accumulators back into output-pixel order */
        i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1);
        i4_rslt_horz_r0_1_tmp32 = i4_rslt_horz_32x4x2_t.val[0]; // 0 to 3
        i4_rslt_horz_r1_1_tmp32 = i4_rslt_horz_32x4x2_t.val[1]; // 4 to 7

        i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2);
        i4_rslt_horz_r0_2_tmp32 = i4_rslt_horz_32x4x2_t.val[0]; // 8 to 11
        i4_rslt_horz_r1_2_tmp32 = i4_rslt_horz_32x4x2_t.val[1]; // 12 to 15

        /* Round (+512) and normalize (>> 10), with unsigned saturation to 16 bits */
        i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_r0_1_tmp32, const_512_32x4);
        i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_r1_1_tmp32, const_512_32x4);
        i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_r0_2_tmp32, const_512_32x4);
        i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_r1_2_tmp32, const_512_32x4);

        i4_rslt_horz_r0_1_tmp = vqshrun_n_s32(i4_rslt_horz_r0_1, 10);
        i4_rslt_horz_r1_1_tmp = vqshrun_n_s32(i4_rslt_horz_r1_1, 10);

        i4_rslt_horz_r0_2_tmp = vqshrun_n_s32(i4_rslt_horz_r0_2, 10);
        i4_rslt_horz_r1_2_tmp = vqshrun_n_s32(i4_rslt_horz_r1_2, 10);

        rslt_16x8_t_1 = vcombine_u16(i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp); // 0 to 7
        rslt_16x8_t_2 = vcombine_u16(i4_rslt_horz_r0_2_tmp, i4_rslt_horz_r1_2_tmp); // 8 to 15

        /* Saturating narrow to 8-bit pixels and store the 16-pixel output row */
        vst1_u8(pu1_out, vqmovn_u16(rslt_16x8_t_1));
        vst1_u8(pu1_out + 8, vqmovn_u16(rslt_16x8_t_2));

        pu1_out += i4_out_stride;
        pi2_tmp += i4_filt_stride;
    }
}
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_interpolate_intra_base_neonintr */ |
| /* */ |
| /* Description : This function takes the reference array buffer & performs*/ |
| /* interpolation of a component to find the intra */ |
| /* resampled value */ |
| /* Inputs : pv_intra_samp_ctxt : intra sampling context */ |
| /* pu1_out : output buffer pointer */ |
| /* i4_out_stride : output buffer stride */ |
| /* i4_refarray_wd : reference array width */ |
| /* i4_x_offset : offset in reference layer in horz direction*/ |
| /* ps_coord : current mb co-ordinate */ |
| /* i4_chroma_flag : chroma processing flag */ |
| /* Globals : none */ |
| /* Processing : it does the interpolation in vertical direction followed */ |
| /* by horizontal direction */ |
| /* Outputs : resampled pixels */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 26 06 2009 vijayakumar creation */ |
| /* */ |
| /*****************************************************************************/ |
| void isvcd_interpolate_intra_base_neonintr(void *pv_intra_samp_ctxt, UWORD8 *pu1_out, |
| WORD32 i4_out_stride, WORD32 i4_refarray_wd, |
| WORD32 i4_mb_x, WORD32 i4_mb_y, WORD32 i4_chroma_flag, |
| WORD32 i4_refarray_flag) |
| { |
| /* --------------------------------------------------------------------- */ |
| /* Index Parameters */ |
| /* --------------------------------------------------------------------- */ |
| intra_sampling_ctxt_t *ps_ctxt; |
| intra_samp_map_ctxt_t *ps_map_ctxt; |
| intra_samp_lyr_ctxt *ps_lyr_ctxt; |
| WORD32 i4_x, i4_y; |
| WORD32 i4_frm_mb_x, i4_frm_mb_y; |
| UWORD8 *pu1_refarray = NULL; |
| ref_pixel_map_t *ps_x_pos_phase; |
| ref_pixel_map_t *ps_y_pos_phase; |
| WORD32 i4_temp_array_ht; |
| WORD32 *pi4_interp_buff; |
| |
| UWORD8 arr_y_ref_pos_luma[16] = {0}; |
| UWORD8 arr_x_ref_pos_luma[16] = {0}; |
| UWORD8 arr_x_ref_pos_luma_low[16] = {0}; |
| UWORD8 arr_x_ref_pos_luma_high[16] = {0}; |
| UWORD8 arr_phase_luma[16] = {0}; |
| UWORD8 *pi4_y_ref_pos_luma; |
| UWORD8 *pi4_x_ref_pos_luma_low; |
| UWORD8 *pi4_x_ref_pos_luma_high; |
| UWORD8 *pi4_phase_luma; |
| WORD16 *pi2_interp_buff_temp; |
| WORD32 i4_mb_wd; |
| WORD32 i4_mb_ht; |
| WORD32 i4_x_min; |
| ref_min_max_map_t *ps_x_min_max; |
| UWORD8 *pu1_refarray_temp; |
| |
| ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt; |
| ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id]; |
| |
| if(0 == i4_refarray_flag) |
| { |
| pu1_refarray = ps_ctxt->pu1_refarray_buffer; |
| } |
| else if(1 == i4_refarray_flag) |
| { |
| pu1_refarray = ps_ctxt->pu1_refarray_cb; |
| } |
| |
| /* --------------------------------------------------------------------- */ |
| /* LUMA or CHROMA */ |
| /* --------------------------------------------------------------------- */ |
| |
| if(1 == i4_chroma_flag) |
| ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt); |
| else |
| ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt); |
| |
| i4_mb_wd = MB_WIDTH >> i4_chroma_flag; |
| i4_mb_ht = MB_HEIGHT >> i4_chroma_flag; |
| |
| ps_x_min_max = ps_map_ctxt->ps_x_min_max; |
| |
| i4_frm_mb_y = i4_mb_y * i4_mb_ht; |
| i4_frm_mb_x = i4_mb_x * i4_mb_wd; |
| /* get the min and max positions */ |
| i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos; |
| |
| /* --------------------------------------------------------------------- */ |
| /* Projected frame level pointers */ |
| /* --------------------------------------------------------------------- */ |
| ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase; |
| ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase; |
| |
| /* --------------------------------------------------------------------- */ |
| /* Pointers and Dimenstion of the temporary buffer */ |
| /* --------------------------------------------------------------------- */ |
| i4_temp_array_ht = i4_mb_ht; |
| pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer; |
| pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff; |
| |
| /* --------------------------------------------------------------------- */ |
| /* Loop for interpolation in vertical direction */ |
| /* --------------------------------------------------------------------- */ |
| if(i4_chroma_flag == 0) |
| { |
| { |
| uint8x8_t inp_8x8_r0, inp_8x8_r0_1; |
| uint8x8_t inp_8x8_r1, inp_8x8_r1_1; |
| uint8x8_t inp_8x8_r2, inp_8x8_r2_1; |
| uint8x8_t inp_8x8_r3, inp_8x8_r3_1; |
| int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1; |
| |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase; |
| arr_y_ref_pos_luma[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos); |
| } |
| pi4_y_ref_pos_luma = arr_y_ref_pos_luma; |
| pi4_phase_luma = arr_phase_luma; |
| |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| pu1_refarray_temp = |
| pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1); |
| inp_8x8_r0 = vld1_u8((pu1_refarray_temp - i4_refarray_wd)); |
| inp_8x8_r1 = vld1_u8((pu1_refarray_temp)); |
| inp_8x8_r2 = vld1_u8((pu1_refarray_temp + i4_refarray_wd)); |
| inp_8x8_r3 = vld1_u8((pu1_refarray_temp + 2 * i4_refarray_wd)); |
| |
| inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8 - i4_refarray_wd)); |
| inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8)); |
| inp_8x8_r2_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd)); |
| inp_8x8_r3_1 = vld1_u8((pu1_refarray_temp + 8 + 2 * i4_refarray_wd)); |
| |
| out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]); |
| out_res_16x8_r0_0 = |
| vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]); |
| out_res_16x8_r0_0 = |
| vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]); |
| out_res_16x8_r0_0 = |
| vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]); |
| |
| out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]); |
| out_res_16x8_r0_1 = |
| vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]); |
| out_res_16x8_r0_1 = |
| vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2_1)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]); |
| out_res_16x8_r0_1 = |
| vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3_1)), |
| g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]); |
| |
| vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)), |
| out_res_16x8_r0_0); |
| vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8), |
| out_res_16x8_r0_1); |
| } |
| } |
| /*Horizontal Interpolation*/ |
| { |
| WORD32 strt_indx = 10; |
| |
| uint8x16_t phs_mask_8x8_0; |
| uint8x16_t x_ref_pos_luma_mask_r0_0; |
| uint8x16_t x_ref_pos_luma_mask_r0_1; |
| uint8x16_t x_ref_pos_luma_mask_r1_0; |
| uint8x16_t x_ref_pos_luma_mask_r1_1; |
| uint8x16_t x_ref_pos_luma_mask_r2_0; |
| uint8x16_t x_ref_pos_luma_mask_r2_1; |
| uint8x16_t x_ref_pos_luma_mask_r3_0; |
| uint8x16_t x_ref_pos_luma_mask_r3_1; |
| |
| WORD32 strt_indx_h = 0, i4_x2 = 0; |
| WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1); |
| uint8x16_t twos = vdupq_n_u8(2); |
| strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1; |
| strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1); |
| for(i4_x = 0; i4_x < i4_mb_wd; i4_x++) |
| { |
| arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos; |
| arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase; |
| arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1; |
| } |
| |
| for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++) |
| { |
| i4_x2 = i4_x << 1; |
| arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1; |
| arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1; |
| } |
| for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++) |
| { |
| i4_x2 = (i4_x - i4_mb_wd_hlf) << 1; |
| arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1); |
| arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1; |
| } |
| pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low; |
| pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high; |
| pi4_phase_luma = arr_phase_luma; |
| |
| phs_mask_8x8_0 = vld1q_u8((const uint8_t *) pi4_phase_luma); |
| |
| x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low); |
| x_ref_pos_luma_mask_r0_1 = vld1q_u8(pi4_x_ref_pos_luma_high); |
| x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos); |
| x_ref_pos_luma_mask_r1_1 = vaddq_u8(x_ref_pos_luma_mask_r0_1, twos); |
| x_ref_pos_luma_mask_r2_0 = vaddq_u8(x_ref_pos_luma_mask_r1_0, twos); |
| x_ref_pos_luma_mask_r2_1 = vaddq_u8(x_ref_pos_luma_mask_r1_1, twos); |
| x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0; |
| x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1; |
| |
| { |
| int8x16_t ip_filt_8x16_r0; |
| int8x16_t ip_filt_8x16_r1; |
| int8x16_t ip_filt_8x16_r2; |
| int8x16_t ip_filt_8x16_r3; |
| |
| int16x8_t ip_filt_16x8_r0_0, ip_filt_16x8_r0_1; |
| int16x8_t ip_filt_16x8_r1_0, ip_filt_16x8_r1_1; |
| int16x8_t ip_filt_16x8_r2_0, ip_filt_16x8_r2_1; |
| int16x8_t ip_filt_16x8_r3_0, ip_filt_16x8_r3_1; |
| |
| int16x8_t inp_16x8_0; |
| int16x8_t inp_16x8_1; |
| int16x8_t inp_16x8_2; |
| int16x8_t inp_16x8_3; |
| |
| int16x8_t inp_16x8_r0_0, inp_16x8_r2_0; |
| int16x8_t inp_16x8_r0_1, inp_16x8_r2_1; |
| int16x8_t inp_16x8_r1_0, inp_16x8_r3_0; |
| int16x8_t inp_16x8_r1_1, inp_16x8_r3_1; |
| |
| int16x4_t inp_16x4_r0_0, inp_16x4_r2_0; |
| int16x4_t inp_16x4_r0_1, inp_16x4_r2_1; |
| int16x4_t inp_16x4_r1_0, inp_16x4_r3_0; |
| int16x4_t inp_16x4_r1_1, inp_16x4_r3_1; |
| |
| int32x4_t out_res_32x4_r0_l_0; |
| int32x4_t out_res_32x4_r0_l_1; |
| int32x4_t out_res_32x4_r0_h_0; |
| int32x4_t out_res_32x4_r0_h_1; |
| |
| uint16x4_t out_res_16x4_r0_l_0; |
| uint16x4_t out_res_16x4_r0_l_1; |
| uint16x4_t out_res_16x4_r0_h_0; |
| uint16x4_t out_res_16x4_r0_h_1; |
| |
| uint8x8_t out_res_8x8_r0_l, out_res_8x8_r0_h; |
| uint8x8x2_t u1_temp_8x8x2_t; |
| uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1; |
| |
| ip_filt_8x16_r0 = vld1q_s8((g_ai1_interp_filter_luma)); |
| ip_filt_8x16_r1 = vld1q_s8((g_ai1_interp_filter_luma + 16)); |
| ip_filt_8x16_r2 = vld1q_s8((g_ai1_interp_filter_luma + 32)); |
| ip_filt_8x16_r3 = vld1q_s8((g_ai1_interp_filter_luma + 48)); |
| |
| u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r0)); |
| u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r0)); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); |
| ip_filt_8x16_r0 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), |
| vreinterpret_s8_u8(u1_temp_8x8_t1)); |
| |
| u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r1)); |
| u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r1)); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); |
| ip_filt_8x16_r1 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), |
| vreinterpret_s8_u8(u1_temp_8x8_t1)); |
| u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r2)); |
| u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r2)); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); |
| ip_filt_8x16_r2 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), |
| vreinterpret_s8_u8(u1_temp_8x8_t1)); |
| u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r3)); |
| u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r3)); |
| u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); |
| u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); |
| ip_filt_8x16_r3 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), |
| vreinterpret_s8_u8(u1_temp_8x8_t1)); |
| ip_filt_16x8_r0_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r0)); |
| ip_filt_16x8_r1_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r1)); |
| ip_filt_16x8_r2_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r2)); |
| ip_filt_16x8_r3_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r3)); |
| ip_filt_16x8_r0_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r0)); |
| ip_filt_16x8_r1_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r1)); |
| ip_filt_16x8_r2_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r2)); |
| ip_filt_16x8_r3_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r3)); |
| |
| for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++) |
| { |
| inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx)); |
| inp_16x8_1 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h)); |
| inp_16x8_2 = vld1q_s16((pi2_interp_buff_temp + strt_indx + 3)); |
| inp_16x8_3 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h + 3)); |
| pi2_interp_buff_temp += i4_refarray_wd; |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0)); |
| inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_1)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_1)); |
| inp_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0)); |
| inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_1)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_1)); |
| inp_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_0)); |
| inp_16x8_r2_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_1)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_1)); |
| inp_16x8_r2_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_2))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_2))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_0)); |
| inp_16x8_r3_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_3))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_3))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_1)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_1)); |
| inp_16x8_r3_1 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0); |
| inp_16x4_r0_1 = vget_low_s16(inp_16x8_r0_1); |
| inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0); |
| inp_16x4_r1_1 = vget_low_s16(inp_16x8_r1_1); |
| |
| inp_16x4_r2_0 = vget_low_s16(inp_16x8_r2_0); |
| inp_16x4_r2_1 = vget_low_s16(inp_16x8_r2_1); |
| inp_16x4_r3_0 = vget_low_s16(inp_16x8_r3_0); |
| inp_16x4_r3_1 = vget_low_s16(inp_16x8_r3_1); |
| |
| out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0)); |
| out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0, |
| vget_low_s16(ip_filt_16x8_r1_0)); |
| out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r2_0, |
| vget_low_s16(ip_filt_16x8_r2_0)); |
| out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r3_0, |
| vget_low_s16(ip_filt_16x8_r3_0)); |
| out_res_32x4_r0_l_1 = |
| vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0)); |
| out_res_32x4_r0_l_1 = |
| vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0), |
| vget_high_s16(ip_filt_16x8_r1_0)); |
| out_res_32x4_r0_l_1 = |
| vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r2_0), |
| vget_high_s16(ip_filt_16x8_r2_0)); |
| out_res_32x4_r0_l_1 = |
| vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r3_0), |
| vget_high_s16(ip_filt_16x8_r3_0)); |
| |
| out_res_32x4_r0_h_0 = vmull_s16(inp_16x4_r0_1, vget_low_s16(ip_filt_16x8_r0_1)); |
| out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r1_1, |
| vget_low_s16(ip_filt_16x8_r1_1)); |
| out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r2_1, |
| vget_low_s16(ip_filt_16x8_r2_1)); |
| out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r3_1, |
| vget_low_s16(ip_filt_16x8_r3_1)); |
| |
| out_res_32x4_r0_h_1 = |
| vmull_s16(vget_high_s16(inp_16x8_r0_1), vget_high_s16(ip_filt_16x8_r0_1)); |
| out_res_32x4_r0_h_1 = |
| vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r1_1), |
| vget_high_s16(ip_filt_16x8_r1_1)); |
| out_res_32x4_r0_h_1 = |
| vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r2_1), |
| vget_high_s16(ip_filt_16x8_r2_1)); |
| out_res_32x4_r0_h_1 = |
| vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r3_1), |
| vget_high_s16(ip_filt_16x8_r3_1)); |
| |
| out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10); |
| out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10); |
| out_res_16x4_r0_h_0 = vqrshrun_n_s32(out_res_32x4_r0_h_0, 10); |
| out_res_16x4_r0_h_1 = vqrshrun_n_s32(out_res_32x4_r0_h_1, 10); |
| |
| out_res_8x8_r0_l = |
| vqmovn_u16(vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1)); |
| out_res_8x8_r0_h = |
| vqmovn_u16(vcombine_u16(out_res_16x4_r0_h_0, out_res_16x4_r0_h_1)); |
| vst1q_u8((pu1_out + (i4_y * i4_out_stride)), |
| vcombine_u8(out_res_8x8_r0_l, out_res_8x8_r0_h)); |
| } |
| } |
| } |
| } |
| else |
| { |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| arr_y_ref_pos_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos; |
| arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase; |
| } |
| pi4_y_ref_pos_luma = arr_y_ref_pos_luma; |
| pi4_phase_luma = arr_phase_luma; |
| |
| { |
| uint8x8_t inp_8x8_r0, inp_8x8_r0_1; |
| uint8x8_t inp_8x8_r1, inp_8x8_r1_1; |
| int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1; |
| |
| for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) |
| { |
| pu1_refarray_temp = |
| pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1); |
| inp_8x8_r0 = vld1_u8((pu1_refarray_temp)); |
| inp_8x8_r1 = vld1_u8((pu1_refarray_temp + i4_refarray_wd)); |
| |
| inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8)); |
| inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd)); |
| |
| out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)), |
| g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]); |
| out_res_16x8_r0_0 = |
| vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)), |
| g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]); |
| |
| out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)), |
| g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]); |
| out_res_16x8_r0_1 = |
| vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)), |
| g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]); |
| |
| vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)), |
| out_res_16x8_r0_0); |
| vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8), |
| out_res_16x8_r0_1); |
| } |
| } |
| |
| { |
| WORD32 strt_indx = 10; |
| |
| uint8x16_t phs_mask_8x8_0; |
| uint8x16_t x_ref_pos_luma_mask_r0_0; |
| uint8x16_t x_ref_pos_luma_mask_r1_0; |
| |
| WORD32 i4_x2 = 0; |
| uint8x16_t twos = vdupq_n_u8(2); |
| strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos; |
| |
| for(i4_x = 0; i4_x < i4_mb_wd; i4_x++) |
| { |
| arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos; |
| arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase; |
| arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx; |
| i4_x2 = i4_x << 1; |
| arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1; |
| arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1; |
| } |
| |
| pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low; |
| pi4_phase_luma = arr_phase_luma; |
| |
| phs_mask_8x8_0 = vld1q_u8(pi4_phase_luma); |
| x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low); |
| x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos); |
| |
| { |
| uint8x16_t ip_filt_8x16_r0; |
| uint8x16_t ip_filt_8x16_r1; |
| int16x8_t ip_filt_16x8_r0_0; |
| int16x8_t ip_filt_16x8_r1_0; |
| int16x8_t inp_16x8_0; |
| int16x8_t inp_16x8_r0_0; |
| int16x8_t inp_16x8_r1_0; |
| int16x4_t inp_16x4_r0_0; |
| int16x4_t inp_16x4_r1_0; |
| int32x4_t out_res_32x4_r0_l_0; |
| int32x4_t out_res_32x4_r0_l_1; |
| uint16x4_t out_res_16x4_r0_l_0; |
| uint16x4_t out_res_16x4_r0_l_1; |
| uint16x8_t out_res_16x8_r0_l; |
| uint8x16_t out_8x16_r0; |
| uint8x8x2_t u1_incr_8x8x2_t; |
| uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1; |
| uint8x8x2_t u1_temp_8x8x2_t; |
| uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1; |
| uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff)); |
| |
| ip_filt_8x16_r0 = vld1q_u8((g_au1_interp_filter_chroma)); |
| ip_filt_8x16_r1 = vld1q_u8((g_au1_interp_filter_chroma + 16)); |
| |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r0); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r0); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); |
| ip_filt_8x16_r0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| |
| u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r1); |
| u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r1); |
| u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); |
| u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); |
| ip_filt_8x16_r1 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); |
| |
| ip_filt_16x8_r0_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r0))); |
| ip_filt_16x8_r1_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r1))); |
| |
| for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++) |
| { |
| inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx)); |
| pi2_interp_buff_temp += i4_refarray_wd; |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0)); |
| inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| |
| u1_temp_8x8x2_t.val[0] = |
| vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8x2_t.val[1] = |
| vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); |
| u1_temp_8x8_t0 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0)); |
| u1_temp_8x8_t1 = |
| vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0)); |
| inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( |
| vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); |
| inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0); |
| inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0); |
| |
| out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0)); |
| out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0, |
| vget_low_s16(ip_filt_16x8_r1_0)); |
| out_res_32x4_r0_l_1 = |
| vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0)); |
| out_res_32x4_r0_l_1 = |
| vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0), |
| vget_high_s16(ip_filt_16x8_r1_0)); |
| |
| out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10); |
| out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10); |
| out_res_16x8_r0_l = vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1); |
| out_8x16_r0 = vld1q_u8(pu1_out + (i4_y * i4_out_stride)); |
| out_8x16_r0 = vbslq_u8(chroma_mask_8x16, |
| vreinterpretq_u8_u16(out_res_16x8_r0_l), out_8x16_r0); |
| vst1q_u8((pu1_out + (i4_y * i4_out_stride)), out_8x16_r0); |
| } |
| } |
| } |
| } |
| return; |
| } /* End of Interpolation Function */ |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_horz_interpol_chroma_dyadic_1_neonintr */ |
| /* */ |
| /* Description : This function takes the reference array buffer & performs*/ |
| /* interpolation of a component to find the intra */ |
| /* resampled value */ |
/*  Inputs        : pi2_tmp_filt_buf : ptr to the 6x8 buffer holding the     */
/*                      vertically interpolated data                         */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_phase_0 : x phase for even values of x                */
/*                  i4_phase_1 : x phase for odd values of x                 */
| /* Globals : none */ |
| /* Processing : it does the interpolation on horizontal direction */ |
| /* Outputs : resampled pixels */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 26 06 2009 vijayakumar creation */ |
| /* */ |
| /*****************************************************************************/ |
| |
void isvcd_horz_interpol_chroma_dyadic_1_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                                  WORD32 i4_out_stride, WORD32 i4_phase_0,
                                                  WORD32 i4_phase_1)
{
    WORD32 i4_y, i4_row;
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_dst_stride;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    int16x8_t i4_samp_16x8_0, i4_samp_16x8_1, i4_samp_16x8_2;
    int16x8_t i4_acc_even_16x8, i4_acc_odd_16x8;
    int16x8_t i4_res_16x8[2];
    uint8x16_t i4_dst_8x16[2];
    /* 0x00ff lanes select the even bytes of the interleaved destination;   */
    /* the alternate bytes (the other chroma plane) are preserved by vbslq  */
    uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    /* Two-tap filter weights: (8 - phase, phase) for even and odd x */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;
    i4_dst_stride = i4_out_stride;

    /* Horizontal interpolation: each iteration produces two output rows */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        for(i4_row = 0; i4_row < 2; i4_row++)
        {
            WORD16 *pi2_src = pi2_tmp + (i4_row * i4_filt_stride);

            i4_samp_16x8_0 = vld1q_s16(pi2_src);     /* a0 a1 a2 a3 a4 a5 a6 a7 */
            i4_samp_16x8_1 = vld1q_s16(pi2_src + 1); /* a1 a2 a3 a4 */
            i4_samp_16x8_2 = vld1q_s16(pi2_src + 2); /* a2 a3 a4 a5 */

            /* even x: c0*a[i] + c1*a[i+1];  odd x: c2*a[i+1] + c3*a[i+2] */
            i4_acc_even_16x8 = vmlaq_n_s16(vmulq_n_s16(i4_samp_16x8_0, i4_coeff_0),
                                           i4_samp_16x8_1, i4_coeff_1);
            i4_acc_odd_16x8 = vmlaq_n_s16(vmulq_n_s16(i4_samp_16x8_1, i4_coeff_2),
                                          i4_samp_16x8_2, i4_coeff_3);

            /* interleave even/odd phases, then round: (sum + 32) >> 6 */
            i4_res_16x8[i4_row] =
                vrshrq_n_s16(vzipq_s16(i4_acc_even_16x8, i4_acc_odd_16x8).val[0], 6);
        }

        /* load both destination rows before storing either (matches the     */
        /* reference implementation's memory-access ordering)                */
        i4_dst_8x16[0] = vld1q_u8(pu1_out);
        i4_dst_8x16[1] = vld1q_u8(pu1_out + i4_dst_stride);

        i4_dst_8x16[0] =
            vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(i4_res_16x8[0]), i4_dst_8x16[0]);
        i4_dst_8x16[1] =
            vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(i4_res_16x8[1]), i4_dst_8x16[1]);

        vst1q_u8(pu1_out, i4_dst_8x16[0]);
        vst1q_u8(pu1_out + i4_dst_stride, i4_dst_8x16[1]);

        /* advance by two rows */
        pi2_tmp += (i4_filt_stride << 1);
        pu1_out += (i4_dst_stride << 1);
    } /* End of loop over y */
}
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_horz_interpol_chroma_dyadic_2_neonintr */ |
| /* */ |
/*  Description   : This function takes the vertically interpolated data &   */
/*                  performs horizontal interpolation for dyadic scaling to  */
/*                  produce the resampled chroma output                      */
/*  Inputs        : pi2_tmp_filt_buf : ptr to the 6x8 buffer holding the     */
/*                      vertically interpolated data                         */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_phase_0 : x phase for even values of x                */
/*                  i4_phase_1 : x phase for odd values of x                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in horizontal direction        */
/*  Outputs       : horizontally resampled samples                           */
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 21 05 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
void isvcd_horz_interpol_chroma_dyadic_2_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                                  WORD32 i4_out_stride, WORD32 i4_phase_0,
                                                  WORD32 i4_phase_1)
{
    WORD32 i4_y, i4_row;
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_dst_stride;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    int16x8_t i4_samp_16x8_0, i4_samp_16x8_1;
    int16x8_t i4_acc_even_16x8, i4_acc_odd_16x8;
    int16x8_t i4_res_16x8[2];
    uint8x16_t i4_dst_8x16[2];
    /* 0x00ff lanes select the even bytes of the interleaved destination;   */
    /* the alternate bytes (the other chroma plane) are preserved by vbslq  */
    uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    /* Two-tap filter weights: (8 - phase, phase) for even and odd x */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf + 1; /* this phase variant starts one sample in */
    i4_filt_stride = 6;
    i4_dst_stride = i4_out_stride;

    /* Horizontal interpolation: each iteration produces two output rows */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        for(i4_row = 0; i4_row < 2; i4_row++)
        {
            WORD16 *pi2_src = pi2_tmp + (i4_row * i4_filt_stride);

            i4_samp_16x8_0 = vld1q_s16(pi2_src);     /* a0 a1 a2 a3 a4 a5 a6 a7 */
            i4_samp_16x8_1 = vld1q_s16(pi2_src + 1); /* a1 a2 a3 a4 */

            /* both phases read the same sample pair here:                 */
            /* even x: c0*a[i] + c1*a[i+1];  odd x: c2*a[i] + c3*a[i+1]    */
            i4_acc_even_16x8 = vmlaq_n_s16(vmulq_n_s16(i4_samp_16x8_0, i4_coeff_0),
                                           i4_samp_16x8_1, i4_coeff_1);
            i4_acc_odd_16x8 = vmlaq_n_s16(vmulq_n_s16(i4_samp_16x8_0, i4_coeff_2),
                                          i4_samp_16x8_1, i4_coeff_3);

            /* interleave even/odd phases, then round: (sum + 32) >> 6 */
            i4_res_16x8[i4_row] =
                vrshrq_n_s16(vzipq_s16(i4_acc_even_16x8, i4_acc_odd_16x8).val[0], 6);
        }

        /* load both destination rows before storing either (matches the     */
        /* reference implementation's memory-access ordering)                */
        i4_dst_8x16[0] = vld1q_u8(pu1_out);
        i4_dst_8x16[1] = vld1q_u8(pu1_out + i4_dst_stride);

        i4_dst_8x16[0] =
            vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(i4_res_16x8[0]), i4_dst_8x16[0]);
        i4_dst_8x16[1] =
            vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(i4_res_16x8[1]), i4_dst_8x16[1]);

        vst1q_u8(pu1_out, i4_dst_8x16[0]);
        vst1q_u8(pu1_out + i4_dst_stride, i4_dst_8x16[1]);

        /* advance by two rows */
        pi2_tmp += (i4_filt_stride << 1);
        pu1_out += (i4_dst_stride << 1);
    } /* End of loop over y */
}
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_vert_interpol_chroma_dyadic_1_neonintr */ |
| /* */ |
| /* Description : This function takes the reference array buffer & performs*/ |
| /* vertical intra resampling for dyadic scaling ratios for */ |
| /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ |
| /* chroma_phase_y_plus1: */ |
| /* ref_lyr cur_lyr */ |
| /* 2 0 */ |
| /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ |
| /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold */ |
| /* vertically interpolated data */ |
| /* i4_phase_0 : y phase for even values of y */ |
| /* i4_phase_1 : y phase for odd values of y */ |
| /* Globals : none */ |
| /* Processing : it does the interpolation in vertical direction */ |
| /* Outputs : vertically resampled samples */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 21 05 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
void isvcd_vert_interpol_chroma_dyadic_1_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_src_stride;
    WORD32 i4_j, i4_top, i4_tap_a, i4_tap_b;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;

    int16x8_t i4_samp_16x8[6]; /* the six reference rows, widened to 16 bit */
    int16x8_t i4_acc_16x8;

    /* Two-tap filter weights: (8 - phase, phase) for even and odd y */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_src_stride = DYADIC_REF_W_C;

    /* Widen each reference row once up front */
    for(i4_j = 0; i4_j < 6; i4_j++)
    {
        i4_samp_16x8[i4_j] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_inp)));
        pu1_inp += i4_src_stride;
    }

    /* Vertical interpolation: output row j blends source rows               */
    /* (j>>1, j>>1 + 1) with taps (c0, c1) for even j, and rows              */
    /* ((j+1)>>1, (j+1)>>1 + 1) with taps (c2, c3) for odd j                 */
    for(i4_j = 0; i4_j < 8; i4_j++)
    {
        if(i4_j & 1)
        {
            i4_top = (i4_j + 1) >> 1;
            i4_tap_a = i4_coeff_2;
            i4_tap_b = i4_coeff_3;
        }
        else
        {
            i4_top = i4_j >> 1;
            i4_tap_a = i4_coeff_0;
            i4_tap_b = i4_coeff_1;
        }

        i4_acc_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_tap_a);
        i4_acc_16x8 = vmlaq_n_s16(i4_acc_16x8, i4_samp_16x8[i4_top + 1], i4_tap_b);

        if(i4_j < 7)
        {
            /* full 8-lane store; lanes 6-7 spill into the next row of the   */
            /* stride-6 buffer and are overwritten by the following store    */
            vst1q_s16(pi2_tmp + (i4_j * 6), i4_acc_16x8);
        }
        else
        {
            /* last row: store only the 6 valid samples of the 6x8 buffer */
            vst1_s16(pi2_tmp + 42, vget_low_s16(i4_acc_16x8));
            vst1q_lane_s16(pi2_tmp + 46, i4_acc_16x8, 4);
            vst1q_lane_s16(pi2_tmp + 47, i4_acc_16x8, 5);
        }
    }
}
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_vert_interpol_chroma_dyadic_2_neonintr */ |
| /* */ |
| /* Description : This function takes the reference array buffer & performs*/ |
| /* vertical intra resampling for dyadic scaling ratios for */ |
| /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ |
| /* chroma_phase_y_plus1: */ |
| /* ref_lyr cur_lyr */ |
| /* 2 0 */ |
| /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ |
| /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */ |
| /* vertically interpolated data */ |
| /* i4_phase_0 : y phase for even values of y */ |
| /* i4_phase_1 : y phase for odd values of y */ |
| /* Globals : none */ |
| /* Processing : it does the interpolation in vertical direction */ |
| /* Outputs : vertically resampled samples */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 21 05 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
void isvcd_vert_interpol_chroma_dyadic_2_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_src_stride;
    WORD32 i4_j, i4_top, i4_tap_a, i4_tap_b;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;

    int16x8_t i4_samp_16x8[5]; /* the five reference rows used, widened */
    int16x8_t i4_acc_16x8;

    /* Two-tap filter weights: (8 - phase, phase) for even and odd y */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pi2_tmp = pi2_tmp_filt_buf;
    i4_src_stride = DYADIC_REF_W_C;
    pu1_inp = pu1_inp_buf + i4_src_stride; /* this variant skips the first row */

    /* Widen each reference row once up front */
    for(i4_j = 0; i4_j < 5; i4_j++)
    {
        i4_samp_16x8[i4_j] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_inp)));
        pu1_inp += i4_src_stride;
    }

    /* Vertical interpolation: output row j blends source rows               */
    /* (j>>1, j>>1 + 1); even j uses taps (c0, c1), odd j uses (c2, c3)      */
    for(i4_j = 0; i4_j < 8; i4_j++)
    {
        i4_top = i4_j >> 1;
        if(i4_j & 1)
        {
            i4_tap_a = i4_coeff_2;
            i4_tap_b = i4_coeff_3;
        }
        else
        {
            i4_tap_a = i4_coeff_0;
            i4_tap_b = i4_coeff_1;
        }

        i4_acc_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_tap_a);
        i4_acc_16x8 = vmlaq_n_s16(i4_acc_16x8, i4_samp_16x8[i4_top + 1], i4_tap_b);

        if(i4_j < 7)
        {
            /* full 8-lane store; lanes 6-7 spill into the next row of the   */
            /* stride-6 buffer and are overwritten by the following store    */
            vst1q_s16(pi2_tmp + (i4_j * 6), i4_acc_16x8);
        }
        else
        {
            /* last row: store only the 6 valid samples of the 6x8 buffer */
            vst1_s16(pi2_tmp + 42, vget_low_s16(i4_acc_16x8));
            vst1q_lane_s16(pi2_tmp + 46, i4_acc_16x8, 4);
            vst1q_lane_s16(pi2_tmp + 47, i4_acc_16x8, 5);
        }
    }
}
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : isvcd_vert_interpol_chroma_dyadic_3_neonintr */ |
| /* */ |
| /* Description : This function takes the reference array buffer & performs*/ |
| /* vertical intra resampling for dyadic scaling ratios for */ |
| /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ |
| /* chroma_phase_y_plus1: */ |
| /* ref_lyr cur_lyr */ |
| /* 2 0 */ |
| /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ |
| /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */ |
| /* vertically interpolated data */ |
| /* i4_phase_0 : y phase for even values of y */ |
| /* i4_phase_1 : y phase for odd values of y */ |
| /* Globals : none */ |
| /* Processing : it does the interpolation in vertical direction */ |
| /* Outputs : vertically resampled samples */ |
| /* Returns : none */ |
| /* */ |
| /* Issues : none */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 21 05 2021 Dolan creation */ |
| /* */ |
| /*****************************************************************************/ |
void isvcd_vert_interpol_chroma_dyadic_3_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_src_stride;
    WORD32 i4_y;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    int16x8_t i4_samp_vert_16x8[5];
    int16x8_t i4_rslt_vert_16x8;

    /* Filter taps: even output rows use (8 - phase_0, phase_0),
       odd output rows use (8 - phase_1, phase_1) */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pi2_tmp = pi2_tmp_filt_buf;
    i4_src_stride = DYADIC_REF_W_C;
    pu1_inp = pu1_inp_buf;

    /* Load the 5 reference rows and widen them to 16 bit once up front */
    for(i4_y = 0; i4_y < 5; i4_y++)
    {
        i4_samp_vert_16x8[i4_y] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_inp)));
        pu1_inp += i4_src_stride;
    }

    /* Vertical interpolation: output row y blends reference rows (y >> 1)
       and (y >> 1) + 1. Output rows are 6 elements apart, so successive
       8-lane stores overlap by 2 lanes; issuing the stores in ascending y
       order lets each row overwrite the spill lanes of the previous one,
       reproducing the unrolled version exactly. */
    for(i4_y = 0; i4_y < 8; i4_y++)
    {
        WORD32 i4_c_top = (i4_y & 1) ? i4_coeff_2 : i4_coeff_0;
        WORD32 i4_c_bot = (i4_y & 1) ? i4_coeff_3 : i4_coeff_1;
        int16x8_t i4_top_16x8 = i4_samp_vert_16x8[i4_y >> 1];
        int16x8_t i4_bot_16x8 = i4_samp_vert_16x8[(i4_y >> 1) + 1];

        i4_rslt_vert_16x8 = vmulq_n_s16(i4_top_16x8, i4_c_top);
        i4_rslt_vert_16x8 = vmlaq_n_s16(i4_rslt_vert_16x8, i4_bot_16x8, i4_c_bot);

        if(i4_y < 7)
        {
            vst1q_s16(pi2_tmp + (6 * i4_y), i4_rslt_vert_16x8);
        }
        else
        {
            /* Last row: store only 6 values (4-lane store + 2 lane stores)
               so nothing is written past pi2_tmp[47] — a full 8-lane store
               here would reach pi2_tmp[49] */
            vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8));
            vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8, 4);
            vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8, 5);
        }
    }
}