blob: a69d5f657fc1cd01128ca52eebe128a227352a61 [file] [log] [blame]
/*!
*************************************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Jin Bo <jinbo@loongson.cn>
*
* \copy
* Copyright (c) 2022, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* \file mc_horver_lsx.c
*
* \brief Loongson optimization
*
* \date 3/3/2022 Created
*
*************************************************************************************
*/
#include <stdint.h>
#include "loongson_intrinsics.h"
/* H.264 6-tap luma interpolation filter [1, -5, 20, 20, -5, 1] on 16-bit lanes:
 *   _out0 = (_in0 + _in5) - 5 * (_in1 + _in4) + 20 * (_in2 + _in3)
 * The multiplies are decomposed into shifts/adds: 5*x = (x<<2)+x and
 * 20*x = (x<<4)+(x<<2).
 * NOTE: _in0, _in1 and _in2 are clobbered (reused as accumulators). */
#define FILTER_INPUT_8BIT(_in0, _in1, _in2, _in3, \
                          _in4, _in5, _out0) \
do { \
  __m128i _tmp; \
  _in0 = __lsx_vadd_h(_in0, _in5); /* p0 + p5 */ \
  _in1 = __lsx_vadd_h(_in1, _in4); /* p1 + p4 */ \
  _in2 = __lsx_vadd_h(_in2, _in3); /* p2 + p3 */ \
  _tmp = __lsx_vslli_h(_in1, 2); \
  _in1 = __lsx_vadd_h(_tmp, _in1); /* 5 * (p1 + p4) */ \
  _in0 = __lsx_vsub_h(_in0, _in1); \
  _tmp = __lsx_vslli_h(_in2, 4); \
  _in0 = __lsx_vadd_h(_in0, _tmp); /* + 16 * (p2 + p3) */ \
  _tmp = __lsx_vslli_h(_in2, 2); \
  _out0 = __lsx_vadd_h(_in0, _tmp); /* + 4 * (p2 + p3) => 20x total */ \
}while(0)
/* Same 6-tap filter [1, -5, 20, 20, -5, 1] as FILTER_INPUT_8BIT, but on
 * 32-bit lanes (used for the second, horizontal pass where the inputs are
 * already 16-bit intermediate sums that may exceed the 16-bit range):
 *   _out0 = (_in0 + _in5) - 5 * (_in1 + _in4) + 20 * (_in2 + _in3)
 * Unlike FILTER_INPUT_8BIT, the inputs are left unmodified (local temps). */
#define HOR_FILTER_INPUT_16BIT(_in0, _in1, _in2, _in3, \
                               _in4, _in5, _out0) \
do { \
  __m128i _pi05, _pi14, _pi23, _temp; \
  _pi05 = __lsx_vadd_w(_in0, _in5); \
  _pi14 = __lsx_vadd_w(_in1, _in4); \
  _pi23 = __lsx_vadd_w(_in2, _in3); \
  _temp = __lsx_vslli_w(_pi14, 2); \
  _pi14 = __lsx_vadd_w(_temp, _pi14); /* 5 * (p1 + p4) */ \
  _pi05 = __lsx_vsub_w(_pi05, _pi14); \
  _temp = __lsx_vslli_w(_pi23, 4); \
  _pi05 = __lsx_vadd_w(_pi05, _temp); \
  _temp = __lsx_vslli_w(_pi23, 2); \
  _out0 = __lsx_vadd_w(_pi05, _temp); /* + 20 * (p2 + p3) */ \
}while(0)
/*!
 * \brief Rounded byte-wise average of two 4-pixel-wide blocks:
 *        for each of iHeight rows, pDst[0..3] = avg_round(pSrcA[0..3], pSrcB[0..3]).
 */
void PixelAvgWidthEq4_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
                          int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
                          int32_t iHeight ) {
  __m128i vecA, vecB;
  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
    vecA = __lsx_vldrepl_w(pSrcA, 0);  /* 4 row bytes from source A */
    vecB = __lsx_vldrepl_w(pSrcB, 0);  /* 4 row bytes from source B */
    vecA = __lsx_vavgr_bu(vecA, vecB); /* unsigned average with rounding */
    __lsx_vstelm_w(vecA, pDst, 0, 0);  /* store 4 result bytes */
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    pDst += iDstStride;
  }
}
/*!
 * \brief Rounded byte-wise average of two 8-pixel-wide blocks,
 *        processing two rows per iteration (iHeight is expected even).
 */
void PixelAvgWidthEq8_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
                          int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
                          int32_t iHeight ) {
  __m128i rowA0, rowB0, rowA1, rowB1, avg0, avg1;
  for (int32_t iRow = 0; iRow < iHeight; iRow += 2) {
    /* Row 0 of the pair. */
    rowA0 = __lsx_vldrepl_d(pSrcA, 0);
    rowB0 = __lsx_vldrepl_d(pSrcB, 0);
    avg0 = __lsx_vavgr_bu(rowA0, rowB0);
    /* Row 1 of the pair, addressed by stride offset. */
    rowA1 = __lsx_vldrepl_d(pSrcA + iSrcAStride, 0);
    rowB1 = __lsx_vldrepl_d(pSrcB + iSrcBStride, 0);
    avg1 = __lsx_vavgr_bu(rowA1, rowB1);
    /* Both loads are done before any store, matching the original order. */
    __lsx_vstelm_d(avg0, pDst, 0, 0);
    __lsx_vstelm_d(avg1, pDst + iDstStride, 0, 0);
    pSrcA += 2 * iSrcAStride;
    pSrcB += 2 * iSrcBStride;
    pDst += 2 * iDstStride;
  }
}
/*!
 * \brief Rounded byte-wise average of two 16-pixel-wide blocks,
 *        processing four full 16-byte rows per iteration
 *        (iHeight is expected to be a multiple of 4).
 */
void PixelAvgWidthEq16_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
                           int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
                           int32_t iHeight ) {
  __m128i avg0, avg1, avg2, avg3;
  for (int32_t iRow = 0; iRow < iHeight; iRow += 4) {
    /* Average four rows, addressing rows 1-3 by stride multiples. */
    avg0 = __lsx_vavgr_bu(__lsx_vld(pSrcA, 0),
                          __lsx_vld(pSrcB, 0));
    avg1 = __lsx_vavgr_bu(__lsx_vld(pSrcA + iSrcAStride, 0),
                          __lsx_vld(pSrcB + iSrcBStride, 0));
    avg2 = __lsx_vavgr_bu(__lsx_vld(pSrcA + 2 * iSrcAStride, 0),
                          __lsx_vld(pSrcB + 2 * iSrcBStride, 0));
    avg3 = __lsx_vavgr_bu(__lsx_vld(pSrcA + 3 * iSrcAStride, 0),
                          __lsx_vld(pSrcB + 3 * iSrcBStride, 0));
    /* All loads precede all stores, matching the original order. */
    __lsx_vst(avg0, pDst, 0);
    __lsx_vst(avg1, pDst + iDstStride, 0);
    __lsx_vst(avg2, pDst + 2 * iDstStride, 0);
    __lsx_vst(avg3, pDst + 3 * iDstStride, 0);
    pSrcA += 4 * iSrcAStride;
    pSrcB += 4 * iSrcBStride;
    pDst += 4 * iDstStride;
  }
}
/*!
 * \brief Vertical half-pel (0,2) luma interpolation, 8 pixels wide.
 *
 * For each output row, loads the six source rows y-2 .. y+3, zero-extends
 * the low 8 bytes of each to 16-bit lanes, applies the 6-tap filter,
 * rounds with a right shift by 5, clips to [0, 255] and stores 8 bytes.
 */
void McHorVer02WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc; /* drop const for the load intrinsics */
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    /* Rows y-2, y-1, y+1, y+2. */
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);         /* row y */
    src5 = __lsx_vldx(psrc, iStride3); /* row y+3 */
    /* Zero-extend low 8 bytes of each row to 16-bit lanes. */
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);     /* rounding right shift by 5 */
    src0 = __lsx_vclip255_h(src0);      /* clamp to [0, 255] */
    src0 = __lsx_vpickev_b(src0, src0); /* narrow 16-bit lanes to bytes */
    __lsx_vstelm_d(src0, pDst, 0, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
/*!
 * \brief Vertical half-pel (0,2) luma interpolation, 16 pixels wide.
 *
 * Same filter as McHorVer02WidthEq8_lsx but on a full 16-byte row: the low
 * 8 bytes are widened with vsllwil and the high 8 bytes with vexth; each
 * half is filtered, rounded (>>5) and clipped independently, then the two
 * halves are packed back into one 16-byte store.
 */
void McHorVer02WidthEq16_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc; /* drop const for the load intrinsics */
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  for (int i = 0; i < iHeight; i++) {
    /* Load the six rows y-2 .. y+3 that feed one output row. */
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    out0_l = __lsx_vsrari_h(out0_l, 5);
    out0_l = __lsx_vclip255_h(out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_h = __lsx_vsrari_h(out0_h, 5);
    out0_h = __lsx_vclip255_h(out0_h);
    /* Merge low/high 8-pixel halves into one 16-byte row. */
    out0_l = __lsx_vpickev_b(out0_h, out0_l);
    __lsx_vst(out0_l, pDst, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
/*!
 * \brief Horizontal half-pel (2,0) luma interpolation, 4 pixels wide.
 *
 * One 16-byte load starting at x-2 supplies all taps; vbsrl_v byte-shifts
 * build the five shifted copies so lane k of src0..src5 holds pixels
 * x-2+k .. x+3+k. After widening and the 6-tap filter, the result is
 * rounded (>>5), clipped and the low 4 bytes are stored.
 */
void McHorVer20WidthEq4_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc -2; /* back up 2 columns for the filter taps */
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    /* Byte-shifted copies: src_k starts at column x-2+k. */
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    /* Zero-extend low 8 bytes of each copy to 16-bit lanes. */
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_w(src0, pDst, 0, 0); /* write 4 pixels */
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
/*!
 * \brief Horizontal half-pel (2,0) luma interpolation, 5 pixels wide.
 *
 * Identical pipeline to McHorVer20WidthEq4_lsx, but stores 5 result bytes:
 * a 4-byte word plus the fifth byte via a separate element store.
 * Used as the odd tail of the width-9 case.
 */
void McHorVer20WidthEq5_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc -2; /* back up 2 columns for the filter taps */
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    /* Byte-shifted copies: src_k starts at column x-2+k. */
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_w(src0, pDst, 0, 0); /* pixels 0-3 */
    __lsx_vstelm_b(src0, pDst, 4, 4); /* pixel 4 */
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
/*!
 * \brief Horizontal half-pel (2,0) luma interpolation, 8 pixels wide.
 *
 * Identical pipeline to McHorVer20WidthEq4_lsx (single 16-byte load at
 * x-2, byte-shifted tap copies, widen, 6-tap filter, round >>5, clip,
 * narrow) but stores the full 8-byte result.
 */
void McHorVer20WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc -2; /* back up 2 columns for the filter taps */
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    /* Byte-shifted copies: src_k starts at column x-2+k. */
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_d(src0, pDst, 0, 0); /* write 8 pixels */
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
/*!
 * \brief Horizontal half-pel interpolation for width 9:
 *        columns 0-3 via the width-4 kernel, columns 4-8 via the width-5 kernel.
 */
void McHorVer20WidthEq9_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  McHorVer20WidthEq4_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer20WidthEq5_lsx(&pSrc[4], iSrcStride, &pDst[4], iDstStride, iHeight);
}
/*!
 * \brief Horizontal half-pel (2,0) luma interpolation, 16 pixels wide.
 *
 * A 16-pixel row needs 21 source bytes, more than one vector, so the six
 * tap positions are fetched with six unaligned loads at psrc+0 .. psrc+5
 * instead of byte-shifting a single load. Low/high 8-byte halves are
 * widened, filtered, rounded (>>5) and clipped separately, then packed
 * into one 16-byte store.
 */
void McHorVer20WidthEq16_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc - 2; /* back up 2 columns for the filter taps */
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2 ,tmp3 ,tmp4, tmp5, out0_l, out0_h;
  for (int i = 0; i < iHeight; i++) {
    /* Six overlapping loads: src_k starts at column x-2+k. */
    DUP4_ARG2(__lsx_vld,
              psrc, 0,
              psrc + 1, 0,
              psrc + 2, 0,
              psrc + 3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vld(psrc + 4, 0);
    src5 = __lsx_vld(psrc + 5, 0);
    //l part
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    out0_l = __lsx_vsrari_h(out0_l, 5);
    out0_l = __lsx_vclip255_h(out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_h = __lsx_vsrari_h(out0_h, 5);
    out0_h = __lsx_vclip255_h(out0_h);
    /* Merge low/high 8-pixel halves into one 16-byte row. */
    out0_l = __lsx_vpickev_b(out0_h, out0_l);
    __lsx_vst(out0_l, pDst, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
/*!
 * \brief Horizontal half-pel interpolation for width 17:
 *        columns 0-7 via the width-8 kernel, columns 8-16 via the width-9 kernel.
 */
void McHorVer20WidthEq17_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  McHorVer20WidthEq8_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer20WidthEq9_lsx(&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
}
/*!
 * \brief Center half-pel (2,2) luma interpolation, 8 pixels wide.
 *
 * Two-pass separable filter. Pass 1 (vertical): the 6-tap filter over rows
 * y-2..y+3 yields 16-bit intermediates for source columns x-2..x+13
 * (out0_l = columns 0-7 of the widened row, out0_h = columns 8-15), with
 * no rounding yet. Pass 2 (horizontal): eight overlapping windows of
 * consecutive columns are assembled (mask1/mask2 select halfword columns
 * 3..10 and 6..13 across the out0_h:out0_l pair) and transposed so each
 * output pixel's six taps sit in matching lanes; the 32-bit filter is then
 * applied, followed by a single rounding shift of 10 (5 bits per pass),
 * clipping and narrowing to 8 output bytes.
 */
void McHorVer22WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2; /* back up 2 columns for the horizontal taps */
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  v8i16 mask2 = {6, 7, 8, 9, 10, 11, 12 ,13};
  for (int i = 0; i < iHeight; i++) {
    /* Rows y-2, y-1, y+1, y+2; then y and y+3 below. */
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_h);
    /* Build eight windows of consecutive intermediate columns: dst_k
     * starts at column k (vbsrl_v shifts by whole halfwords; vshuf_h
     * bridges across the out0_l/out0_h boundary). */
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l); /* columns 3-10 */
    dst4 = __lsx_vbsrl_v(dst3, 2);
    dst5 = __lsx_vbsrl_v(dst3, 4);
    dst6 = __lsx_vshuf_h((__m128i)mask2, out0_h, out0_l); /* columns 6-13 */
    dst7 = __lsx_vbsrl_v(dst6, 2);
    /* After transposing, dst_k holds the k-th tap for all 8 output pixels. */
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    //l part
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    /* Combined rounding for both passes: 5 + 5 = 10 bits. */
    out0_l = __lsx_vsrari_w(out0_l, 10);
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_d(out0_l, pDst, 0, 0); /* write 8 pixels */
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}
/*!
 * \brief Center half-pel (2,2) luma interpolation, 4 pixels wide.
 *
 * Same two-pass scheme as McHorVer22WidthEq8_lsx (vertical 6-tap into
 * 16-bit intermediates, transpose, horizontal 6-tap on 32-bit lanes,
 * rounding >>10, clip), but only windows dst0-dst3 are needed for the
 * 4-pixel output and only 4 bytes are stored.
 *
 * NOTE(review): dst4-dst7 enter LSX_TRANSPOSE8x8_H uninitialized; the
 * lanes that reach the stored 4 bytes appear to derive only from
 * dst0-dst3, but reading indeterminate values is formally UB — confirm
 * this is intentional.
 */
static
void McHorVer22WidthEq4_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2; /* back up 2 columns for the horizontal taps */
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  for (int i = 0; i < iHeight; i++) {
    /* Rows y-2, y-1, y+1, y+2; then y and y+3 below. */
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_h);
    /* Windows of consecutive intermediate columns: dst_k starts at column k. */
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l); /* columns 3-10 */
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    //l part
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    /* Combined rounding for both passes: 5 + 5 = 10 bits. */
    out0_l = __lsx_vsrari_w(out0_l, 10);
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_w(out0_l, pDst, 0, 0); /* write 4 pixels */
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}
/*!
 * \brief Center half-pel (2,2) luma interpolation, 5 pixels wide.
 *
 * Same two-pass scheme as McHorVer22WidthEq8_lsx but with only the five
 * column windows dst0-dst4 built; stores 4 bytes plus a fifth byte via a
 * separate element store. Used as the odd tail of the width-9 case.
 *
 * NOTE(review): dst5-dst7 enter LSX_TRANSPOSE8x8_H uninitialized; the
 * lanes that reach the stored 5 bytes appear to derive only from
 * dst0-dst4, but reading indeterminate values is formally UB — confirm
 * this is intentional.
 */
void McHorVer22WidthEq5_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2; /* back up 2 columns for the horizontal taps */
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  for (int i = 0; i < iHeight; i++) {
    /* Rows y-2, y-1, y+1, y+2; then y and y+3 below. */
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_h);
    /* Windows of consecutive intermediate columns: dst_k starts at column k. */
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l); /* columns 3-10 */
    dst4 = __lsx_vbsrl_v(dst3, 2);
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    //l part
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    /* Combined rounding for both passes: 5 + 5 = 10 bits. */
    out0_l = __lsx_vsrari_w(out0_l, 10);
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_w(out0_l, pDst, 0, 0); /* pixels 0-3 */
    __lsx_vstelm_b(out0_l, pDst, 4, 4); /* pixel 4 */
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}
/*!
 * \brief Center half-pel interpolation for width 9:
 *        columns 0-3 via the width-4 kernel, columns 4-8 via the width-5 kernel.
 */
void McHorVer22WidthEq9_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  McHorVer22WidthEq4_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer22WidthEq5_lsx(&pSrc[4], iSrcStride, &pDst[4], iDstStride, iHeight);
}
/*!
 * \brief Center half-pel interpolation for width 17:
 *        columns 0-7 via the width-8 kernel, columns 8-16 via the width-9 kernel.
 */
void McHorVer22WidthEq17_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  McHorVer22WidthEq8_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer22WidthEq9_lsx(&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
}