blob: a72b701332e96936cc8d3c57642a88c5940c239c [file] [log] [blame]
/*!
**********************************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
* Jin Bo <jinbo@loongson.cn>
*
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* \file deblock_lsx.c
*
* \brief Loongson optimization
*
* \date 22/2/2022 Created
*
**********************************************************************************
*/
#include <stdint.h>
#include "loongson_intrinsics.h"
/*!
 * \brief LSX luma deblocking filter with tc clipping, applied to 16 adjacent pixels across
 *        the edge that lies between the line at pPix - iStrideX and the line at pPix.
 *
 * Three lines are read on each side of the edge (p2, p1, p0 | q0, q1, q2), 16 bytes each.
 * A lane is filtered only when |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta and its
 * tc0 value is >= 0. p0/q0 are then always updated; p1 (q1) is updated only when
 * additionally |p2-p0| < beta (|q2-q0| < beta). Lines p1, p0, q0 and q1 are written back.
 * (The "Lt4" in the name presumably refers to boundary strength < 4 - confirm with caller.)
 *
 * \param pPix     address of the first q0 sample (first line after the edge)
 * \param iStrideX byte distance between consecutive lines
 * \param iAlpha   threshold for |p0 - q0|; broadcast as a byte, compared unsigned
 * \param iBeta    threshold for the remaining differences; broadcast as a byte
 * \param pTc      signed clipping bounds; bytes 0..3 are each applied to 4 consecutive lanes
 */
void DeblockLumaLt4V_lsx (uint8_t* pPix, int32_t iStrideX,
int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
// Naming: *_l / *_h hold the low / high 8 lanes widened to 16 bits.
__m128i p0, p1, p2, q0, q1, q2;
__m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
__m128i p0_h, p1_h, p2_h, q0_h, q1_h, q2_h;
__m128i t0, t1, t2, t3, t;
__m128i t0_l, t0_h, t1_l, t1_h, t2_l, t2_h;
__m128i iTc, iTc0, negiTc, negiTc0, f, flags;
__m128i iTc_l, iTc_h, negiTc_l, negiTc_h;
__m128i iTc0_l, iTc0_h, negiTc0_l, negiTc0_h;
__m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
// All-zero vector: used for zero-extension and for signed comparisons against 0.
__m128i zero = __lsx_vldi(0);
// Thresholds replicated into all 16 byte lanes (only the low 8 bits of each int are kept).
__m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
__m128i beta = __lsx_vreplgr2vr_b(iBeta);
// Byte shuffle indices 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3.
__m128i shuf = {0x0101010100000000, 0x0303030302020202};
// 0xff00 in every 16-bit lane: a non-zero AND result means the lane is outside 0..255.
__m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
int32_t iStrideX_x0 = 0;
int32_t iStrideX_x2 = iStrideX << 1;
int32_t iStrideX_x3 = iStrideX_x2 + iStrideX;
// NOTE(review): this is a full 16-byte vector load from pTc although only bytes 0..3 are
// selected by the shuffle below - confirm the caller's buffer is readable for 16 bytes.
iTc0 = __lsx_vldx(pTc, 0);
// Spread pTc[0..3] so that each value covers 4 consecutive lanes.
iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
negiTc0 = __lsx_vneg_b(iTc0);
// iTc starts at tc0 and is incremented below for each of the p1 / q1 conditions that holds.
iTc = iTc0;
// DUPn_ARG2 comes from loongson_intrinsics.h (not visible here); it is used as "apply the
// intrinsic to each input pair and assign the result to the matching output".
// Load the six lines around the edge.
DUP4_ARG2(__lsx_vldx, pPix, -iStrideX, pPix, -iStrideX_x2, pPix, -iStrideX_x3,
pPix, iStrideX_x0, p0, p1, p2, q0);
DUP2_ARG2(__lsx_vldx, pPix, iStrideX, pPix, iStrideX_x2, q1, q2);
// Unsigned absolute differences per byte lane ...
DUP4_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, q1, q0, p2, p0,
bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
bDetaQ2Q0 = __lsx_vabsd_bu(q2, q0);
// ... turned into all-ones / all-zero byte masks by comparing against alpha / beta.
DUP4_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaQ1Q0, beta,
bDetaP2P0, beta, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
bDetaQ2Q0 = __lsx_vslt_bu(bDetaQ2Q0, beta);
// Zero-extend the pixels to 16 bits: low 8 lanes (_l) and high 8 lanes (_h).
DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, p2, zero, q0,
p0_l, p1_l, p2_l, q0_l);
DUP2_ARG2(__lsx_vilvl_b, zero, q1, zero, q2, q1_l, q2_l);
DUP4_ARG2(__lsx_vilvh_b, zero, p0, zero, p1, zero, p2, zero, q0,
p0_h, p1_h, p2_h, q0_h);
DUP2_ARG2(__lsx_vilvh_b, zero, q1, zero, q2, q1_h, q2_h);
// f = bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0 (basic edge-activity condition).
DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, f, bDetaQ1Q0, f, f);
// iTc += 1 in lanes where tc0 >= 0, f holds and |p2-p0| < beta.
flags = __lsx_vsle_b(zero, iTc0);
DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaP2P0, flags, flags);
flags = __lsx_vandi_b(flags, 1);
iTc = __lsx_vadd_b(iTc, flags);
// iTc += 1 in lanes where tc0 >= 0, f holds and |q2-q0| < beta.
flags = __lsx_vsle_b(zero, iTc0);
DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaQ2Q0, flags, flags);
flags = __lsx_vandi_b(flags, 1);
iTc = __lsx_vadd_b(iTc, flags);
negiTc = __lsx_vneg_b(iTc);
// Sign-extend the four clipping vectors to 16 bits: the "value < 0" byte mask is
// interleaved in as the high byte of each lane.
flags = __lsx_vslt_b(iTc0, zero);
iTc0_l = __lsx_vilvl_b(flags, iTc0);
iTc0_h = __lsx_vilvh_b(flags, iTc0);
flags = __lsx_vslt_b(negiTc0, zero);
negiTc0_l = __lsx_vilvl_b(flags, negiTc0);
negiTc0_h = __lsx_vilvh_b(flags, negiTc0);
flags = __lsx_vslt_b(iTc, zero);
iTc_l = __lsx_vilvl_b(flags, iTc);
iTc_h = __lsx_vilvh_b(flags, iTc);
flags = __lsx_vslt_b(negiTc, zero);
negiTc_l = __lsx_vilvl_b(flags, negiTc);
negiTc_h = __lsx_vilvh_b(flags, negiTc);
// Low half, new p1 = p1 + clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc0, tc0).
t0_l = __lsx_vadd_h(p0_l, q0_l);
t0_l = __lsx_vaddi_hu(t0_l, 1);
t0_l = __lsx_vsrai_h(t0_l, 1);
t0_l = __lsx_vadd_h(p2_l, t0_l);
t = __lsx_vslli_h(p1_l, 1);
t0_l = __lsx_vsub_h(t0_l, t);
t0_l = __lsx_vsrai_h(t0_l, 1);
t0_l = __lsx_vmin_h(iTc0_l, t0_l);
t0_l = __lsx_vmax_h(negiTc0_l, t0_l);
t0_l = __lsx_vadd_h(p1_l, t0_l);
// Low half, new q1 = q1 + clip((q2 + ((p0 + q0 + 1) >> 1) - 2 * q1) >> 1, -tc0, tc0).
t1_l = __lsx_vadd_h(p0_l, q0_l);
t1_l = __lsx_vaddi_hu(t1_l, 1);
t1_l = __lsx_vsrai_h(t1_l, 1);
t1_l = __lsx_vadd_h(q2_l, t1_l);
t = __lsx_vslli_h(q1_l, 1);
t1_l = __lsx_vsub_h(t1_l, t);
t1_l = __lsx_vsrai_h(t1_l, 1);
t1_l = __lsx_vmin_h(iTc0_l, t1_l);
t1_l = __lsx_vmax_h(negiTc0_l, t1_l);
t1_l = __lsx_vadd_h(q1_l, t1_l);
// High half, new p1 (same formula as above).
t0_h = __lsx_vadd_h(p0_h, q0_h);
t0_h = __lsx_vaddi_hu(t0_h, 1);
t0_h = __lsx_vsrai_h(t0_h, 1);
t0_h = __lsx_vadd_h(p2_h, t0_h);
t = __lsx_vslli_h(p1_h, 1);
t0_h = __lsx_vsub_h(t0_h, t);
t0_h = __lsx_vsrai_h(t0_h, 1);
t0_h = __lsx_vmin_h(iTc0_h, t0_h);
t0_h = __lsx_vmax_h(negiTc0_h, t0_h);
t0_h = __lsx_vadd_h(p1_h, t0_h);
// High half, new q1 (same formula as above).
t1_h = __lsx_vadd_h(p0_h, q0_h);
t1_h = __lsx_vaddi_hu(t1_h, 1);
t1_h = __lsx_vsrai_h(t1_h, 1);
t1_h = __lsx_vadd_h(q2_h, t1_h);
t = __lsx_vslli_h(q1_h, 1);
t1_h = __lsx_vsub_h(t1_h, t);
t1_h = __lsx_vsrai_h(t1_h, 1);
t1_h = __lsx_vmin_h(iTc0_h, t1_h);
t1_h = __lsx_vmax_h(negiTc0_h, t1_h);
t1_h = __lsx_vadd_h(q1_h, t1_h);
// Low half, delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc).
t2_l = __lsx_vsub_h(q0_l, p0_l);
t2_l = __lsx_vslli_h(t2_l, 2);
t2_l = __lsx_vadd_h(t2_l, p1_l);
t2_l = __lsx_vsub_h(t2_l, q1_l);
t2_l = __lsx_vaddi_hu(t2_l, 4);
t2_l = __lsx_vsrai_h(t2_l, 3);
t2_l = __lsx_vmin_h(iTc_l, t2_l);
t2_l = __lsx_vmax_h(negiTc_l, t2_l);
// High half, delta (same formula).
t2_h = __lsx_vsub_h(q0_h, p0_h);
t2_h = __lsx_vslli_h(t2_h, 2);
t2_h = __lsx_vadd_h(t2_h, p1_h);
t2_h = __lsx_vsub_h(t2_h, q1_h);
t2_h = __lsx_vaddi_hu(t2_h, 4);
t2_h = __lsx_vsrai_h(t2_h, 3);
t2_h = __lsx_vmin_h(iTc_h, t2_h);
t2_h = __lsx_vmax_h(negiTc_h, t2_h);
// Low half, new p0 = clamp(p0 + delta, 0, 255).
// From here on p1_l/p2_l/q1_l/q2_l (and the _h twins) are reused as scratch; their pixel
// values have already been consumed above.
p0_l = __lsx_vadd_h(p0_l, t2_l);
// p1_l: bits 8..15 of the sum; p2_l: all-ones where the sum is >= 0.
p1_l = __lsx_vand_v(p0_l, not_255);
p2_l = __lsx_vsle_h(zero, p0_l);
// flags: lane already within 0..255 -> keep it, otherwise zero it.
flags = __lsx_vseq_h(p1_l, zero);
p0_l = __lsx_vand_v(p0_l, flags);
// Out-of-range and non-negative (> 255) lanes get 0xffff added, i.e. low byte 0xff;
// negative lanes stay 0.
flags = __lsx_vnor_v(flags,flags);
p2_l = __lsx_vand_v(p2_l, flags);
p0_l = __lsx_vadd_h(p0_l, p2_l);
// Low half, new q0 = clamp(q0 - delta, 0, 255), same clamp sequence.
q0_l = __lsx_vsub_h(q0_l, t2_l);
q1_l = __lsx_vand_v(q0_l, not_255);
q2_l = __lsx_vsle_h(zero, q0_l);
flags = __lsx_vseq_h(q1_l, zero);
q0_l = __lsx_vand_v(q0_l, flags);
flags = __lsx_vnor_v(flags, flags);
q2_l = __lsx_vand_v(q2_l, flags);
q0_l = __lsx_vadd_h(q0_l, q2_l);
// High half, new p0.
p0_h = __lsx_vadd_h(p0_h, t2_h);
p1_h = __lsx_vand_v(p0_h, not_255);
p2_h = __lsx_vsle_h(zero, p0_h);
flags = __lsx_vseq_h(p1_h, zero);
p0_h = __lsx_vand_v(p0_h, flags);
flags = __lsx_vnor_v(flags, flags);
p2_h = __lsx_vand_v(p2_h, flags);
p0_h = __lsx_vadd_h(p0_h, p2_h);
// High half, new q0.
q0_h = __lsx_vsub_h(q0_h, t2_h);
q1_h = __lsx_vand_v(q0_h, not_255);
q2_h = __lsx_vsle_h(zero, q0_h);
flags = __lsx_vseq_h(q1_h, zero);
q0_h = __lsx_vand_v(q0_h, flags);
flags = __lsx_vnor_v(flags, flags);
q2_h = __lsx_vand_v(q2_h, flags);
q0_h = __lsx_vadd_h(q0_h, q2_h);
// Narrow back to bytes (low byte of every 16-bit lane):
// t0 = filtered p1, t1 = filtered q1, t2 = filtered p0, t3 = filtered q0.
DUP4_ARG2(__lsx_vpickev_b, t0_h, t0_l, t1_h, t1_l,
p0_h, p0_l, q0_h, q0_l, t0, t1, t2, t3);
// flags = (tc0 >= 0) && f: lanes in which p0 / q0 are replaced.
flags = __lsx_vsle_b(zero, iTc0);
flags = __lsx_vand_v(flags, f);
// Blend: filtered value where flags is set, original value elsewhere. The two operands of
// each add have disjoint bits, so the add acts as a select.
t2 = __lsx_vand_v(t2, flags);
t = __lsx_vnor_v(flags,flags);
p0 = __lsx_vand_v(p0, t);
p0 = __lsx_vadd_b(t2, p0);
t3 = __lsx_vand_v(t3, flags);
t = __lsx_vnor_v(flags,flags);
q0 = __lsx_vand_v(q0, t);
q0 = __lsx_vadd_b(t3, q0);
// p1 is replaced only where flags && |p2-p0| < beta.
DUP2_ARG2(__lsx_vand_v, flags, bDetaP2P0, t0, t, t, t0);
t = __lsx_vnor_v(t, t);
p1 = __lsx_vand_v(p1, t);
p1 = __lsx_vadd_b(t0, p1);
// q1 is replaced only where flags && |q2-q0| < beta.
DUP2_ARG2(__lsx_vand_v, flags, bDetaQ2Q0, t1, t, t, t1);
t = __lsx_vnor_v(t, t);
q1 = __lsx_vand_v(q1, t);
q1 = __lsx_vadd_b(t1, q1);
// Write the four possibly modified lines back (p2 and q2 are never changed).
__lsx_vstx(p1, pPix, -iStrideX_x2);
__lsx_vstx(p0, pPix, -iStrideX);
__lsx_vstx(q0, pPix, iStrideX_x0);
__lsx_vstx(q1, pPix, iStrideX);
}
/*!
 * \brief LSX luma deblocking filter with tc clipping for an edge that runs down 16
 *        consecutive rows: the samples p2,p1,p0 | q0,q1,q2 of each row sit at
 *        pPix[-3..2] of that row.
 *
 * The 16 rows are loaded, transposed so that each of p2..q2 becomes one 16-lane vector,
 * filtered with the same arithmetic as the non-transposed variant, and the four possibly
 * modified samples p1,p0,q0,q1 of each row are written back as one 32-bit word at
 * pPix[-2..1].
 * (The "Lt4" in the name presumably refers to boundary strength < 4 - confirm with caller.)
 *
 * \param pPix     address of the q0 sample of the first row
 * \param iStrideY byte distance between consecutive rows
 * \param iAlpha   threshold for |p0 - q0|; broadcast as a byte, compared unsigned
 * \param iBeta    threshold for the remaining differences; broadcast as a byte
 * \param pTc      signed clipping bounds; bytes 0..3 are each applied to 4 consecutive rows
 */
void DeblockLumaLt4H_lsx (uint8_t* pPix, int32_t iStrideY,
int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
// Naming: *_l / *_h hold the low / high 8 lanes widened to 16 bits (after the transpose).
__m128i p0, p1, p2, q0, q1, q2;
__m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
__m128i p0_h, p1_h, p2_h, q0_h, q1_h, q2_h;
__m128i t0, t1, t2, t3, t;
__m128i t0_l, t0_h, t1_l, t1_h, t2_l, t2_h;
__m128i iTc, iTc0, negiTc, negiTc0, f, flags;
__m128i iTc_l, iTc_h, negiTc_l, negiTc_h;
__m128i iTc0_l, iTc0_h, negiTc0_l, negiTc0_h;
__m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
// All-zero vector: used for zero-extension and for signed comparisons against 0.
__m128i zero = __lsx_vldi(0);
// Thresholds replicated into all 16 byte lanes (only the low 8 bits of each int are kept).
__m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
__m128i beta = __lsx_vreplgr2vr_b(iBeta);
// Byte shuffle indices 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3.
__m128i shuf = {0x0101010100000000, 0x0303030302020202};
// 0xff00 in every 16-bit lane: a non-zero AND result means the lane is outside 0..255.
__m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
int32_t iStrideY_x0 = 0;
int32_t iStrideY_x2 = iStrideY << 1;
int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
int32_t iStrideY_x4 = iStrideY << 2;
// NOTE(review): this is a full 16-byte vector load from pTc although only bytes 0..3 are
// selected by the shuffle below - confirm the caller's buffer is readable for 16 bytes.
iTc0 = __lsx_vldx(pTc, 0);
// Spread pTc[0..3] so that each value covers 4 consecutive lanes (rows).
iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
negiTc0 = __lsx_vneg_b(iTc0);
// iTc starts at tc0 and is incremented below for each of the p1 / q1 conditions that holds.
iTc = iTc0;
// Start each row load at the p2 sample.
pPix -= 3;
// Load rows 0..15, four at a time. The *_l / *_h / t*_l / t*_h variables merely serve as
// row temporaries here; they get their real meaning after the transpose.
// NOTE(review): every load reads 16 bytes of a row although only the leading columns are
// used afterwards - confirm the rows are readable that far.
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, p0_l, p1_l, p2_l, q0_l);
pPix += iStrideY_x4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, p0_h, p1_h, p2_h, q0_h);
pPix += iStrideY_x4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, q1_l, q2_l, t0_l, t1_l);
pPix += iStrideY_x4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, q1_h, q2_h, t0_h, t1_h);
// LSX_TRANSPOSE16x8_B is defined in loongson_intrinsics.h (not visible here); presumably it
// turns the first 8 columns of the 16 rows into 8 vectors of 16 lanes. Columns 0..5 become
// p2,p1,p0,q0,q1,q2; the last two outputs land in t and f, which are both overwritten
// before they are next read.
LSX_TRANSPOSE16x8_B(p0_l, p1_l, p2_l, q0_l, p0_h, p1_h, p2_h, q0_h, q1_l, q2_l,
t0_l, t1_l, q1_h, q2_h, t0_h, t1_h, p2, p1, p0, q0, q1, q2,
t, f);
// Unsigned absolute differences per byte lane ...
DUP4_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, q1, q0, p2, p0,
bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
bDetaQ2Q0 = __lsx_vabsd_bu(q2, q0);
// ... turned into all-ones / all-zero byte masks by comparing against alpha / beta.
DUP4_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaQ1Q0, beta,
bDetaP2P0, beta, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
bDetaQ2Q0 = __lsx_vslt_bu(bDetaQ2Q0, beta);
// Zero-extend the pixels to 16 bits: low 8 lanes (_l) and high 8 lanes (_h).
DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, p2, zero, q0,
p0_l, p1_l, p2_l, q0_l);
DUP2_ARG2(__lsx_vilvl_b, zero, q1, zero, q2, q1_l, q2_l);
DUP4_ARG2(__lsx_vilvh_b, zero, p0, zero, p1, zero, p2, zero, q0,
p0_h, p1_h, p2_h, q0_h);
DUP2_ARG2(__lsx_vilvh_b, zero, q1, zero, q2, q1_h, q2_h);
// f = bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0 (basic edge-activity condition).
DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, f, bDetaQ1Q0, f, f);
// iTc += 1 in lanes where tc0 >= 0, f holds and |p2-p0| < beta.
flags = __lsx_vsle_b(zero, iTc0);
DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaP2P0, flags, flags);
flags = __lsx_vandi_b(flags, 1);
iTc = __lsx_vadd_b(iTc, flags);
// iTc += 1 in lanes where tc0 >= 0, f holds and |q2-q0| < beta.
flags = __lsx_vsle_b(zero, iTc0);
DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaQ2Q0, flags, flags);
flags = __lsx_vandi_b(flags, 1);
iTc = __lsx_vadd_b(iTc, flags);
negiTc = __lsx_vneg_b(iTc);
// Sign-extend the four clipping vectors to 16 bits: the "value < 0" byte mask is
// interleaved in as the high byte of each lane.
flags = __lsx_vslt_b(iTc0, zero);
iTc0_l = __lsx_vilvl_b(flags, iTc0);
iTc0_h = __lsx_vilvh_b(flags, iTc0);
flags = __lsx_vslt_b(negiTc0, zero);
negiTc0_l = __lsx_vilvl_b(flags, negiTc0);
negiTc0_h = __lsx_vilvh_b(flags, negiTc0);
flags = __lsx_vslt_b(iTc, zero);
iTc_l = __lsx_vilvl_b(flags, iTc);
iTc_h = __lsx_vilvh_b(flags, iTc);
flags = __lsx_vslt_b(negiTc, zero);
negiTc_l = __lsx_vilvl_b(flags, negiTc);
negiTc_h = __lsx_vilvh_b(flags, negiTc);
// Low half, new p1 = p1 + clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc0, tc0).
t0_l = __lsx_vadd_h(p0_l, q0_l);
t0_l = __lsx_vaddi_hu(t0_l, 1);
t0_l = __lsx_vsrai_h(t0_l, 1);
t0_l = __lsx_vadd_h(p2_l, t0_l);
t = __lsx_vslli_h(p1_l, 1);
t0_l = __lsx_vsub_h(t0_l, t);
t0_l = __lsx_vsrai_h(t0_l, 1);
t0_l = __lsx_vmin_h(iTc0_l, t0_l);
t0_l = __lsx_vmax_h(negiTc0_l, t0_l);
t0_l = __lsx_vadd_h(p1_l, t0_l);
// Low half, new q1 = q1 + clip((q2 + ((p0 + q0 + 1) >> 1) - 2 * q1) >> 1, -tc0, tc0).
t1_l = __lsx_vadd_h(p0_l, q0_l);
t1_l = __lsx_vaddi_hu(t1_l, 1);
t1_l = __lsx_vsrai_h(t1_l, 1);
t1_l = __lsx_vadd_h(q2_l, t1_l);
t = __lsx_vslli_h(q1_l, 1);
t1_l = __lsx_vsub_h(t1_l, t);
t1_l = __lsx_vsrai_h(t1_l, 1);
t1_l = __lsx_vmin_h(iTc0_l, t1_l);
t1_l = __lsx_vmax_h(negiTc0_l, t1_l);
t1_l = __lsx_vadd_h(q1_l, t1_l);
// High half, new p1 (same formula as above).
t0_h = __lsx_vadd_h(p0_h, q0_h);
t0_h = __lsx_vaddi_hu(t0_h, 1);
t0_h = __lsx_vsrai_h(t0_h, 1);
t0_h = __lsx_vadd_h(p2_h, t0_h);
t = __lsx_vslli_h(p1_h, 1);
t0_h = __lsx_vsub_h(t0_h, t);
t0_h = __lsx_vsrai_h(t0_h, 1);
t0_h = __lsx_vmin_h(iTc0_h, t0_h);
t0_h = __lsx_vmax_h(negiTc0_h, t0_h);
t0_h = __lsx_vadd_h(p1_h, t0_h);
// High half, new q1 (same formula as above).
t1_h = __lsx_vadd_h(p0_h, q0_h);
t1_h = __lsx_vaddi_hu(t1_h, 1);
t1_h = __lsx_vsrai_h(t1_h, 1);
t1_h = __lsx_vadd_h(q2_h, t1_h);
t = __lsx_vslli_h(q1_h, 1);
t1_h = __lsx_vsub_h(t1_h, t);
t1_h = __lsx_vsrai_h(t1_h, 1);
t1_h = __lsx_vmin_h(iTc0_h, t1_h);
t1_h = __lsx_vmax_h(negiTc0_h, t1_h);
t1_h = __lsx_vadd_h(q1_h, t1_h);
// Low half, delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc).
t2_l = __lsx_vsub_h(q0_l, p0_l);
t2_l = __lsx_vslli_h(t2_l, 2);
t2_l = __lsx_vadd_h(t2_l, p1_l);
t2_l = __lsx_vsub_h(t2_l, q1_l);
t2_l = __lsx_vaddi_hu(t2_l, 4);
t2_l = __lsx_vsrai_h(t2_l, 3);
t2_l = __lsx_vmin_h(iTc_l, t2_l);
t2_l = __lsx_vmax_h(negiTc_l, t2_l);
// High half, delta (same formula).
t2_h = __lsx_vsub_h(q0_h, p0_h);
t2_h = __lsx_vslli_h(t2_h, 2);
t2_h = __lsx_vadd_h(t2_h, p1_h);
t2_h = __lsx_vsub_h(t2_h, q1_h);
t2_h = __lsx_vaddi_hu(t2_h, 4);
t2_h = __lsx_vsrai_h(t2_h, 3);
t2_h = __lsx_vmin_h(iTc_h, t2_h);
t2_h = __lsx_vmax_h(negiTc_h, t2_h);
// Low half, new p0 = clamp(p0 + delta, 0, 255).
// From here on p1_l/p2_l/q1_l/q2_l (and the _h twins) are reused as scratch; their pixel
// values have already been consumed above.
p0_l = __lsx_vadd_h(p0_l, t2_l);
// p1_l: bits 8..15 of the sum; p2_l: all-ones where the sum is >= 0.
p1_l = __lsx_vand_v(p0_l, not_255);
p2_l = __lsx_vsle_h(zero, p0_l);
// flags: lane already within 0..255 -> keep it, otherwise zero it.
flags = __lsx_vseq_h(p1_l, zero);
p0_l = __lsx_vand_v(p0_l, flags);
// Out-of-range and non-negative (> 255) lanes get 0xffff added, i.e. low byte 0xff;
// negative lanes stay 0.
flags = __lsx_vnor_v(flags,flags);
p2_l = __lsx_vand_v(p2_l, flags);
p0_l = __lsx_vadd_h(p0_l, p2_l);
// Low half, new q0 = clamp(q0 - delta, 0, 255), same clamp sequence.
q0_l = __lsx_vsub_h(q0_l, t2_l);
q1_l = __lsx_vand_v(q0_l, not_255);
q2_l = __lsx_vsle_h(zero, q0_l);
flags = __lsx_vseq_h(q1_l, zero);
q0_l = __lsx_vand_v(q0_l, flags);
flags = __lsx_vnor_v(flags, flags);
q2_l = __lsx_vand_v(q2_l, flags);
q0_l = __lsx_vadd_h(q0_l, q2_l);
// High half, new p0.
p0_h = __lsx_vadd_h(p0_h, t2_h);
p1_h = __lsx_vand_v(p0_h, not_255);
p2_h = __lsx_vsle_h(zero, p0_h);
flags = __lsx_vseq_h(p1_h, zero);
p0_h = __lsx_vand_v(p0_h, flags);
flags = __lsx_vnor_v(flags, flags);
p2_h = __lsx_vand_v(p2_h, flags);
p0_h = __lsx_vadd_h(p0_h, p2_h);
// High half, new q0.
q0_h = __lsx_vsub_h(q0_h, t2_h);
q1_h = __lsx_vand_v(q0_h, not_255);
q2_h = __lsx_vsle_h(zero, q0_h);
flags = __lsx_vseq_h(q1_h, zero);
q0_h = __lsx_vand_v(q0_h, flags);
flags = __lsx_vnor_v(flags, flags);
q2_h = __lsx_vand_v(q2_h, flags);
q0_h = __lsx_vadd_h(q0_h, q2_h);
// Narrow back to bytes (low byte of every 16-bit lane):
// t0 = filtered p1, t1 = filtered q1, t2 = filtered p0, t3 = filtered q0.
DUP4_ARG2(__lsx_vpickev_b, t0_h, t0_l, t1_h, t1_l,
p0_h, p0_l, q0_h, q0_l, t0, t1, t2, t3);
// flags = (tc0 >= 0) && f: lanes in which p0 / q0 are replaced.
flags = __lsx_vsle_b(zero, iTc0);
flags = __lsx_vand_v(flags, f);
// Blend: filtered value where flags is set, original value elsewhere. The two operands of
// each add have disjoint bits, so the add acts as a select.
t2 = __lsx_vand_v(t2, flags);
t = __lsx_vnor_v(flags,flags);
p0 = __lsx_vand_v(p0, t);
p0 = __lsx_vadd_b(t2, p0);
t3 = __lsx_vand_v(t3, flags);
t = __lsx_vnor_v(flags,flags);
q0 = __lsx_vand_v(q0, t);
q0 = __lsx_vadd_b(t3, q0);
// p1 is replaced only where flags && |p2-p0| < beta.
DUP2_ARG2(__lsx_vand_v, flags, bDetaP2P0, t0, t, t, t0);
t = __lsx_vnor_v(t, t);
p1 = __lsx_vand_v(p1, t);
p1 = __lsx_vadd_b(t0, p1);
// q1 is replaced only where flags && |q2-q0| < beta.
DUP2_ARG2(__lsx_vand_v, flags, bDetaQ2Q0, t1, t, t, t1);
t = __lsx_vnor_v(t, t);
q1 = __lsx_vand_v(q1, t);
q1 = __lsx_vadd_b(t1, q1);
// Transpose back: interleave bytes (p1,p0) and (q0,q1), then halfwords, so that every
// 32-bit word holds p1,p0,q0,q1 of one row.
// After these four lines: p0 = rows 0..3, p1 = rows 4..7, p2 = rows 8..11, q0 = rows 12..15.
DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, t0, t2);
DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, t1, t3);
DUP2_ARG2(__lsx_vilvl_h, t2, t0, t3, t1, p0, p2);
DUP2_ARG2(__lsx_vilvh_h, t2, t0, t3, t1, p1, q0);
// Rewind the three row-block advances made while loading; the "- 1" in the last step moves
// one byte forward, from the p2 column (original pPix - 3) to the p1 column (pPix - 2).
pPix -= iStrideY_x4;
pPix -= iStrideY_x4;
pPix -= iStrideY_x4 - 1;
// Store one 32-bit word per row; the last argument selects the word within the vector.
__lsx_vstelm_w(p0, pPix, 0, 0);
__lsx_vstelm_w(p0, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p0, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p0, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_w(p1, pPix, 0, 0);
__lsx_vstelm_w(p1, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p1, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p1, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_w(p2, pPix, 0, 0);
__lsx_vstelm_w(p2, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p2, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p2, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_w(q0, pPix, 0, 0);
__lsx_vstelm_w(q0, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(q0, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(q0, pPix + iStrideY_x3, 0, 3);
}
/*!
 * \brief Filter taps for one side of the edge, computed on eight zero-extended 16-bit lanes.
 *
 * n0..n3 are the samples of the side being filtered (n0 closest to the edge); f0 and f1 are
 * the two closest samples of the opposite side. All shifts are rounding shifts.
 *
 *   *pN0     = (n2 + 2*n1 + 2*n0 + 2*f0 + f1 + 4) >> 3
 *   *pN1     = (n2 + n1 + n0 + f0 + 2) >> 2
 *   *pN2     = (2*n3 + 3*n2 + n1 + n0 + f0 + 4) >> 3
 *   *pN0Weak = (2*n1 + n0 + f1 + 2) >> 2
 */
static inline void LumaEq4SideTaps_lsx (__m128i n0, __m128i n1, __m128i n2, __m128i n3,
                                        __m128i f0, __m128i f1,
                                        __m128i* pN0, __m128i* pN1, __m128i* pN2,
                                        __m128i* pN0Weak) {
  const __m128i n0x2 = __lsx_vslli_h (n0, 1);
  const __m128i n1x2 = __lsx_vslli_h (n1, 1);
  const __m128i n2x2 = __lsx_vslli_h (n2, 1);
  const __m128i n3x2 = __lsx_vslli_h (n3, 1);
  const __m128i f0x2 = __lsx_vslli_h (f0, 1);
  // n2 + n1 + n0 + f0 is shared by the n1 and n2 taps.
  const __m128i sum4 = __lsx_vadd_h (__lsx_vadd_h (n2, n1), __lsx_vadd_h (n0, f0));
  __m128i acc;

  // Five-tap result for n0.
  acc = __lsx_vadd_h (__lsx_vadd_h (n2, n1x2), __lsx_vadd_h (n0x2, f0x2));
  acc = __lsx_vadd_h (acc, f1);
  *pN0 = __lsx_vsrari_h (acc, 3);

  // Four-tap result for n1.
  *pN1 = __lsx_vsrari_h (sum4, 2);

  // Result for n2: 2*n3 + 2*n2 + (n2 + n1 + n0 + f0).
  acc = __lsx_vadd_h (__lsx_vadd_h (n3x2, n2x2), sum4);
  *pN2 = __lsx_vsrari_h (acc, 3);

  // Three-tap fallback for n0.
  acc = __lsx_vadd_h (__lsx_vadd_h (n1x2, n0), f1);
  *pN0Weak = __lsx_vsrari_h (acc, 2);
}

/*!
 * \brief LSX luma deblocking filter without tc clipping, applied to 16 adjacent pixels
 *        across the edge between the line at pPix - iStride and the line at pPix.
 *
 * Four lines are read on each side (p3..p0 | q0..q3). Per lane:
 *   - nothing changes unless |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta;
 *   - if additionally |p0-q0| < (alpha >> 2) + 2 and |p2-p0| < beta, p0/p1/p2 take the long
 *     filter taps, otherwise only p0 changes and takes the three-tap fallback;
 *   - the q side is handled the same way with |q2-q0| < beta.
 * Lines p2, p1, p0, q0, q1, q2 are written back.
 *
 * \param pPix    address of the first q0 sample (first line after the edge)
 * \param iStride byte distance between consecutive lines
 * \param iAlpha  threshold for |p0 - q0|; broadcast as a byte, compared unsigned
 * \param iBeta   threshold for the remaining differences; broadcast as a byte
 */
void DeblockLumaEq4V_lsx(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
int32_t iBeta) {
  const int32_t iOff0 = 0;
  const int32_t iOff2 = iStride << 1;
  const int32_t iOff3 = iStride + iOff2;
  const int32_t iOff4 = iStride << 2;

  // Eight lines around the edge, 16 pixels each.
  __m128i p3 = __lsx_vldx (pPix, -iOff4);
  __m128i p2 = __lsx_vldx (pPix, -iOff3);
  __m128i p1 = __lsx_vldx (pPix, -iOff2);
  __m128i p0 = __lsx_vldx (pPix, -iStride);
  __m128i q3 = __lsx_vldx (pPix, iOff3);
  __m128i q2 = __lsx_vldx (pPix, iOff2);
  __m128i q1 = __lsx_vldx (pPix, iStride);
  __m128i q0 = __lsx_vldx (pPix, iOff0);

  const __m128i vAlpha = __lsx_vreplgr2vr_b (iAlpha);
  const __m128i vBeta = __lsx_vreplgr2vr_b (iBeta);

  // Per-lane byte masks (all-ones when the condition holds).
  const __m128i absP0Q0 = __lsx_vabsd_bu (p0, q0);
  const __m128i nearP1 = __lsx_vslt_bu (__lsx_vabsd_bu (p1, p0), vBeta);
  const __m128i nearQ1 = __lsx_vslt_bu (__lsx_vabsd_bu (q1, q0), vBeta);
  const __m128i nearP2 = __lsx_vslt_bu (__lsx_vabsd_bu (p2, p0), vBeta);
  const __m128i nearQ2 = __lsx_vslt_bu (__lsx_vabsd_bu (q2, q0), vBeta);

  // (|p0-q0| < alpha) && |p1-p0| < beta && |q1-q0| < beta
  const __m128i edgeOn = __lsx_vslt_bu (absP0Q0, vAlpha) & nearP1 & nearQ1;
  // |p0-q0| < ((alpha >> 2) + 2)
  const __m128i smallStep = __lsx_vslt_bu (absP0Q0,
                                           __lsx_vaddi_bu (__lsx_vsrli_b (vAlpha, 2), 2));
  // Lanes taking the long filter on the p / q side ...
  const __m128i longP = edgeOn & smallStep & nearP2;
  const __m128i longQ = edgeOn & smallStep & nearQ2;
  // ... and lanes that are filtered but fall back to the three-tap p0 / q0 update.
  const __m128i shortP = edgeOn & ~longP;
  const __m128i shortQ = edgeOn & ~longQ;

  __m128i fP0Lo, fP1Lo, fP2Lo, wP0Lo, fQ0Lo, fQ1Lo, fQ2Lo, wQ0Lo;
  __m128i fP0Hi, fP1Hi, fP2Hi, wP0Hi, fQ0Hi, fQ1Hi, fQ2Hi, wQ0Hi;
  __m128i fP0, fP1, fP2, wP0, fQ0, fQ1, fQ2, wQ0;

  // Low eight lanes, zero-extended to 16 bits.
  {
    const __m128i a0 = __lsx_vsllwil_hu_bu (p0, 0);
    const __m128i a1 = __lsx_vsllwil_hu_bu (p1, 0);
    const __m128i a2 = __lsx_vsllwil_hu_bu (p2, 0);
    const __m128i a3 = __lsx_vsllwil_hu_bu (p3, 0);
    const __m128i b0 = __lsx_vsllwil_hu_bu (q0, 0);
    const __m128i b1 = __lsx_vsllwil_hu_bu (q1, 0);
    const __m128i b2 = __lsx_vsllwil_hu_bu (q2, 0);
    const __m128i b3 = __lsx_vsllwil_hu_bu (q3, 0);
    LumaEq4SideTaps_lsx (a0, a1, a2, a3, b0, b1, &fP0Lo, &fP1Lo, &fP2Lo, &wP0Lo);
    LumaEq4SideTaps_lsx (b0, b1, b2, b3, a0, a1, &fQ0Lo, &fQ1Lo, &fQ2Lo, &wQ0Lo);
  }

  // High eight lanes, zero-extended to 16 bits.
  {
    const __m128i a0 = __lsx_vexth_hu_bu (p0);
    const __m128i a1 = __lsx_vexth_hu_bu (p1);
    const __m128i a2 = __lsx_vexth_hu_bu (p2);
    const __m128i a3 = __lsx_vexth_hu_bu (p3);
    const __m128i b0 = __lsx_vexth_hu_bu (q0);
    const __m128i b1 = __lsx_vexth_hu_bu (q1);
    const __m128i b2 = __lsx_vexth_hu_bu (q2);
    const __m128i b3 = __lsx_vexth_hu_bu (q3);
    LumaEq4SideTaps_lsx (a0, a1, a2, a3, b0, b1, &fP0Hi, &fP1Hi, &fP2Hi, &wP0Hi);
    LumaEq4SideTaps_lsx (b0, b1, b2, b3, a0, a1, &fQ0Hi, &fQ1Hi, &fQ2Hi, &wQ0Hi);
  }

  // Narrow both halves back to one 16-byte vector per tap.
  fP0 = __lsx_vpickev_b (fP0Hi, fP0Lo);
  fP1 = __lsx_vpickev_b (fP1Hi, fP1Lo);
  fP2 = __lsx_vpickev_b (fP2Hi, fP2Lo);
  wP0 = __lsx_vpickev_b (wP0Hi, wP0Lo);
  fQ0 = __lsx_vpickev_b (fQ0Hi, fQ0Lo);
  fQ1 = __lsx_vpickev_b (fQ1Hi, fQ1Lo);
  fQ2 = __lsx_vpickev_b (fQ2Hi, fQ2Lo);
  wQ0 = __lsx_vpickev_b (wQ0Hi, wQ0Lo);

  // Select per lane. The masked operands never overlap, so each byte add acts as a merge.
  p0 = __lsx_vadd_b (__lsx_vadd_b (fP0 & longP, wP0 & shortP), p0 & ~edgeOn);
  p1 = __lsx_vadd_b (fP1 & longP, p1 & ~longP);
  p2 = __lsx_vadd_b (fP2 & longP, p2 & ~longP);
  q0 = __lsx_vadd_b (__lsx_vadd_b (fQ0 & longQ, wQ0 & shortQ), q0 & ~edgeOn);
  q1 = __lsx_vadd_b (fQ1 & longQ, q1 & ~longQ);
  q2 = __lsx_vadd_b (fQ2 & longQ, q2 & ~longQ);

  // Write the six possibly modified lines back (p3 and q3 are never changed).
  __lsx_vstx (p2, pPix, -iOff3);
  __lsx_vstx (p1, pPix, -iOff2);
  __lsx_vstx (p0, pPix, -iStride);
  __lsx_vstx (q0, pPix, iOff0);
  __lsx_vstx (q1, pPix, iStride);
  __lsx_vstx (q2, pPix, iOff2);
}
/*!
 * \brief LSX luma deblocking filter without tc clipping for an edge that runs down 16
 *        consecutive rows: the samples p3..p0 | q0..q3 of each row sit at pPix[-4..3].
 *
 * The 16 rows are loaded and transposed so that each of p3..q3 becomes one 16-lane vector.
 * Per lane:
 *   - nothing changes unless |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta;
 *   - if additionally |p0-q0| < (alpha >> 2) + 2 and |p2-p0| < beta, p0/p1/p2 take the long
 *     filter taps, otherwise only p0 changes and takes the three-tap fallback;
 *   - the q side is handled the same way with |q2-q0| < beta.
 * For every row p2,p1,p0,q0 are written back as one 32-bit word at pPix[-3..0] and q1,q2 as
 * one 16-bit halfword at pPix[1..2].
 *
 * \param pPix     address of the q0 sample of the first row
 * \param iStrideY byte distance between consecutive rows
 * \param iAlpha   threshold for |p0 - q0|; broadcast as a byte, compared unsigned
 * \param iBeta    threshold for the remaining differences; broadcast as a byte
 */
void DeblockLumaEq4H_lsx (uint8_t* pPix, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
// Naming: *_l / *_h hold the low / high 8 lanes widened to 16 bits (after the transpose).
__m128i p0, p1, p2, p3, q0, q1, q2, q3;
__m128i p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
__m128i p0_h, p1_h, p2_h, p3_h, q0_h, q1_h, q2_h, q3_h;
__m128i t0, t1, t2, t3, t4, t5, t6, t7, temp;
__m128i t0_l, t0_h, t1_l, t1_h, t2_l, t2_h;
__m128i t3_l, t3_h, t4_l, t4_h, t5_l, t5_h;
__m128i t6_l, t6_h, t7_l, t7_h;
__m128i f0, f1, f2, f3, fn;
__m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
// All-zero vector used for zero-extension.
__m128i zero = __lsx_vldi(0);
// Thresholds replicated into all 16 byte lanes (only the low 8 bits of each int are kept).
__m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
__m128i beta = __lsx_vreplgr2vr_b(iBeta);
int32_t iStrideY_x0 = 0;
int32_t iStrideY_x2 = iStrideY << 1;
int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
int32_t iStrideY_x4 = iStrideY << 2;
// Load data from pPix: rows 0..15, four at a time, starting at the p3 column. The *_l / *_h
// / t*_l / t*_h variables merely serve as row temporaries until the transpose.
// NOTE(review): every load reads 16 bytes of a row although only the first 8 columns are
// used afterwards - confirm the rows are readable that far.
pPix -= 4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, p0_l, p1_l, p2_l, q0_l);
pPix += iStrideY_x4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, p0_h, p1_h, p2_h, q0_h);
pPix += iStrideY_x4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, q1_l, q2_l, t0_l, t1_l);
pPix += iStrideY_x4;
DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
pPix, iStrideY_x3, q1_h, q2_h, t0_h, t1_h);
// LSX_TRANSPOSE16x8_B is defined in loongson_intrinsics.h (not visible here); presumably it
// turns the first 8 columns of the 16 rows into 8 vectors of 16 lanes: p3,p2,p1,p0,q0..q3.
LSX_TRANSPOSE16x8_B(p0_l, p1_l, p2_l, q0_l, p0_h, p1_h, p2_h, q0_h, q1_l, q2_l,
t0_l, t1_l, q1_h, q2_h, t0_h, t1_h, p3, p2, p1, p0, q0, q1,
q2, q3);
// Calculate condition mask: unsigned absolute differences, then (except for bDetaP0Q0,
// which stays a difference) all-ones / all-zero byte masks for "difference < beta".
bDetaP0Q0 = __lsx_vabsd_bu(p0, q0);
DUP4_ARG2(__lsx_vabsd_bu, p1, p0, q1, q0, p2, p0, q2, q0,
bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0);
DUP4_ARG2(__lsx_vslt_bu, bDetaP1P0, beta, bDetaQ1Q0, beta, bDetaP2P0, beta,
bDetaQ2Q0, beta, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0);
// Unsigned extend p0, p1, p2, p3, q0, q1, q2, q3 from 8 bits to 16 bits
DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, p2, zero, q0,
p0_l, p1_l, p2_l, q0_l);
DUP4_ARG2(__lsx_vilvh_b, zero, p0, zero, p1, zero, p2, zero, q0,
p0_h, p1_h, p2_h, q0_h);
DUP2_ARG2(__lsx_vilvl_b, zero, q1, zero, q2, q1_l, q2_l);
DUP2_ARG2(__lsx_vilvh_b, zero, q1, zero, q2, q1_h, q2_h);
DUP2_ARG2(__lsx_vilvl_b, zero, p3, zero, q3, p3_l, q3_l);
DUP2_ARG2(__lsx_vilvh_b, zero, p3, zero, q3, p3_h, q3_h);
// Calculate the low part
// t0: long-filter p0
// (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3
t0_l = __lsx_vslli_h(p1_l, 1);
t0_l = __lsx_vadd_h(t0_l, p2_l);
temp = __lsx_vslli_h(p0_l, 1);
t0_l = __lsx_vadd_h(t0_l, temp);
temp = __lsx_vslli_h(q0_l, 1);
t0_l = __lsx_vadd_h(t0_l, temp);
t0_l = __lsx_vadd_h(t0_l, q1_l);
t0_l = __lsx_vaddi_hu(t0_l, 4);
t0_l = __lsx_vsrai_h(t0_l, 3);
// t1: long-filter p1
// (p2 + p1 + p0 + q0 + 2) >> 2
t1_l = __lsx_vadd_h(p2_l, p1_l);
t1_l = __lsx_vadd_h(t1_l, p0_l);
t1_l = __lsx_vadd_h(t1_l, q0_l);
t1_l = __lsx_vaddi_hu(t1_l, 2);
t1_l = __lsx_vsrai_h(t1_l, 2);
// t2: long-filter p2
// ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3
t2_l = __lsx_vslli_h(p3_l, 1);
t2_l = __lsx_vadd_h(t2_l, p2_l);
temp = __lsx_vslli_h(p2_l, 1);
t2_l = __lsx_vadd_h(t2_l, temp);
t2_l = __lsx_vadd_h(t2_l, p1_l);
t2_l = __lsx_vadd_h(t2_l, p0_l);
t2_l = __lsx_vadd_h(t2_l, q0_l);
t2_l = __lsx_vaddi_hu(t2_l, 4);
t2_l = __lsx_vsrai_h(t2_l, 3);
// t3: three-tap fallback p0
// ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
t3_l = __lsx_vslli_h(p1_l, 1);
t3_l = __lsx_vadd_h(t3_l, p0_l);
t3_l = __lsx_vadd_h(t3_l, q1_l);
t3_l = __lsx_vaddi_hu(t3_l, 2);
t3_l = __lsx_vsrai_h(t3_l, 2);
// t4: long-filter q0
// (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3
t4_l = __lsx_vslli_h(p0_l, 1);
t4_l = __lsx_vadd_h(t4_l, p1_l);
temp = __lsx_vslli_h(q0_l, 1);
t4_l = __lsx_vadd_h(t4_l, temp);
temp = __lsx_vslli_h(q1_l, 1);
t4_l = __lsx_vadd_h(t4_l, temp);
t4_l = __lsx_vadd_h(t4_l, q2_l);
t4_l = __lsx_vaddi_hu(t4_l, 4);
t4_l = __lsx_vsrai_h(t4_l, 3);
// t5: long-filter q1
// (p0 + q0 + q1 + q2 + 2) >> 2
t5_l = __lsx_vadd_h(p0_l, q0_l);
t5_l = __lsx_vadd_h(t5_l, q1_l);
t5_l = __lsx_vadd_h(t5_l, q2_l);
t5_l = __lsx_vaddi_hu(t5_l, 2);
t5_l = __lsx_vsrai_h(t5_l, 2);
// t6: long-filter q2
// ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3
t6_l = __lsx_vslli_h(q3_l, 1);
t6_l = __lsx_vadd_h(t6_l, q2_l);
temp = __lsx_vslli_h(q2_l, 1);
t6_l = __lsx_vadd_h(t6_l, temp);
t6_l = __lsx_vadd_h(t6_l, q1_l);
t6_l = __lsx_vadd_h(t6_l, q0_l);
t6_l = __lsx_vadd_h(t6_l, p0_l);
t6_l = __lsx_vaddi_hu(t6_l, 4);
t6_l = __lsx_vsrai_h(t6_l, 3);
// t7: three-tap fallback q0
// ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
t7_l = __lsx_vslli_h(q1_l, 1);
t7_l = __lsx_vadd_h(t7_l, q0_l);
t7_l = __lsx_vadd_h(t7_l, p1_l);
t7_l = __lsx_vaddi_hu(t7_l, 2);
t7_l = __lsx_vsrai_h(t7_l, 2);
// Calculate the high part (same eight formulas on the upper 8 lanes)
// (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3
t0_h = __lsx_vslli_h(p1_h, 1);
t0_h = __lsx_vadd_h(t0_h, p2_h);
temp = __lsx_vslli_h(p0_h, 1);
t0_h = __lsx_vadd_h(t0_h, temp);
temp = __lsx_vslli_h(q0_h, 1);
t0_h = __lsx_vadd_h(t0_h, temp);
t0_h = __lsx_vadd_h(t0_h, q1_h);
t0_h = __lsx_vaddi_hu(t0_h, 4);
t0_h = __lsx_vsrai_h(t0_h, 3);
// (p2 + p1 + p0 + q0 + 2) >> 2
t1_h = __lsx_vadd_h(p2_h, p1_h);
t1_h = __lsx_vadd_h(t1_h, p0_h);
t1_h = __lsx_vadd_h(t1_h, q0_h);
t1_h = __lsx_vaddi_hu(t1_h, 2);
t1_h = __lsx_vsrai_h(t1_h, 2);
// ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3
t2_h = __lsx_vslli_h(p3_h, 1);
t2_h = __lsx_vadd_h(t2_h, p2_h);
temp = __lsx_vslli_h(p2_h, 1);
t2_h = __lsx_vadd_h(t2_h, temp);
t2_h = __lsx_vadd_h(t2_h, p1_h);
t2_h = __lsx_vadd_h(t2_h, p0_h);
t2_h = __lsx_vadd_h(t2_h, q0_h);
t2_h = __lsx_vaddi_hu(t2_h, 4);
t2_h = __lsx_vsrai_h(t2_h, 3);
// ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
t3_h = __lsx_vslli_h(p1_h, 1);
t3_h = __lsx_vadd_h(t3_h, p0_h);
t3_h = __lsx_vadd_h(t3_h, q1_h);
t3_h = __lsx_vaddi_hu(t3_h, 2);
t3_h = __lsx_vsrai_h(t3_h, 2);
// (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3
t4_h = __lsx_vslli_h(p0_h, 1);
t4_h = __lsx_vadd_h(t4_h, p1_h);
temp = __lsx_vslli_h(q0_h, 1);
t4_h = __lsx_vadd_h(t4_h, temp);
temp = __lsx_vslli_h(q1_h, 1);
t4_h = __lsx_vadd_h(t4_h, temp);
t4_h = __lsx_vadd_h(t4_h, q2_h);
t4_h = __lsx_vaddi_hu(t4_h, 4);
t4_h = __lsx_vsrai_h(t4_h, 3);
// (p0 + q0 + q1 + q2 + 2) >> 2
t5_h = __lsx_vadd_h(p0_h, q0_h);
t5_h = __lsx_vadd_h(t5_h, q1_h);
t5_h = __lsx_vadd_h(t5_h, q2_h);
t5_h = __lsx_vaddi_hu(t5_h, 2);
t5_h = __lsx_vsrai_h(t5_h, 2);
// ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3
t6_h = __lsx_vslli_h(q3_h, 1);
t6_h = __lsx_vadd_h(t6_h, q2_h);
temp = __lsx_vslli_h(q2_h, 1);
t6_h = __lsx_vadd_h(t6_h, temp);
t6_h = __lsx_vadd_h(t6_h, q1_h);
t6_h = __lsx_vadd_h(t6_h, q0_h);
t6_h = __lsx_vadd_h(t6_h, p0_h);
t6_h = __lsx_vaddi_hu(t6_h, 4);
t6_h = __lsx_vsrai_h(t6_h, 3);
// ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
t7_h = __lsx_vslli_h(q1_h, 1);
t7_h = __lsx_vadd_h(t7_h, q0_h);
t7_h = __lsx_vadd_h(t7_h, p1_h);
t7_h = __lsx_vaddi_hu(t7_h, 2);
t7_h = __lsx_vsrai_h(t7_h, 2);
// Combined low and high: keep the low byte of every 16-bit lane.
DUP4_ARG2(__lsx_vpickev_b, t0_h, t0_l, t1_h, t1_l, t2_h, t2_l,
t3_h, t3_l, t0, t1, t2, t3);
DUP4_ARG2(__lsx_vpickev_b, t4_h, t4_l, t5_h, t5_l, t6_h, t6_l,
t7_h, t7_l, t4, t5, t6, t7);
// f0 = (|p0-q0| < alpha) && |p1-p0| < beta && |q1-q0| < beta: lane is filtered at all.
f0 = __lsx_vslt_bu(bDetaP0Q0, alpha);
f0 = __lsx_vand_v(f0, bDetaP1P0);
f0 = __lsx_vand_v(f0, bDetaQ1Q0);
// f1 = |p0-q0| < ((alpha >> 2) + 2): lane qualifies for the long filter.
f1 = __lsx_vsrli_b(alpha, 2);
f1 = __lsx_vaddi_bu(f1, 2);
f1 = __lsx_vslt_bu(bDetaP0Q0, f1);
// The selections below AND each candidate with its mask and add the pieces; the masks are
// mutually exclusive, so the byte add acts as a merge. Several t* registers are overwritten
// with final values here, so statement order matters.
// t0 = filtered p0: long tap (old t0) where f0 && f1 && bDetaP2P0,
//      fallback (t3) where f0 && f1 && !bDetaP2P0, and fallback (t3) where f0 && !f1.
f2 = __lsx_vand_v(f0, f1);
fn = __lsx_vand_v(f2, bDetaP2P0);
f3 = __lsx_vand_v(fn, t0);
f2 = __lsx_vnor_v(bDetaP2P0, bDetaP2P0);
fn = __lsx_vand_v(f0, f2);
fn = __lsx_vand_v(fn, f1);
t0 = __lsx_vand_v(fn, t3);
t0 = __lsx_vadd_b(f3, t0);
fn = __lsx_vnor_v(f1, f1);
fn = __lsx_vand_v(fn, f0);
f3 = __lsx_vand_v(fn, t3);
t0 = __lsx_vadd_b(f3, t0);
// t1 = filtered p1, kept only where f0 && f1 && bDetaP2P0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaP2P0);
t1 = __lsx_vand_v(f2, t1);
// t2 = filtered p2, kept only where f0 && f1 && bDetaP2P0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaP2P0);
t2 = __lsx_vand_v(f2, t2);
// t3 = filtered q0 (the p0 fallback previously held in t3 is no longer needed):
//      long tap (t4) where f0 && f1 && bDetaQ2Q0, fallback (t7) otherwise within f0.
f2 = __lsx_vand_v(f0, f1);
fn = __lsx_vand_v(f2, bDetaQ2Q0);
f3 = __lsx_vand_v(fn, t4);
fn = __lsx_vnor_v(bDetaQ2Q0, bDetaQ2Q0);
fn = __lsx_vand_v(fn, f2);
t3 = __lsx_vand_v(fn, t7);
t3 = __lsx_vadd_b(f3, t3);
fn = __lsx_vnor_v(f1, f1);
fn = __lsx_vand_v(fn, f0);
f3 = __lsx_vand_v(fn, t7);
t3 = __lsx_vadd_b(f3, t3);
// t4 = filtered q1 (taken from t5), kept only where f0 && f1 && bDetaQ2Q0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaQ2Q0);
t4 = __lsx_vand_v(f2, t5);
// t5 = filtered q2 (taken from t6), kept only where f0 && f1 && bDetaQ2Q0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaQ2Q0);
t5 = __lsx_vand_v(f2, t6);
// p0: original where !f0, filtered (t0) elsewhere.
fn = __lsx_vnor_v(f0, f0);
p0 = __lsx_vand_v(fn, p0);
p0 = __lsx_vadd_b(p0, t0);
// p1: original unless f0 && f1 && bDetaP2P0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaP2P0);
fn = __lsx_vnor_v(f2, f2);
p1 = __lsx_vand_v(fn, p1);
p1 = __lsx_vadd_b(t1, p1);
// p2: original unless f0 && f1 && bDetaP2P0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaP2P0);
fn = __lsx_vnor_v(f2, f2);
p2 = __lsx_vand_v(fn, p2);
p2 = __lsx_vadd_b(t2, p2);
// q0: original where !f0, filtered (t3) elsewhere.
fn = __lsx_vnor_v(f0, f0);
q0 = __lsx_vand_v(fn, q0);
q0 = __lsx_vadd_b(q0, t3);
// q1: original unless f0 && f1 && bDetaQ2Q0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaQ2Q0);
fn = __lsx_vnor_v(f2, f2);
q1 = __lsx_vand_v(fn, q1);
q1 = __lsx_vadd_b(q1, t4);
// q2: original unless f0 && f1 && bDetaQ2Q0.
f2 = __lsx_vand_v(f0, f1);
f2 = __lsx_vand_v(f2, bDetaQ2Q0);
fn = __lsx_vnor_v(f2, f2);
q2 = __lsx_vand_v(fn, q2);
q2 = __lsx_vadd_b(q2, t5);
// Transpose back: interleave bytes (p2,p1) and (p0,q0), then halfwords, so that every
// 32-bit word holds p2,p1,p0,q0 of one row. p0..p3 are reused as row containers:
// p0 = rows 0..3, p2 = rows 4..7, p1 = rows 8..11, p3 = rows 12..15.
DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, t0, t1);
DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, t2, t3);
DUP2_ARG2(__lsx_vilvl_h, t1, t0, t3, t2, p0, p1);
DUP2_ARG2(__lsx_vilvh_h, t1, t0, t3, t2, p2, p3);
// Every 16-bit halfword holds q1,q2 of one row: t1 = rows 0..7, t2 = rows 8..15.
t1 = __lsx_vilvl_b(q2, q1);
t2 = __lsx_vilvh_b(q2, q1);
// Store data to pPix
// Rewind the three row-block advances made while loading, then step from the p3 column
// (original pPix - 4) to the p2 column (original pPix - 3).
pPix -= iStrideY_x4;
pPix -= iStrideY_x4;
pPix -= iStrideY_x4;
pPix += 1;
// One 32-bit word per row; the last argument selects the word within the vector.
__lsx_vstelm_w(p0, pPix, 0, 0);
__lsx_vstelm_w(p0, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p0, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p0, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_w(p2, pPix, 0, 0);
__lsx_vstelm_w(p2, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p2, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p2, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_w(p1, pPix, 0, 0);
__lsx_vstelm_w(p1, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p1, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p1, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_w(p3, pPix, 0, 0);
__lsx_vstelm_w(p3, pPix + iStrideY, 0, 1);
__lsx_vstelm_w(p3, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_w(p3, pPix + iStrideY_x3, 0, 3);
// Back to row 0 and on to the q1 column (original pPix + 1).
pPix -= iStrideY_x4;
pPix -= iStrideY_x4;
pPix -= iStrideY_x4;
pPix += 4;
// One 16-bit halfword (q1,q2) per row; the last argument selects the halfword.
__lsx_vstelm_h(t1, pPix, 0, 0);
__lsx_vstelm_h(t1, pPix + iStrideY, 0, 1);
__lsx_vstelm_h(t1, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_h(t1, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_h(t1, pPix, 0, 4);
__lsx_vstelm_h(t1, pPix + iStrideY, 0, 5);
__lsx_vstelm_h(t1, pPix + iStrideY_x2, 0, 6);
__lsx_vstelm_h(t1, pPix + iStrideY_x3, 0, 7);
pPix += iStrideY_x4;
__lsx_vstelm_h(t2, pPix, 0, 0);
__lsx_vstelm_h(t2, pPix + iStrideY, 0, 1);
__lsx_vstelm_h(t2, pPix + iStrideY_x2, 0, 2);
__lsx_vstelm_h(t2, pPix + iStrideY_x3, 0, 3);
pPix += iStrideY_x4;
__lsx_vstelm_h(t2, pPix, 0, 4);
__lsx_vstelm_h(t2, pPix + iStrideY, 0, 5);
__lsx_vstelm_h(t2, pPix + iStrideY_x2, 0, 6);
__lsx_vstelm_h(t2, pPix + iStrideY_x3, 0, 7);
}
/*!
 * \brief Filter one chroma plane for DeblockChromaLt4V_lsx.
 *
 * Reads the rows at -2*iStrideX (p1), -iStrideX (p0), 0 (q0) and +iStrideX (q1)
 * relative to pPix, and writes the low 8 bytes of the filtered p0 and q0 rows
 * back to pPix - iStrideX and pPix.
 *
 * \param pPix       pointer to the q0 row of the plane
 * \param iStrideX   byte offset between consecutive rows
 * \param alpha      per-byte threshold for |p0 - q0|
 * \param beta       per-byte threshold for |p1 - p0| and |q1 - q0|
 * \param iTc0       per-pixel tc bytes (signed)
 * \param iTc0_l     low 8 lanes of iTc0 sign-extended to 16 bits
 * \param negiTc0_l  low 8 lanes of -iTc0 sign-extended to 16 bits
 *
 * NOTE(review): the vector loads presumably fetch 16 bytes per row although
 * only the low 8 lanes are stored - confirm the callers' buffers allow that.
 */
static inline void DeblockChromaLt4VPlane_lsx (uint8_t* pPix, int32_t iStrideX,
                                               __m128i alpha, __m128i beta,
                                               __m128i iTc0, __m128i iTc0_l,
                                               __m128i negiTc0_l) {
  __m128i p0, p1, q0, q1, t0, t1, tp;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i flags, flag, iDeta_l;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
  int32_t iStrideX_x0 = 0;
  int32_t iStrideX_x2 = iStrideX << 1;

  // Load the four rows around the edge and widen the low 8 bytes to 16 bits
  DUP4_ARG2(__lsx_vldx, pPix, -iStrideX, pPix, -iStrideX_x2, pPix,
            iStrideX_x0, pPix, iStrideX, p0, p1, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask:
  // |p0 - q0| < alpha && |p1 - p0| < beta && |q1 - q0| < beta (unsigned bytes)
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  // The second AND consumes the result of the first, so flags ends up as the
  // conjunction of all three comparisons.
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags, flags, flags);

  // WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0)
  iDeta_l = __lsx_vsub_h(q0_l, p0_l);
  iDeta_l = __lsx_vslli_h(iDeta_l, 2);
  iDeta_l = __lsx_vadd_h(iDeta_l, p1_l);
  iDeta_l = __lsx_vsub_h(iDeta_l, q1_l);
  iDeta_l = __lsx_vaddi_hu(iDeta_l, 4);
  iDeta_l = __lsx_vsrai_h(iDeta_l, 3);
  iDeta_l = __lsx_vmin_h(iTc0_l, iDeta_l);
  iDeta_l = __lsx_vmax_h(negiTc0_l, iDeta_l);

  // WelsClip1 (p0 + iDeta): lanes with any of the high 8 bits set are out of
  // [0, 255]; they become 0 when negative and 0xffff (low byte 255) otherwise.
  p0_l = __lsx_vadd_h(p0_l, iDeta_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flag = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  p2_l = __lsx_vand_v(p2_l, flag);
  p0_l = __lsx_vadd_h(p0_l, p2_l);

  // WelsClip1 (q0 - iDeta), same saturation scheme
  q0_l = __lsx_vsub_h(q0_l, iDeta_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flag = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  q2_l = __lsx_vand_v(q2_l, flag);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  // Narrow back to bytes (low byte of every 16-bit lane)
  DUP2_ARG2(__lsx_vpickev_b, zero, p0_l, zero, q0_l, t0, t1);

  // Keep the filtered value only where tc >= 0 and the edge condition holds;
  // every other lane keeps its original pixel.
  flag = __lsx_vsle_b(zero, iTc0);
  flag = __lsx_vand_v(flag, flags);
  tp = __lsx_vnor_v(flag, flag);
  t0 = __lsx_vand_v(t0, flag);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flag);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);

  // Store the low 8 bytes of the p0 and q0 rows
  __lsx_vstelm_d(p0, pPix - iStrideX, 0, 0);
  __lsx_vstelm_d(q0, pPix, 0, 0);
}

/*!
 * \brief Chroma deblocking with tc clipping across a horizontal edge
 *        (samples on either side of the edge are iStrideX bytes apart).
 *
 * Filters 8 pixels of the Cb plane and then 8 pixels of the Cr plane. For each
 * pixel, p0 and q0 are replaced by WelsClip1 (p0 + delta) and
 * WelsClip1 (q0 - delta) when the alpha/beta conditions hold and tc >= 0.
 *
 * \param pPixCb    pointer to the q0 row of the Cb plane
 * \param pPixCr    pointer to the q0 row of the Cr plane
 * \param iStrideX  byte offset between consecutive rows
 * \param iAlpha    threshold for |p0 - q0|; replicated as a byte per lane
 * \param iBeta     threshold for |p1 - p0| and |q1 - q0|; replicated as a byte
 * \param pTc       signed tc values; bytes 0..3 are used, each one for two
 *                  adjacent pixels
 *
 * NOTE(review): pTc is read with a full vector load, presumably 16 bytes even
 * though only four are used - confirm the callers' array allows that.
 */
void DeblockChromaLt4V_lsx (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX,
                            int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
  __m128i iTc0, negiTc0, iTc0_l, negiTc0_l, flag;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  // Byte indices 0,0,1,1,2,2,3,3: every tc value covers two adjacent pixels
  __m128i shuf = {0x0303020201010000, 0x0};

  iTc0 = __lsx_vldx(pTc, 0);
  iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
  negiTc0 = __lsx_vneg_b(iTc0);
  // Sign-extend tc and -tc to 16 bits by interleaving with their sign masks
  flag = __lsx_vslt_b(iTc0, zero);
  iTc0_l = __lsx_vilvl_b(flag, iTc0);
  flag = __lsx_vslt_b(negiTc0, zero);
  negiTc0_l = __lsx_vilvl_b(flag, negiTc0);

  // Cb is fully processed and stored before Cr is loaded
  DeblockChromaLt4VPlane_lsx (pPixCb, iStrideX, alpha, beta,
                              iTc0, iTc0_l, negiTc0_l);
  DeblockChromaLt4VPlane_lsx (pPixCr, iStrideX, alpha, beta,
                              iTc0, iTc0_l, negiTc0_l);
}
/*!
 * \brief Filter one chroma plane for DeblockChromaLt4H_lsx.
 *
 * Loads 8 rows starting at pPix - 2, transposes them so that the four columns
 * p1, p0, q0, q1 each sit in one vector, filters p0/q0 and writes the
 * (p0, q0) byte pair of every row back to pPix - 1.
 *
 * \param pPix       pointer to the q0 sample of the first row
 * \param iStrideY   byte offset between consecutive rows
 * \param alpha      per-byte threshold for |p0 - q0|
 * \param beta       per-byte threshold for |p1 - p0| and |q1 - q0|
 * \param iTc0       per-row tc bytes (signed)
 * \param iTc0_l     low 8 lanes of iTc0 sign-extended to 16 bits
 * \param negiTc0_l  low 8 lanes of -iTc0 sign-extended to 16 bits
 *
 * NOTE(review): the vector loads presumably fetch 16 bytes per row although
 * only 4 are used - confirm the callers' buffers allow that.
 */
static inline void DeblockChromaLt4HPlane_lsx (uint8_t* pPix, int32_t iStrideY,
                                               __m128i alpha, __m128i beta,
                                               __m128i iTc0, __m128i iTc0_l,
                                               __m128i negiTc0_l) {
  __m128i p0, p1, q0, q1, t0, t1, t2, t3, tp;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i flags, flag, iDeta_l;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
  int32_t iStrideY_x0 = 0;
  int32_t iStrideY_x2 = iStrideY << 1;
  int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
  int32_t iStrideY_x4 = iStrideY << 2;

  // Load rows 0..3 and 4..7, starting two samples left of the edge
  pPix -= 2;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix,
            iStrideY_x2, pPix, iStrideY_x3, p1, p0, q0, q1);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix,
            iStrideY_x2, pPix, iStrideY_x3, t0, t1, t2, t3);
  // Turn the 8 rows x 4 columns into one vector per column: p1, p0, q0, q1
  LSX_TRANSPOSE8x4_B(p1, p0, q0, q1, t0, t1, t2, t3, p1, p0, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask:
  // |p0 - q0| < alpha && |p1 - p0| < beta && |q1 - q0| < beta (unsigned bytes)
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  // The second AND consumes the result of the first, so flags ends up as the
  // conjunction of all three comparisons.
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags, flags, flags);

  // WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0)
  iDeta_l = __lsx_vsub_h(q0_l, p0_l);
  iDeta_l = __lsx_vslli_h(iDeta_l, 2);
  iDeta_l = __lsx_vadd_h(iDeta_l, p1_l);
  iDeta_l = __lsx_vsub_h(iDeta_l, q1_l);
  iDeta_l = __lsx_vaddi_hu(iDeta_l, 4);
  iDeta_l = __lsx_vsrai_h(iDeta_l, 3);
  iDeta_l = __lsx_vmin_h(iTc0_l, iDeta_l);
  iDeta_l = __lsx_vmax_h(negiTc0_l, iDeta_l);

  // WelsClip1 (p0 + iDeta): lanes with any of the high 8 bits set are out of
  // [0, 255]; they become 0 when negative and 0xffff (low byte 255) otherwise.
  p0_l = __lsx_vadd_h(p0_l, iDeta_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flag = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  p2_l = __lsx_vand_v(p2_l, flag);
  p0_l = __lsx_vadd_h(p0_l, p2_l);

  // WelsClip1 (q0 - iDeta), same saturation scheme
  q0_l = __lsx_vsub_h(q0_l, iDeta_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flag = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  q2_l = __lsx_vand_v(q2_l, flag);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  // Narrow back to bytes (low byte of every 16-bit lane)
  DUP2_ARG2(__lsx_vpickev_b, zero, p0_l, zero, q0_l, t0, t1);

  // Keep the filtered value only where tc >= 0 and the edge condition holds;
  // every other lane keeps its original pixel.
  flag = __lsx_vsle_b(zero, iTc0);
  flag = __lsx_vand_v(flag, flags);
  tp = __lsx_vnor_v(flag, flag);
  t0 = __lsx_vand_v(t0, flag);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flag);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);

  // Pair p0 and q0 of each row into one 16-bit lane (p0 in the low byte)
  p0 = __lsx_vilvl_b(q0, p0);

  // Rewind to row 0 and step one sample right, i.e. onto the p0 column
  pPix -= iStrideY_x4 - 1;
  __lsx_vstelm_h(p0, pPix, 0, 0);
  __lsx_vstelm_h(p0, pPix + iStrideY, 0, 1);
  __lsx_vstelm_h(p0, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(p0, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_h(p0, pPix, 0, 4);
  __lsx_vstelm_h(p0, pPix + iStrideY, 0, 5);
  __lsx_vstelm_h(p0, pPix + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(p0, pPix + iStrideY_x3, 0, 7);
}

/*!
 * \brief Chroma deblocking with tc clipping across a vertical edge
 *        (p1, p0, q0, q1 are adjacent bytes within each row).
 *
 * Filters 8 rows of the Cb plane and then 8 rows of the Cr plane. For each
 * row, p0 and q0 are replaced by WelsClip1 (p0 + delta) and
 * WelsClip1 (q0 - delta) when the alpha/beta conditions hold and tc >= 0.
 *
 * \param pPixCb    pointer to the q0 sample of the first Cb row
 * \param pPixCr    pointer to the q0 sample of the first Cr row
 * \param iStrideY  byte offset between consecutive rows
 * \param iAlpha    threshold for |p0 - q0|; replicated as a byte per lane
 * \param iBeta     threshold for |p1 - p0| and |q1 - q0|; replicated as a byte
 * \param pTc       signed tc values; bytes 0..3 are used, each one for two
 *                  adjacent rows
 *
 * NOTE(review): pTc is read with a full vector load, presumably 16 bytes even
 * though only four are used - confirm the callers' array allows that.
 */
void DeblockChromaLt4H_lsx (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideY,
                            int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
  __m128i iTc0, negiTc0, iTc0_l, negiTc0_l, flag;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  // Byte indices 0,0,1,1,2,2,3,3: every tc value covers two adjacent rows
  __m128i shuf = {0x0303020201010000, 0x0};

  iTc0 = __lsx_vldx(pTc, 0);
  iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
  negiTc0 = __lsx_vneg_b(iTc0);
  // Sign-extend tc and -tc to 16 bits by interleaving with their sign masks
  flag = __lsx_vslt_b(iTc0, zero);
  iTc0_l = __lsx_vilvl_b(flag, iTc0);
  flag = __lsx_vslt_b(negiTc0, zero);
  negiTc0_l = __lsx_vilvl_b(flag, negiTc0);

  // Cb is fully processed and stored before Cr is loaded
  DeblockChromaLt4HPlane_lsx (pPixCb, iStrideY, alpha, beta,
                              iTc0, iTc0_l, negiTc0_l);
  DeblockChromaLt4HPlane_lsx (pPixCr, iStrideY, alpha, beta,
                              iTc0, iTc0_l, negiTc0_l);
}
/*!
 * \brief Filter one chroma plane for DeblockChromaEq4H_lsx.
 *
 * Loads 8 rows starting at pPix - 2, transposes them so that the four columns
 * p1, p0, q0, q1 each sit in one vector, computes the averaged p0/q0 and
 * writes the (p0, q0) byte pair of every row back to pPix - 1.
 *
 * \param pPix      pointer to the q0 sample of the first row
 * \param iStrideY  byte offset between consecutive rows
 * \param alpha     per-byte threshold for |p0 - q0|
 * \param beta      per-byte threshold for |p1 - p0| and |q1 - q0|
 *
 * NOTE(review): the vector loads presumably fetch 16 bytes per row although
 * only 4 are used - confirm the callers' buffers allow that.
 */
static inline void DeblockChromaEq4HPlane_lsx (uint8_t* pPix, int32_t iStrideY,
                                               __m128i alpha, __m128i beta) {
  __m128i p0, p1, q0, q1, t0, t1, t2, t3, tp;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags;
  __m128i zero = __lsx_vldi(0);
  int32_t iStrideY_x0 = 0;
  int32_t iStrideY_x2 = iStrideY << 1;
  int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
  int32_t iStrideY_x4 = iStrideY << 2;

  // Load rows 0..3 and 4..7, starting two samples left of the edge
  pPix -= 2;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix,
            iStrideY_x2, pPix, iStrideY_x3, p1, p0, q0, q1);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix,
            iStrideY_x2, pPix, iStrideY_x3, t0, t1, t2, t3);
  // Turn the 8 rows x 4 columns into one vector per column: p1, p0, q0, q1
  LSX_TRANSPOSE8x4_B(p1, p0, q0, q1, t0, t1, t2, t3, p1, p0, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask:
  // |p0 - q0| < alpha && |p1 - p0| < beta && |q1 - q0| < beta (unsigned bytes)
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  // The second AND consumes the result of the first, so flags ends up as the
  // conjunction of all three comparisons.
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags, flags, flags);

  // ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
  p2_l = __lsx_vslli_h(p1_l, 1);
  p2_l = __lsx_vadd_h(p2_l, p0_l);
  p2_l = __lsx_vadd_h(p2_l, q1_l);
  p2_l = __lsx_vaddi_hu(p2_l, 2);
  p2_l = __lsx_vsrai_h(p2_l, 2);

  // ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
  q2_l = __lsx_vslli_h(q1_l, 1);
  q2_l = __lsx_vadd_h(q2_l, q0_l);
  q2_l = __lsx_vadd_h(q2_l, p1_l);
  q2_l = __lsx_vaddi_hu(q2_l, 2);
  q2_l = __lsx_vsrai_h(q2_l, 2);

  // Narrow back to bytes (low byte of every 16-bit lane)
  DUP2_ARG2(__lsx_vpickev_b, zero, p2_l, zero, q2_l, t0, t1);

  // Take the filtered value where the edge condition holds, else the original
  tp = __lsx_vnor_v(flags, flags);
  t0 = __lsx_vand_v(t0, flags);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flags);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);

  // Pair p0 and q0 of each row into one 16-bit lane (p0 in the low byte)
  p0 = __lsx_vilvl_b(q0, p0);

  // Rewind to row 0 and step one sample right, i.e. onto the p0 column
  pPix -= iStrideY_x4 - 1;
  __lsx_vstelm_h(p0, pPix, 0, 0);
  __lsx_vstelm_h(p0, pPix + iStrideY, 0, 1);
  __lsx_vstelm_h(p0, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(p0, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_h(p0, pPix, 0, 4);
  __lsx_vstelm_h(p0, pPix + iStrideY, 0, 5);
  __lsx_vstelm_h(p0, pPix + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(p0, pPix + iStrideY_x3, 0, 7);
}

/*!
 * \brief Chroma deblocking without tc clipping across a vertical edge
 *        (p1, p0, q0, q1 are adjacent bytes within each row).
 *
 * Filters 8 rows of the Cb plane and then 8 rows of the Cr plane. Where the
 * alpha/beta conditions hold, p0 becomes (2*p1 + p0 + q1 + 2) >> 2 and q0
 * becomes (2*q1 + q0 + p1 + 2) >> 2; other rows are written back unchanged.
 *
 * \param pPixCb    pointer to the q0 sample of the first Cb row
 * \param pPixCr    pointer to the q0 sample of the first Cr row
 * \param iStrideY  byte offset between consecutive rows
 * \param iAlpha    threshold for |p0 - q0|; replicated as a byte per lane
 * \param iBeta     threshold for |p1 - p0| and |q1 - q0|; replicated as a byte
 */
void DeblockChromaEq4H_lsx (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideY,
                            int32_t iAlpha, int32_t iBeta) {
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);

  // Cb is fully processed and stored before Cr is loaded
  DeblockChromaEq4HPlane_lsx (pPixCb, iStrideY, alpha, beta);
  DeblockChromaEq4HPlane_lsx (pPixCr, iStrideY, alpha, beta);
}