blob: e1bdb6d636e15850d487b769b36304b3d1cbbd0d [file] [log] [blame]
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include "ls_defines.h"
#include "encode_mb_aux.h"
#include "cpu_core.h"
namespace WelsEnc {
/*
 * Rounding-offset table: 58 rows of 8 int16_t values each.
 *
 * Each row holds two groups of four values. The 8-entry row width matches the
 * `i & 0x07` indexing that the WelsQuant* routines in this file apply to
 * their pFF argument (presumably a row of this table is what callers pass as
 * pFF -- confirm at the call sites).
 *
 * Rows are labelled 0..51 and are followed by six extra rows marked as
 * intra-only. The third ALIGNED_DECLARE argument (16) is presumably the
 * requested byte alignment -- see the macro definition.
 */
ALIGNED_DECLARE (const int16_t, g_kiQuantInterFF[58][8], 16) = {
/* 0*/ { 0, 1, 0, 1, 1, 1, 1, 1 },
/* 1*/ { 0, 1, 0, 1, 1, 1, 1, 1 },
/* 2*/ { 1, 1, 1, 1, 1, 1, 1, 1 },
/* 3*/ { 1, 1, 1, 1, 1, 1, 1, 1 },
/* 4*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
/* 5*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
/* 6*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
/* 7*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
/* 8*/ { 1, 2, 1, 2, 2, 3, 2, 3 },
/* 9*/ { 1, 2, 1, 2, 2, 3, 2, 3 },
/*10*/ { 1, 2, 1, 2, 2, 3, 2, 3 },
/*11*/ { 1, 2, 1, 2, 2, 4, 2, 4 },
/*12*/ { 2, 3, 2, 3, 3, 4, 3, 4 },
/*13*/ { 2, 3, 2, 3, 3, 5, 3, 5 },
/*14*/ { 2, 3, 2, 3, 3, 5, 3, 5 },
/*15*/ { 2, 4, 2, 4, 4, 6, 4, 6 },
/*16*/ { 3, 4, 3, 4, 4, 7, 4, 7 },
/*17*/ { 3, 5, 3, 5, 5, 8, 5, 8 },
/*18*/ { 3, 5, 3, 5, 5, 8, 5, 8 },
/*19*/ { 4, 6, 4, 6, 6, 9, 6, 9 },
/*20*/ { 4, 7, 4, 7, 7, 10, 7, 10 },
/*21*/ { 5, 8, 5, 8, 8, 12, 8, 12 },
/*22*/ { 5, 8, 5, 8, 8, 13, 8, 13 },
/*23*/ { 6, 10, 6, 10, 10, 15, 10, 15 },
/*24*/ { 7, 11, 7, 11, 11, 17, 11, 17 },
/*25*/ { 7, 12, 7, 12, 12, 19, 12, 19 },
/*26*/ { 9, 13, 9, 13, 13, 21, 13, 21 },
/*27*/ { 9, 15, 9, 15, 15, 24, 15, 24 },
/*28*/ { 11, 17, 11, 17, 17, 26, 17, 26 },
/*29*/ { 12, 19, 12, 19, 19, 30, 19, 30 },
/*30*/ { 13, 22, 13, 22, 22, 33, 22, 33 },
/*31*/ { 15, 23, 15, 23, 23, 38, 23, 38 },
/*32*/ { 17, 27, 17, 27, 27, 42, 27, 42 },
/*33*/ { 19, 30, 19, 30, 30, 48, 30, 48 },
/*34*/ { 21, 33, 21, 33, 33, 52, 33, 52 },
/*35*/ { 24, 38, 24, 38, 38, 60, 38, 60 },
/*36*/ { 27, 43, 27, 43, 43, 67, 43, 67 },
/*37*/ { 29, 47, 29, 47, 47, 75, 47, 75 },
/*38*/ { 35, 53, 35, 53, 53, 83, 53, 83 },
/*39*/ { 37, 60, 37, 60, 60, 96, 60, 96 },
/*40*/ { 43, 67, 43, 67, 67, 104, 67, 104 },
/*41*/ { 48, 77, 48, 77, 77, 121, 77, 121 },
/*42*/ { 53, 87, 53, 87, 87, 133, 87, 133 },
/*43*/ { 59, 93, 59, 93, 93, 150, 93, 150 },
/*44*/ { 69, 107, 69, 107, 107, 167, 107, 167 },
/*45*/ { 75, 120, 75, 120, 120, 192, 120, 192 },
/*46*/ { 85, 133, 85, 133, 133, 208, 133, 208 },
/*47*/ { 96, 153, 96, 153, 153, 242, 153, 242 },
/*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
/*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
/*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
/*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
/* from here below is only for intra */
/* NOTE(review): the six rows below are array rows 52..57, yet their labels
 * restart at 46..51. Presumably the labels are the index seen when the table
 * is addressed through a 6-row offset for intra -- confirm against the users
 * of this table before relabelling. */
/*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
/*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
/*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
/*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
/*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
/*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },
};
/*
 * Multiplier table: 52 rows (labelled 0..51) of 8 int16_t values each.
 *
 * Each row holds two groups of four values, matching the `i & 0x07` indexing
 * that the WelsQuant* routines in this file apply to their pMF argument
 * (presumably a row of this table is what callers pass as pMF -- confirm at
 * the call sites). The quant macros multiply by the selected entry and then
 * shift right by 16.
 *
 * The entries roughly halve every six rows (e.g. 26214 in row 0, 13107 in
 * row 6, 6554 in row 12).
 */
ALIGNED_DECLARE (const int16_t, g_kiQuantMF[52][8], 16) = {
/* 0*/ {26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 },
/* 1*/ {23832, 14980, 23832, 14980, 14980, 9320, 14980, 9320 },
/* 2*/ {20164, 13108, 20164, 13108, 13108, 8388, 13108, 8388 },
/* 3*/ {18724, 11650, 18724, 11650, 11650, 7294, 11650, 7294 },
/* 4*/ {16384, 10486, 16384, 10486, 10486, 6710, 10486, 6710 },
/* 5*/ {14564, 9118, 14564, 9118, 9118, 5786, 9118, 5786 },
/* 6*/ {13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243 },
/* 7*/ {11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660 },
/* 8*/ {10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194 },
/* 9*/ { 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647 },
/*10*/ { 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355 },
/*11*/ { 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893 },
/*12*/ { 6554, 4033, 6554, 4033, 4033, 2622, 4033, 2622 },
/*13*/ { 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330 },
/*14*/ { 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097 },
/*15*/ { 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824 },
/*16*/ { 4096, 2622, 4096, 2622, 2622, 1678, 2622, 1678 },
/*17*/ { 3641, 2280, 3641, 2280, 2280, 1447, 2280, 1447 },
/*18*/ { 3277, 2017, 3277, 2017, 2017, 1311, 2017, 1311 },
/*19*/ { 2979, 1873, 2979, 1873, 1873, 1165, 1873, 1165 },
/*20*/ { 2521, 1639, 2521, 1639, 1639, 1049, 1639, 1049 },
/*21*/ { 2341, 1456, 2341, 1456, 1456, 912, 1456, 912 },
/*22*/ { 2048, 1311, 2048, 1311, 1311, 839, 1311, 839 },
/*23*/ { 1821, 1140, 1821, 1140, 1140, 723, 1140, 723 },
/*24*/ { 1638, 1008, 1638, 1008, 1008, 655, 1008, 655 },
/*25*/ { 1490, 936, 1490, 936, 936, 583, 936, 583 },
/*26*/ { 1260, 819, 1260, 819, 819, 524, 819, 524 },
/*27*/ { 1170, 728, 1170, 728, 728, 456, 728, 456 },
/*28*/ { 1024, 655, 1024, 655, 655, 419, 655, 419 },
/*29*/ { 910, 570, 910, 570, 570, 362, 570, 362 },
/*30*/ { 819, 504, 819, 504, 504, 328, 504, 328 },
/*31*/ { 745, 468, 745, 468, 468, 291, 468, 291 },
/*32*/ { 630, 410, 630, 410, 410, 262, 410, 262 },
/*33*/ { 585, 364, 585, 364, 364, 228, 364, 228 },
/*34*/ { 512, 328, 512, 328, 328, 210, 328, 210 },
/*35*/ { 455, 285, 455, 285, 285, 181, 285, 181 },
/*36*/ { 410, 252, 410, 252, 252, 164, 252, 164 },
/*37*/ { 372, 234, 372, 234, 234, 146, 234, 146 },
/*38*/ { 315, 205, 315, 205, 205, 131, 205, 131 },
/*39*/ { 293, 182, 293, 182, 182, 114, 182, 114 },
/*40*/ { 256, 164, 256, 164, 164, 105, 164, 105 },
/*41*/ { 228, 142, 228, 142, 142, 90, 142, 90 },
/*42*/ { 205, 126, 205, 126, 126, 82, 126, 82 },
/*43*/ { 186, 117, 186, 117, 117, 73, 117, 73 },
/*44*/ { 158, 102, 158, 102, 102, 66, 102, 66 },
/*45*/ { 146, 91, 146, 91, 91, 57, 91, 57 },
/*46*/ { 128, 82, 128, 82, 82, 52, 82, 52 },
/*47*/ { 114, 71, 114, 71, 71, 45, 71, 45 },
/*48*/ { 102, 63, 102, 63, 63, 41, 63, 41 },
/*49*/ { 93, 59, 93, 59, 59, 36, 59, 36 },
/*50*/ { 79, 51, 79, 51, 51, 33, 51, 33 },
/*51*/ { 73, 46, 73, 46, 46, 28, 46, 28 }
};
/****************************************************************************
* HDM and Quant functions
****************************************************************************/
/*
 * Quantization helper macros.
 *
 * All three implicitly read a variable named `iSign` that must be in scope at
 * the point of use. Callers set it to 0 or -1 for the coefficient being
 * processed before invoking the macro.
 *
 * WELS_ABS_LC(a)            : conditional negate. With iSign == 0 it yields a;
 *                             with iSign == -1 it yields -a (two's complement:
 *                             (~a) + 1).
 * NEW_QUANT(c, ff, mf)      : ((ff + WELS_ABS_LC(c)) * mf) >> 16. The whole
 *                             expansion is parenthesized so it can be embedded
 *                             in a larger expression without `>>` capturing
 *                             neighbouring operands.
 * WELS_NEW_QUANT(c, ff, mf) : NEW_QUANT with the conditional negate applied
 *                             again to the result (restores the sign).
 */
#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)
#define NEW_QUANT(pDct, iFF, iMF) ((((iFF)+ WELS_ABS_LC(pDct))*(iMF)) >>16)
#define WELS_NEW_QUANT(pDct,iFF,iMF) WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))
/*
 * Quantize the 16 coefficients of one 4x4 block in place.
 *
 * pDct : 16 coefficients, overwritten with the quantized levels.
 * pFF  : 8 rounding offsets; entry (index & 7) is used for coefficient index.
 * pMF  : 8 multipliers, indexed the same way as pFF.
 *
 * Each coefficient goes through WELS_NEW_QUANT, i.e. ((|c| + ff) * mf) >> 16
 * with the sign put back (assuming WELS_SIGN yields 0 / -1 -- see its
 * definition).
 */
void WelsQuant4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
  int32_t iSign;  /* must keep this name: the quant macros read it implicitly */
  int32_t iCoef;

  for (iCoef = 0; iCoef < 16; ++iCoef) {
    const int32_t kiTabIdx = iCoef & 0x07;  /* table repeats every two rows */
    iSign = WELS_SIGN (pDct[iCoef]);
    pDct[iCoef] = WELS_NEW_QUANT (pDct[iCoef], pFF[kiTabIdx], pMF[kiTabIdx]);
  }
}
/*
 * Quantize 16 coefficients in place with a single offset/multiplier pair.
 *
 * pDct : 16 coefficients, overwritten with the quantized levels.
 * iFF  : rounding offset applied to every coefficient.
 * iMF  : multiplier applied to every coefficient.
 */
void WelsQuant4x4Dc_c (int16_t* pDct, int16_t iFF, int16_t iMF) {
  int32_t iSign;  /* must keep this name: the quant macros read it implicitly */
  int16_t* pCoef = pDct;
  int16_t* const kpEnd = pDct + 16;

  while (pCoef < kpEnd) {
    iSign = WELS_SIGN (*pCoef);
    *pCoef = WELS_NEW_QUANT (*pCoef, iFF, iMF);
    ++pCoef;
  }
}
/*
 * Quantize four consecutive 4x4 blocks (64 coefficients) in place.
 *
 * pDct : 64 coefficients, overwritten with the quantized levels.
 * pFF  : 8 rounding offsets; entry (index & 7) is used for coefficient index.
 * pMF  : 8 multipliers, indexed the same way as pFF.
 */
void WelsQuantFour4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
  int32_t iSign;  /* must keep this name: the quant macros read it implicitly */
  int32_t iCoef;

  for (iCoef = 0; iCoef < 64; ++iCoef) {
    const int32_t kiTabIdx = iCoef & 0x07;  /* table repeats every 8 coefficients */
    iSign = WELS_SIGN (pDct[iCoef]);
    pDct[iCoef] = WELS_NEW_QUANT (pDct[iCoef], pFF[kiTabIdx], pMF[kiTabIdx]);
  }
}
/*
 * Quantize four consecutive 4x4 blocks in place and report a per-block peak.
 *
 * pDct : 64 coefficients (4 blocks of 16), overwritten with quantized levels.
 * pFF  : 8 rounding offsets; entry (index & 7) within each block is used.
 * pMF  : 8 multipliers, indexed the same way as pFF.
 * pMax : 4 outputs; pMax[k] is the largest NEW_QUANT result seen in block k
 *        (the level before its sign is put back), or 0 if none exceeded 0.
 */
void WelsQuantFour4x4Max_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax) {
  int32_t iSign;  /* must keep this name: the quant macros read it implicitly */
  int32_t iBlk, iCoef;

  for (iBlk = 0; iBlk < 4; ++iBlk) {
    int16_t iPeak = 0;
    for (iCoef = 0; iCoef < 16; ++iCoef) {
      const int32_t kiTabIdx = iCoef & 0x07;
      iSign = WELS_SIGN (pDct[iCoef]);
      /* first store the unsigned level so it can be compared against the peak */
      pDct[iCoef] = NEW_QUANT (pDct[iCoef], pFF[kiTabIdx], pMF[kiTabIdx]);
      if (pDct[iCoef] > iPeak) {
        iPeak = pDct[iCoef];
      }
      /* then re-apply the original sign */
      pDct[iCoef] = WELS_ABS_LC (pDct[iCoef]);
    }
    pMax[iBlk] = iPeak;
    pDct += 16;  /* advance to the next 4x4 block */
  }
}
/*
 * Test whether a 2x2 Hadamard of four coefficients would exceed a threshold.
 *
 * pRs : source buffer; only pRs[0], pRs[16], pRs[32] and pRs[48] are read.
 * iFF : offset subtracted when deriving the threshold.
 * iMF : divisor used when deriving the threshold; must be non-zero.
 *
 * The threshold is ((1 << 16) - 1) / iMF - iFF, stored as int16_t.
 * Returns 1 if the absolute value of any of the four Hadamard outputs is
 * greater than the threshold, otherwise 0. pRs is not modified.
 */
int32_t WelsHadamardQuant2x2Skip_c (int16_t* pRs, int16_t iFF, int16_t iMF) {
  const int16_t kiThreshold = ((1 << 16) - 1) / iMF - iFF;
  const int16_t kiSumAC  = pRs[0] + pRs[32];
  const int16_t kiDiffAC = pRs[0] - pRs[32];
  const int16_t kiSumBD  = pRs[16] + pRs[48];
  const int16_t kiDiffBD = pRs[16] - pRs[48];
  int16_t iOut[4];
  int32_t iIdx;

  iOut[0] = kiSumAC + kiSumBD;
  iOut[1] = kiSumAC - kiSumBD;
  iOut[2] = kiDiffAC + kiDiffBD;
  iOut[3] = kiDiffAC - kiDiffBD;

  for (iIdx = 0; iIdx < 4; ++iIdx) {
    if (WELS_ABS (iOut[iIdx]) > kiThreshold) {
      return 1;
    }
  }
  return 0;
}
/*
 * 2x2 Hadamard transform plus quantization of four coefficients.
 *
 * pRs    : source buffer; pRs[0], pRs[16], pRs[32] and pRs[48] are read and
 *          then set to 0.
 * iFF    : rounding offset for quantization.
 * iMF    : multiplier for quantization.
 * pDct   : receives the 4 quantized Hadamard outputs.
 * pBlock : receives a copy of pDct[0..3] via ST64/LD64 (presumably a 64-bit
 *          load/store -- see the macro definitions).
 *
 * Returns the number of non-zero entries among pBlock[0..3].
 */
int32_t WelsHadamardQuant2x2_c (int16_t* pRs, const int16_t iFF, int16_t iMF, int16_t* pDct, int16_t* pBlock) {
  int32_t iSign;  /* must keep this name: the quant macros read it implicitly */
  int32_t iIdx;
  int32_t iNonZero = 0;
  const int16_t kiSumAC  = pRs[0] + pRs[32];
  const int16_t kiDiffAC = pRs[0] - pRs[32];
  const int16_t kiSumBD  = pRs[16] + pRs[48];
  const int16_t kiDiffBD = pRs[16] - pRs[48];

  /* the four inputs have been consumed; clear them in the source buffer */
  for (iIdx = 0; iIdx < 64; iIdx += 16) {
    pRs[iIdx] = 0;
  }

  pDct[0] = kiSumAC + kiSumBD;
  pDct[1] = kiSumAC - kiSumBD;
  pDct[2] = kiDiffAC + kiDiffBD;
  pDct[3] = kiDiffAC - kiDiffBD;

  for (iIdx = 0; iIdx < 4; ++iIdx) {
    iSign = WELS_SIGN (pDct[iIdx]);
    pDct[iIdx] = WELS_NEW_QUANT (pDct[iIdx], iFF, iMF);
  }

  ST64 (pBlock, LD64 (pDct));

  for (iIdx = 0; iIdx < 4; ++iIdx) {
    if (pBlock[iIdx] != 0) {
      ++iNonZero;
    }
  }
  return iNonZero;
}
/*
 * Pick up 16 values from pDct and apply a 4x4 Hadamard transform to them.
 *
 * pLumaDc : receives the 16 transformed values. Each is rounded with
 *           (x + 1) >> 1 and clipped to [-32768, 32767].
 * pDct    : source; the 16 entries at offsets 0, 16, 32, ..., 240 are read
 *           (presumably the first coefficient of each of sixteen 16-entry
 *           blocks). It is not written.
 */
void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct) {
  /* start offset in pDct for each of the four first-stage rows */
  static const int32_t kiRowBase[4] = { 0, 32, 128, 160 };
  int32_t iStage1[16];
  int32_t iRow, iCol;

  /* first stage: butterfly over the values at base, base+16, base+64, base+80 */
  for (iRow = 0; iRow < 4; ++iRow) {
    const int16_t* pSrc = pDct + kiRowBase[iRow];
    int32_t* pOut = iStage1 + (iRow << 2);
    const int32_t kiSumOuter  = pSrc[0] + pSrc[80];
    const int32_t kiDiffOuter = pSrc[0] - pSrc[80];
    const int32_t kiSumInner  = pSrc[16] + pSrc[64];
    const int32_t kiDiffInner = pSrc[16] - pSrc[64];

    pOut[0] = kiSumOuter + kiSumInner;
    pOut[2] = kiSumOuter - kiSumInner;
    pOut[1] = kiDiffOuter + kiDiffInner;
    pOut[3] = kiDiffOuter - kiDiffInner;
  }

  /* second stage: butterfly down each column, then round and clip */
  for (iCol = 0; iCol < 4; ++iCol) {
    const int32_t kiSumOuter  = iStage1[iCol] + iStage1[iCol + 12];
    const int32_t kiDiffOuter = iStage1[iCol] - iStage1[iCol + 12];
    const int32_t kiSumInner  = iStage1[iCol + 4] + iStage1[iCol + 8];
    const int32_t kiDiffInner = iStage1[iCol + 4] - iStage1[iCol + 8];
    const int32_t kiOut0  = (kiSumOuter + kiSumInner + 1) >> 1;
    const int32_t kiOut8  = (kiSumOuter - kiSumInner + 1) >> 1;
    const int32_t kiOut4  = (kiDiffOuter + kiDiffInner + 1) >> 1;
    const int32_t kiOut12 = (kiDiffOuter - kiDiffInner + 1) >> 1;

    pLumaDc[iCol]      = WELS_CLIP3 (kiOut0, -32768, 32767);
    pLumaDc[iCol + 8]  = WELS_CLIP3 (kiOut8, -32768, 32767);
    pLumaDc[iCol + 4]  = WELS_CLIP3 (kiOut4, -32768, 32767);
    pLumaDc[iCol + 12] = WELS_CLIP3 (kiOut12, -32768, 32767);
  }
}
/****************************************************************************
* DCT functions
****************************************************************************/
/*
 * 4x4 integer transform of the difference between two pixel blocks.
 *
 * pDct     : receives 16 coefficients in row-major order.
 * pPixel1  : top-left of the first 4x4 pixel block.
 * iStride1 : distance in bytes between rows of pPixel1.
 * pPixel2  : top-left of the second 4x4 pixel block (subtracted from the first).
 * iStride2 : distance in bytes between rows of pPixel2.
 *
 * The residual pPixel1 - pPixel2 is run through a butterfly with basis rows
 * (1,1,1,1) / (2,1,-1,-2) / (1,-1,-1,1) / (1,-2,2,-1), first along each row
 * and then along each column.
 */
void WelsDctT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
  int32_t iRow, iCol;

  /* horizontal pass: residual of one pixel row -> one row of pDct */
  for (iRow = 0; iRow < 4; ++iRow) {
    int16_t* pOut = pDct + (iRow << 2);
    const int32_t kiR0 = pPixel1[0] - pPixel2[0];
    const int32_t kiR1 = pPixel1[1] - pPixel2[1];
    const int32_t kiR2 = pPixel1[2] - pPixel2[2];
    const int32_t kiR3 = pPixel1[3] - pPixel2[3];
    const int32_t kiSumOuter  = kiR0 + kiR3;
    const int32_t kiDiffOuter = kiR0 - kiR3;
    const int32_t kiSumInner  = kiR1 + kiR2;
    const int32_t kiDiffInner = kiR1 - kiR2;

    pOut[0] = (int16_t) (kiSumOuter + kiSumInner);
    pOut[2] = (int16_t) (kiSumOuter - kiSumInner);
    pOut[1] = (int16_t) (kiDiffOuter * 2 + kiDiffInner);
    pOut[3] = (int16_t) (kiDiffOuter - kiDiffInner * 2);

    pPixel1 += iStride1;
    pPixel2 += iStride2;
  }

  /* vertical pass: same butterfly applied in place down each column */
  for (iCol = 0; iCol < 4; ++iCol) {
    int16_t* pColumn = pDct + iCol;
    const int32_t kiSumOuter  = pColumn[0] + pColumn[12];
    const int32_t kiDiffOuter = pColumn[0] - pColumn[12];
    const int32_t kiSumInner  = pColumn[4] + pColumn[8];
    const int32_t kiDiffInner = pColumn[4] - pColumn[8];

    pColumn[0]  = (int16_t) (kiSumOuter + kiSumInner);
    pColumn[8]  = (int16_t) (kiSumOuter - kiSumInner);
    pColumn[4]  = (int16_t) (kiDiffOuter * 2 + kiDiffInner);
    pColumn[12] = (int16_t) (kiDiffOuter - kiDiffInner * 2);
  }
}
/*
 * Transform an 8x8 pixel difference as four 4x4 blocks.
 *
 * pDct receives 64 coefficients: top-left block at [0..15], top-right at
 * [16..31], bottom-left at [32..47] and bottom-right at [48..63]. Each block
 * is produced by WelsDctT4_c on the matching 4x4 region of pPixel1/pPixel2.
 */
void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
  /* byte offset of the fifth pixel row in each source */
  const int32_t kiFourRows1 = iStride1 << 2;
  const int32_t kiFourRows2 = iStride2 << 2;
  uint8_t* pLower1 = pPixel1 + kiFourRows1;
  uint8_t* pLower2 = pPixel2 + kiFourRows2;

  WelsDctT4_c (pDct,      pPixel1,     iStride1, pPixel2,     iStride2);
  WelsDctT4_c (pDct + 16, pPixel1 + 4, iStride1, pPixel2 + 4, iStride2);
  WelsDctT4_c (pDct + 32, pLower1,     iStride1, pLower2,     iStride2);
  WelsDctT4_c (pDct + 48, pLower1 + 4, iStride1, pLower2 + 4, iStride2);
}
/****************************************************************************
* Scan and Score functions
****************************************************************************/
/*
 * Reorder all 16 coefficients of a 4x4 block from pDct into pLevel.
 *
 * Source index order written to pLevel[0..15]:
 *   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 * Adjacent pairs are moved with ST32/LD32 (presumably a 32-bit copy of two
 * int16_t values -- see the macro definitions).
 */
void WelsScan4x4DcAc_c (int16_t* pLevel, int16_t* pDct) {
  ST32 (&pLevel[0], LD32 (&pDct[0]));    /* level 0,1   <- dct 0,1   */
  pLevel[2] = pDct[4];
  pLevel[3] = pDct[8];
  pLevel[4] = pDct[5];
  ST32 (&pLevel[5], LD32 (&pDct[2]));    /* level 5,6   <- dct 2,3   */
  pLevel[7] = pDct[6];
  pLevel[8] = pDct[9];
  ST32 (&pLevel[9], LD32 (&pDct[12]));   /* level 9,10  <- dct 12,13 */
  pLevel[11] = pDct[10];
  pLevel[12] = pDct[7];
  pLevel[13] = pDct[11];
  ST32 (&pLevel[14], LD32 (&pDct[14]));  /* level 14,15 <- dct 14,15 */
}
/*
 * Reorder the 15 coefficients pDct[1..15] of a 4x4 block into pLevel.
 *
 * pDct[0] is skipped. Source index order written to pLevel[0..14]:
 *   1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 * pLevel[15] is set to 0. Adjacent pairs are moved with ST32/LD32
 * (presumably a 32-bit copy of two int16_t values -- see the macro
 * definitions).
 */
void WelsScan4x4Ac_c (int16_t* pLevel, int16_t* pDct) {
  pLevel[0] = pDct[1];
  pLevel[1] = pDct[4];
  pLevel[2] = pDct[8];
  pLevel[3] = pDct[5];
  ST32 (pLevel + 4, LD32 (pDct + 2));    /* level 4,5   <- dct 2,3   */
  pLevel[6] = pDct[6];
  pLevel[7] = pDct[9];
  ST32 (pLevel + 8, LD32 (pDct + 12));   /* level 8,9   <- dct 12,13 */
  pLevel[10] = pDct[10];
  pLevel[11] = pDct[7];
  pLevel[12] = pDct[11];
  ST32 (pLevel + 13, LD32 (pDct + 14));  /* level 13,14 <- dct 14,15 */
  pLevel[15] = 0;                        /* pad the unused final slot */
}
void WelsScan4x4Dc (int16_t* pLevel, int16_t* pDct) {
ST32 (pLevel, LD32 (pDct));
pLevel[2] = pDct[4];
pLevel[3] = pDct[8];
pLevel[4] = pDct[5];
ST32 (pLevel + 5, LD32 (pDct + 2));
pLevel[7] = pDct[6];
pLevel[8] = pDct[9];
ST32 (pLevel + 9, LD32 (pDct + 12));
pLevel[11] = pDct[10];
pLevel[12] = pDct[7];
pLevel[13] = pDct[11];
ST32 (pLevel + 14, LD32 (pDct + 14));
}
//refer to JVT-O079
/*
 * Score a block of 16 coefficients by the zero runs preceding its non-zero
 * entries.
 *
 * Scanning from index 15 down to 0, trailing zeros are skipped. Then, for
 * every non-zero coefficient, the number of consecutive zeros immediately
 * below it is looked up in a cost table (run 0 -> 3, runs 1..2 -> 2,
 * runs 3..5 -> 1, longer -> 0) and the costs are summed.
 *
 * Returns the summed cost; 0 when all 16 entries are zero. pDct is only read.
 */
int32_t WelsCalculateSingleCtr4x4_c (int16_t* pDct) {
  static const int32_t kiRunCost[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  int32_t iScore = 0;
  int32_t iPos;

  /* locate the highest-index non-zero coefficient */
  for (iPos = 15; iPos >= 0; --iPos) {
    if (pDct[iPos] != 0) {
      break;
    }
  }

  while (iPos >= 0) {
    int32_t iZeroRun = 0;

    --iPos;  /* step past the current non-zero coefficient */
    while (iPos >= 0 && pDct[iPos] == 0) {
      ++iZeroRun;
      --iPos;
    }
    iScore += kiRunCost[iZeroRun];
  }
  return iScore;
}
/*
 * Count the non-zero entries among pLevel[0..15].
 *
 * Returns a value in 0..16. pLevel is only read.
 */
int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
  int32_t iNonZero = 0;
  const int16_t* pCur = pLevel;
  const int16_t* const kpEnd = pLevel + 16;

  for (; pCur != kpEnd; ++pCur) {
    if (*pCur != 0) {
      ++iNonZero;
    }
  }
  return iNonZero;
}
#ifdef HAVE_NEON
/*
 * NEON front end for the 2x2 Hadamard skip test.
 *
 * Derives the threshold ((1 << 16) - 1) / iMF - iFF (stored as int16_t; iMF
 * must be non-zero) and returns whatever
 * WelsHadamardQuant2x2SkipKernel_neon yields for pRes and that threshold.
 */
int32_t WelsHadamardQuant2x2Skip_neon (int16_t* pRes, int16_t iFF, int16_t iMF) {
  const int32_t kiQuotient = ((1 << 16) - 1) / iMF;
  const int16_t kiThreshold = kiQuotient - iFF;

  return WelsHadamardQuant2x2SkipKernel_neon (pRes, kiThreshold);
}
#endif
#ifdef HAVE_NEON_AARCH64
/*
 * AArch64 NEON front end for the 2x2 Hadamard skip test.
 *
 * Derives the threshold ((1 << 16) - 1) / iMF - iFF (stored as int16_t; iMF
 * must be non-zero) and returns whatever
 * WelsHadamardQuant2x2SkipKernel_AArch64_neon yields for pRes and that
 * threshold.
 */
int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF, int16_t iMF) {
  const int32_t kiQuotient = ((1 << 16) - 1) / iMF;
  const int16_t kiThreshold = kiQuotient - iFF;

  return WelsHadamardQuant2x2SkipKernel_AArch64_neon (pRes, kiThreshold);
}
#endif
/*!
 * \brief Fill the copy / transform / quantization / scan slots of the
 *        encoder function-pointer table.
 *
 * Every slot handled here is first pointed at its portable C routine. Each
 * SIMD section that is compiled in (selected by a build macro) then
 * overwrites a subset of the slots when the matching bit is set in
 * uiCpuFlag. Sections are evaluated in source order, so when more than one
 * applies to the same slot the last assignment wins.
 *
 * \param pFuncList  table to populate; it is dereferenced unconditionally,
 *                   so it must not be NULL
 * \param uiCpuFlag  bitmask tested against the WELS_CPU_* flags
 */
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
// Portable C defaults. Note that the aligned and not-aligned 16x16 copy
// slots share the same C routine.
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
pFuncList->pfCopy16x16Aligned =
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_c;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_c;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_c;
pFuncList->pfCopy4x4 = WelsCopy4x4_c;
pFuncList->pfCopy8x4 = WelsCopy8x4_c;
pFuncList->pfCopy4x8 = WelsCopy4x8_c;
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_c;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_c;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_c;
pFuncList->pfDctT4 = WelsDctT4_c;
pFuncList->pfDctFourT4 = WelsDctFourT4_c;
pFuncList->pfScan4x4 = WelsScan4x4DcAc_c;
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_c;
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_c;
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_c;
pFuncList->pfQuantization4x4 = WelsQuant4x4_c;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_c;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_c;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_c;
// x86 overrides. The checks are independent (not else-if), so on a CPU that
// reports several flags the later blocks replace slots set by earlier ones
// (e.g. pfDctT4: mmx -> sse2 -> avx2).
#if defined(X86_ASM)
if (uiCpuFlag & WELS_CPU_MMXEXT) {
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_mmx;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_mmx;
pFuncList->pfDctT4 = WelsDctT4_mmx;
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmx;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx;
}
if (uiCpuFlag & WELS_CPU_SSE2) {
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse2;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
pFuncList->pfQuantization4x4 = WelsQuant4x4_sse2;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_sse2;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_sse2;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_sse2;
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_sse2;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_sse2;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_sse2;
pFuncList->pfScan4x4 = WelsScan4x4DcAc_sse2;
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2;
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;
pFuncList->pfDctT4 = WelsDctT4_sse2;
pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
}
//#ifndef MACOS
if (uiCpuFlag & WELS_CPU_SSSE3) {
pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
}
if (uiCpuFlag & WELS_CPU_SSE42) {
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42;
}
#if defined(HAVE_AVX2)
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctT4 = WelsDctT4_avx2;
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
}
#endif
//#endif//MACOS
#endif//X86_ASM
// ARM NEON overrides (32-bit build macro).
#if defined(HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_neon;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_neon;
pFuncList->pfDctT4 = WelsDctT4_neon;
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_neon;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_neon;
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_neon;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon;
pFuncList->pfQuantization4x4 = WelsQuant4x4_neon;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_neon;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_neon;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon;
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_neon;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_neon;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_neon;
pFuncList->pfDctFourT4 = WelsDctFourT4_neon;
}
#endif
// AArch64 NEON overrides. This section tests the same WELS_CPU_NEON bit as
// the HAVE_NEON section above.
#if defined(HAVE_NEON_AARCH64)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_AArch64_neon;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_AArch64_neon;
pFuncList->pfDctT4 = WelsDctT4_AArch64_neon;
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_AArch64_neon;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon;
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_AArch64_neon;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_AArch64_neon;
pFuncList->pfQuantization4x4 = WelsQuant4x4_AArch64_neon;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_AArch64_neon;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_AArch64_neon;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_AArch64_neon;
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_AArch64_neon;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_AArch64_neon;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_AArch64_neon;
pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
}
#endif
// MMI overrides.
#if defined(HAVE_MMI)
if (uiCpuFlag & WELS_CPU_MMI) {
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmi;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmi;
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_mmi;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;
pFuncList->pfQuantization4x4 = WelsQuant4x4_mmi;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_mmi;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_mmi;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_mmi;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_mmi;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_mmi;
pFuncList->pfScan4x4 = WelsScan4x4DcAc_mmi;
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_mmi;
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_mmi;
pFuncList->pfDctT4 = WelsDctT4_mmi;
pFuncList->pfDctFourT4 = WelsDctFourT4_mmi;
}
#endif//HAVE_MMI
// MSA overrides: copy routines only. As with the C defaults, the aligned and
// not-aligned 16x16 slots share one routine.
#if defined(HAVE_MSA)
if (uiCpuFlag & WELS_CPU_MSA) {
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_msa;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_msa;
pFuncList->pfCopy16x16Aligned =
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_msa;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_msa;
}
#endif
// LSX overrides: three copy slots and the four-block quant-with-max slot.
#if defined(HAVE_LSX)
if (uiCpuFlag & WELS_CPU_LSX) {
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_lsx;
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_lsx;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_lsx;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_lsx;
}
#endif
// LASX overrides: the two DCT slots only.
#if defined(HAVE_LASX)
if (uiCpuFlag & WELS_CPU_LASX) {
pFuncList->pfDctT4 = WelsDctT4_lasx;
pFuncList->pfDctFourT4 = WelsDctFourT4_lasx;
}
#endif
}
}