Unlimited data for Windows
Port the unlimited-data YUVToRGB code to Windows.
Disable the MIPS (MSA and MMI) YUVToRGB assembly for now so results stay correct.
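
The new YUVTORGB macros keep every intermediate in unsigned 16-bit math:
UV coefficients too large for pmaddubsw's signed bytes can be stored
wrapped by 256, with the new kUVMaskBR constants (KUMASKB/KVMASKR) adding
the missing U << 8 / V << 8 back in, and the per-channel biases are folded
into a single psubusw followed by a logical psrlw instead of signed adds
and an arithmetic shift. A scalar sketch of that arithmetic, using
illustrative BT.601 studio-swing constants at 6-bit precision rather than
libyuv's actual tables (which also scale Y through a 16-bit pmulhuw):

  /* Scalar model of the unsigned-bias YUVToRGB math; constants are
   * illustrative, not libyuv's. */
  #include <stdint.h>
  #include <stdio.h>

  enum { YG = 74, UB = 129, UG = 25, VG = 52, VR = 102 }; /* coeffs * 64 */
  enum { BB = YG * 16 + UB * 128, /* per-channel biases, folded */
         BG = UG * 128 + VG * 128 - YG * 16,
         BR = YG * 16 + VR * 128 };

  static uint16_t SatSub(uint16_t a, uint16_t b) { /* psubusw */
    return a > b ? (uint16_t)(a - b) : 0;
  }
  static uint8_t Pack6(uint16_t v) { /* psrlw 6 + packuswb */
    v >>= 6;
    return v > 255 ? 255 : (uint8_t)v;
  }

  /* Every product and sum stays below 65536, so uint16_t never wraps.
   * UB = 129 is the case the old signed path could not hold: it does
   * not fit in a signed byte, so the SIMD code can store UB - 256 for
   * pmaddubsw and add U << 8 back through the kUVMaskBR mask. */
  static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                       uint8_t* b, uint8_t* g, uint8_t* r) {
    uint16_t ys = (uint16_t)(YG * y);
    *b = Pack6(SatSub((uint16_t)(ys + UB * u), BB));
    *g = Pack6(SatSub((uint16_t)(ys + BG), (uint16_t)(UG * u + VG * v)));
    *r = Pack6(SatSub((uint16_t)(ys + VR * v), BR));
  }

  int main(void) {
    uint8_t b, g, r;
    YuvPixel(235, 128, 128, &b, &g, &r); /* studio-swing white */
    printf("B=%u G=%u R=%u\n", b, g, r); /* 253 253 253 (truncating) */
    return 0;
  }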
R=fbarchard@chromium.org
Bug: libyuv:862, libyuv:863
Change-Id: Ib3e99c98082badfef4eb671205a151dd1de56b67
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2839383
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index dd89a81..1836cb8 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -551,24 +551,24 @@
#define HAS_BGRATOYROW_MSA
#define HAS_HALFFLOATROW_MSA
#define HAS_I400TOARGBROW_MSA
-#define HAS_I422ALPHATOARGBROW_MSA
-#define HAS_I422TOARGBROW_MSA
-#define HAS_I422TORGB24ROW_MSA
-#define HAS_I422TORGBAROW_MSA
+//#define HAS_I422ALPHATOARGBROW_MSA
+//#define HAS_I422TOARGBROW_MSA
+//#define HAS_I422TORGB24ROW_MSA
+//#define HAS_I422TORGBAROW_MSA
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
-#define HAS_I444TOARGBROW_MSA
-#define HAS_I422TOARGB1555ROW_MSA
-#define HAS_I422TORGB565ROW_MSA
+//#define HAS_I444TOARGBROW_MSA
+//#define HAS_I422TOARGB1555ROW_MSA
+//#define HAS_I422TORGB565ROW_MSA
#define HAS_INTERPOLATEROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
#define HAS_MIRRORUVROW_MSA
#define HAS_MIRRORSPLITUVROW_MSA
-#define HAS_NV12TOARGBROW_MSA
-#define HAS_NV12TORGB565ROW_MSA
-#define HAS_NV21TOARGBROW_MSA
+//#define HAS_NV12TOARGBROW_MSA
+//#define HAS_NV12TORGB565ROW_MSA
+//#define HAS_NV21TOARGBROW_MSA
#define HAS_RAWTOARGBROW_MSA
#define HAS_RAWTORGB24ROW_MSA
#define HAS_RAWTOUVROW_MSA
@@ -588,10 +588,10 @@
#define HAS_SOBELXYROW_MSA
#define HAS_SOBELYROW_MSA
#define HAS_SPLITUVROW_MSA
-#define HAS_UYVYTOARGBROW_MSA
+//#define HAS_UYVYTOARGBROW_MSA
#define HAS_UYVYTOUVROW_MSA
#define HAS_UYVYTOYROW_MSA
-#define HAS_YUY2TOARGBROW_MSA
+//#define HAS_YUY2TOARGBROW_MSA
#define HAS_YUY2TOUV422ROW_MSA
#define HAS_YUY2TOUVROW_MSA
#define HAS_YUY2TOYROW_MSA
@@ -641,8 +641,8 @@
#define HAS_I400TOARGBROW_MMI
#define HAS_I422TOUYVYROW_MMI
#define HAS_I422TOYUY2ROW_MMI
-#define HAS_I422TOARGBROW_MMI
-#define HAS_I444TOARGBROW_MMI
+//#define HAS_I422TOARGBROW_MMI
+//#define HAS_I444TOARGBROW_MMI
#define HAS_INTERPOLATEROW_MMI
#define HAS_J400TOARGBROW_MMI
#define HAS_MERGERGBROW_MMI
@@ -673,20 +673,20 @@
#define HAS_YUY2TOUV422ROW_MMI
#define HAS_YUY2TOUVROW_MMI
#define HAS_YUY2TOYROW_MMI
-#define HAS_I210TOARGBROW_MMI
-#define HAS_I422TOARGB4444ROW_MMI
-#define HAS_I422TOARGB1555ROW_MMI
-#define HAS_I422TORGB565ROW_MMI
-#define HAS_NV21TORGB24ROW_MMI
-#define HAS_NV12TORGB24ROW_MMI
-#define HAS_I422ALPHATOARGBROW_MMI
-#define HAS_I422TORGB24ROW_MMI
-#define HAS_NV12TOARGBROW_MMI
-#define HAS_NV21TOARGBROW_MMI
-#define HAS_NV12TORGB565ROW_MMI
-#define HAS_YUY2TOARGBROW_MMI
-#define HAS_UYVYTOARGBROW_MMI
-#define HAS_I422TORGBAROW_MMI
+//#define HAS_I210TOARGBROW_MMI
+//#define HAS_I422TOARGB4444ROW_MMI
+//#define HAS_I422TOARGB1555ROW_MMI
+//#define HAS_I422TORGB565ROW_MMI
+//#define HAS_NV21TORGB24ROW_MMI
+//#define HAS_NV12TORGB24ROW_MMI
+//#define HAS_I422ALPHATOARGBROW_MMI
+//#define HAS_I422TORGB24ROW_MMI
+//#define HAS_NV12TOARGBROW_MMI
+//#define HAS_NV21TOARGBROW_MMI
+//#define HAS_NV12TORGB565ROW_MMI
+//#define HAS_YUY2TOARGBROW_MMI
+//#define HAS_UYVYTOARGBROW_MMI
+//#define HAS_I422TORGBAROW_MMI
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -778,7 +778,7 @@
#define KYTORGB 192
#define KYBIASTORGB 224
#define KUMASKB 256
-#define KVMASKR 288
+#define KVMASKR 272
#endif
diff --git a/source/row_win.cc b/source/row_win.cc
index 78256f8..07aca7b 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -29,9 +29,9 @@
// Read 8 UV from 444
#define READYUV444 \
- xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
u_buf += 8; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -39,9 +39,9 @@
// Read 8 UV from 444, With 8 Alpha.
#define READYUVA444 \
- xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
u_buf += 8; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -51,10 +51,10 @@
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -62,10 +62,10 @@
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -74,24 +74,31 @@
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+#define YUVTORGB(yuvconstants) \
+ xmm0 = _mm_loadu_si128(&xmm3); \
+ xmm1 = _mm_loadu_si128(&xmm3); \
+ xmm2 = _mm_loadu_si128(&xmm3); \
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+ xmm1 = _mm_slli_epi16(xmm1, 8); \
+ xmm1 = _mm_and_si128(xmm1, *(__m128i*)yuvconstants->kUVMaskBR); \
+ xmm0 = _mm_add_epi16(xmm0, xmm1); \
+ xmm1 = _mm_loadu_si128(&xmm3); \
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm3 = _mm_and_si128(xmm3, *((__m128i*)(yuvconstants->kUVMaskBR) + 1)); \
+ xmm2 = _mm_add_epi16(xmm2, xmm3); \
+ xmm0 = _mm_add_epi16(xmm0, xmm4); \
+ xmm2 = _mm_add_epi16(xmm2, xmm4); \
+ xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kUVBiasG); \
+ xmm0 = _mm_subs_epu16(xmm0, *(__m128i*)yuvconstants->kUVBiasB); \
+ xmm1 = _mm_subs_epu16(xmm4, xmm1); \
+ xmm2 = _mm_subs_epu16(xmm2, *(__m128i*)yuvconstants->kUVBiasR); \
+ xmm0 = _mm_srli_epi16(xmm0, 6); \
+ xmm1 = _mm_srli_epi16(xmm1, 6); \
+ xmm2 = _mm_srli_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
// Store 8 ARGB values.
@@ -112,7 +119,7 @@
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -132,7 +139,7 @@
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
@@ -150,7 +157,7 @@
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -170,7 +177,7 @@
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA444
@@ -1987,12 +1994,12 @@
// Read 16 UV from 444
#define READYUV444_AVX2 \
__asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm3, [esi] /* U */ \
__asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2001,12 +2008,12 @@
// Read 16 UV from 444. With 16 Alpha.
#define READYUVA444_AVX2 \
__asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm3, [esi] /* U */ \
__asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2018,12 +2025,12 @@
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
__asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
__asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2032,12 +2039,12 @@
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
__asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
__asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2049,10 +2056,10 @@
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
__asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2061,10 +2068,10 @@
// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
__asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2075,8 +2082,8 @@
__asm { \
__asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32]}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
@@ -2084,32 +2091,39 @@
__asm { \
__asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32]}
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
__asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vpmaddubsw ymm0, ymm3, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+ __asm vpmaddubsw ymm2, ymm3, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+ __asm vpsllw ymm1, ymm3, 8 \
+ __asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KUMASKB] \
+ __asm vpand ymm1, ymm1, ymm6 \
+ __asm vpaddw ymm0, ymm0, ymm1 \
+ __asm vpmaddubsw ymm1, ymm3, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
- __asm vpsraw ymm0, ymm0, 6 \
- __asm vpsraw ymm1, ymm1, 6 \
- __asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ __asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KVMASKR] \
+ __asm vpand ymm3, ymm3, ymm6 \
+ __asm vpaddw ymm2, ymm2, ymm3 \
+ __asm vpaddw ymm0, ymm0, ymm4 \
+ __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASG] \
+ __asm vpaddw ymm3, ymm4, ymm6 \
+ __asm vpaddw ymm2, ymm2, ymm4 \
+ __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASB] \
+ __asm vpsubusw ymm0, ymm0, ymm6 \
+ __asm vpsubusw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASR] \
+ __asm vpsubusw ymm2, ymm2, ymm6 \
+ __asm vpsrlw ymm0, ymm0, 6 \
+ __asm vpsrlw ymm1, ymm1, 6 \
+ __asm vpsrlw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
}
// Store 16 ARGB values.
@@ -2481,10 +2495,10 @@
// Read 8 UV from 444.
#define READYUV444 \
__asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
@@ -2492,10 +2506,10 @@
// Read 4 UV from 444. With 8 Alpha.
#define READYUVA444 \
__asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
@@ -2505,11 +2519,11 @@
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
__asm { \
- __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm3, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
@@ -2517,11 +2531,11 @@
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
__asm { \
- __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm3, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] /* Y */ \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
@@ -2531,9 +2545,9 @@
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
__asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
@@ -2541,9 +2555,9 @@
// Read 4 VU from NV21, upsample to 8 UV.
#define READNV21 \
__asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
@@ -2553,8 +2567,8 @@
__asm { \
__asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 16]}
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
@@ -2562,32 +2576,37 @@
__asm { \
__asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16]}
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
__asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
+ __asm movdqa xmm0, xmm3 \
+ __asm movdqa xmm1, xmm3 \
+ __asm movdqa xmm2, xmm3 \
+ __asm pmaddubsw xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm psllw xmm1, 8 \
+ __asm pand xmm1, xmmword ptr [YuvConstants + KUMASKB] \
+ __asm paddw xmm0, xmm1 \
+ __asm movdqa xmm1, xmm3 \
+ __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOG] \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
+ __asm pand xmm3, xmmword ptr [YuvConstants + KVMASKR] \
+ __asm paddw xmm2, xmm3 \
+ __asm paddw xmm0, xmm4 \
+ __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
+ __asm paddw xmm2, xmm4 \
+ __asm paddw xmm4, xmm6 \
+ __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASB] \
+ __asm psubusw xmm0, xmm6 \
+ __asm psubusw xmm4, xmm1 \
+ __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASR] \
+ __asm psubusw xmm2, xmm6 \
+ __asm movdqa xmm1, xmm4 \
+ __asm psrlw xmm0, 6 \
+ __asm psrlw xmm1, 6 \
+ __asm psrlw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \