Unlimited data for Intel

Use unsigned UV coefficients in the Intel SSSE3 and AVX2 code.
Make the C, NEON and AVX2 results match under LIBYUV_UNLIMITED_DATA.
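
For the 8-bit paths, the per-pixel math the new kernels compute can be
modelled in scalar C roughly as below (a minimal sketch, not part of this
change; the helper names Clamp255 and YuvPixelUnsigned are made up for
illustration, and the 16-bit wrap/saturation behaviour of the SIMD registers
is ignored by using 32-bit intermediates). The kUVMaskBR lanes in the SIMD
code only compensate for pmaddubsw treating its coefficient operand as signed:
when a coefficient exceeds 127 the product comes out as u * (coef - 256), so
u * 256 (or v * 256) is added back via psllw $8 / pand / paddw; a scalar model
with wide intermediates does not need that step.

  #include <stdint.h>

  static uint8_t Clamp255(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  // Scalar model (illustrative): ub/ug/vg/vr are the unsigned UV coefficients,
  // yg/yb correspond to kYToRgb/kYBiasToRgb.  The biases fold yb in exactly as
  // BB/BG/BR do in row_common.cc.
  static void YuvPixelUnsigned(uint8_t y, uint8_t u, uint8_t v,
                               int ub, int ug, int vg, int vr, int yg, int yb,
                               uint8_t* b, uint8_t* g, uint8_t* r) {
    // Y term: punpcklbw xmm4,xmm4 duplicates Y (Y * 0x0101), pmulhuw by kYToRgb.
    int y1 = (int)(((uint32_t)(y * 0x0101) * (uint32_t)yg) >> 16);
    int bb = ub * 128 - yb;                 // BB
    int bg = ug * 128 + vg * 128 + yb;      // BG
    int br = vr * 128 - yb;                 // BR
    // Unsigned products plus the Y term, saturating subtraction of the bias
    // (psubusw), shift right by 6 (psrlw) and unsigned pack (packuswb).
    int b16 = y1 + u * ub - bb;             // == y1 + (u - 128) * ub + yb
    int g16 = y1 + bg - (u * ug + v * vg);  // == y1 - (u-128)*ug - (v-128)*vg + yb
    int r16 = y1 + v * vr - br;             // == y1 + (v - 128) * vr + yb
    *b = Clamp255(b16 < 0 ? 0 : b16 >> 6);
    *g = Clamp255(g16 < 0 ? 0 : g16 >> 6);
    *r = Clamp255(r16 < 0 ? 0 : r16 >> 6);
  }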

Bug: libyuv:862, libyuv:863
Change-Id: I6c02147ea3c1875c4fc23863435aea86dcf5880a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2830180
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f27bed9..d137c26 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -755,20 +755,15 @@
 #else
 // This struct is for Intel color conversion.
 struct YuvConstants {
-#if defined(LIBYUV_UNLIMITED_DATA)
   uint8_t kUVToB[32];
   uint8_t kUVToG[32];
   uint8_t kUVToR[32];
-#else
-  int8_t kUVToB[32];
-  int8_t kUVToG[32];
-  int8_t kUVToR[32];
-#endif
   int16_t kUVBiasB[16];
   int16_t kUVBiasG[16];
   int16_t kUVBiasR[16];
   int16_t kYToRgb[16];
   int16_t kYBiasToRgb[16];
+  uint8_t kUVMaskBR[32];
 };
 
 // Offsets into YuvConstants structure
@@ -780,6 +775,8 @@
 #define KUVBIASR 160
 #define KYTORGB 192
 #define KYBIASTORGB 224
+#define KUMASKB 256
+#define KVMASKR 272
 
 #endif
 
diff --git a/source/row_common.cc b/source/row_common.cc
index 9d2b5d9..3e71ce3 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1427,18 +1427,24 @@
   {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},     \
    {YG, BB, BG, BR, YB, 0, 0, 0}}
 #else
+#define UVMASK(C) ((C) > 127 ? 0xff : 0)
+
 #define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR)         \
-  {{-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0,  \
-    -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \
+  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,          \
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},         \
    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,  \
     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
-   {0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR,  \
-    0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \
+   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,          \
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},         \
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
-   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
+   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}, \
+   {0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB),      \
+    0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB),      \
+    0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR),      \
+    0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR)}}
 #endif
 
 // clang-format on
@@ -1453,15 +1459,9 @@
 
 // Bias values to round, and subtract 128 from U and V.
 // For B and R this is negative. For G this is positive.
-#ifdef LIBYUV_UNLIMITED_DATA
 #define BB (UB * 128 - YB)
 #define BG (UG * 128 + VG * 128 + YB)
 #define BR (VR * 128 - YB)
-#else
-#define BB (-UB * 128 + YB)
-#define BG (UG * 128 + VG * 128 + YB)
-#define BR (-VR * 128 + YB)
-#endif
 
 // BT.601 limited range YUV to RGB reference
 //  R = (Y - 16) * 1.164             + V * 1.596
@@ -1631,7 +1631,6 @@
 #undef MAKEYUVCONSTANTS
 
 #if defined(__aarch64__) || defined(__arm__)
-#ifdef LIBYUV_UNLIMITED_DATA
 #define LOAD_YUV_CONSTANTS                  \
   int ub = -yuvconstants->kUVCoeff[0];      \
   int vr = -yuvconstants->kUVCoeff[1];      \
@@ -1642,25 +1641,14 @@
   int bg = yuvconstants->kRGBCoeffBias[2];  \
   int br = -yuvconstants->kRGBCoeffBias[3]
 #else
-#define LOAD_YUV_CONSTANTS                 \
-  int ub = -yuvconstants->kUVCoeff[0];     \
-  int vr = -yuvconstants->kUVCoeff[1];     \
-  int ug = yuvconstants->kUVCoeff[2];      \
-  int vg = yuvconstants->kUVCoeff[3];      \
-  int yg = yuvconstants->kRGBCoeffBias[0]; \
-  int bb = yuvconstants->kRGBCoeffBias[1]; \
-  int bg = yuvconstants->kRGBCoeffBias[2]; \
-  int br = yuvconstants->kRGBCoeffBias[3]
-#endif
-#else
-#define LOAD_YUV_CONSTANTS            \
-  int ub = yuvconstants->kUVToB[0];   \
-  int ug = yuvconstants->kUVToG[0];   \
-  int vg = yuvconstants->kUVToG[1];   \
-  int vr = yuvconstants->kUVToR[1];   \
-  int bb = yuvconstants->kUVBiasB[0]; \
-  int bg = yuvconstants->kUVBiasG[0]; \
-  int br = yuvconstants->kUVBiasR[0]; \
+#define LOAD_YUV_CONSTANTS             \
+  int ub = -yuvconstants->kUVToB[0];   \
+  int ug = yuvconstants->kUVToG[0];    \
+  int vg = yuvconstants->kUVToG[1];    \
+  int vr = -yuvconstants->kUVToR[1];   \
+  int bb = -yuvconstants->kUVBiasB[0]; \
+  int bg = yuvconstants->kUVBiasG[0];  \
+  int br = -yuvconstants->kUVBiasR[0]; \
   int yg = yuvconstants->kYToRgb[0]
 #endif
 
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 07c2638..d21c153 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2132,21 +2132,21 @@
 
 // Read 8 UV from 444
 #define READYUV444                                                \
-  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       (%[u_buf]),%%xmm3                               \n" \
   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
   "movq       (%[y_buf]),%%xmm4                               \n" \
   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 UV from 422, upsample to 8 UV
 #define READYUV422                                                \
-  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       (%[u_buf]),%%xmm3                               \n" \
   "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movq       (%[y_buf]),%%xmm4                               \n" \
   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
@@ -2156,25 +2156,25 @@
 // TODO(fbarchard): Consider pmulhuw to replace psraw
 // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
 #define READYUV210                                                \
-  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       (%[u_buf]),%%xmm3                               \n" \
   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
-  "psraw      $2,%%xmm0                                       \n" \
-  "packuswb   %%xmm0,%%xmm0                                   \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
+  "psraw      $2,%%xmm3                                       \n" \
+  "packuswb   %%xmm3,%%xmm3                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "psllw      $6,%%xmm4                                       \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n"
 
 #define READYUVA210                                               \
-  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       (%[u_buf]),%%xmm3                               \n" \
   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
-  "psraw      $2,%%xmm0                                       \n" \
-  "packuswb   %%xmm0,%%xmm0                                   \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
+  "psraw      $2,%%xmm3                                       \n" \
+  "packuswb   %%xmm3,%%xmm3                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "psllw      $6,%%xmm4                                       \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
@@ -2185,30 +2185,30 @@
 
 // Read 8 UV from 444 10 bit
 #define READYUV410                                                \
-  "movdqu     (%[u_buf]),%%xmm0                               \n" \
+  "movdqu     (%[u_buf]),%%xmm3                               \n" \
   "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
-  "psraw      $2,%%xmm0                                       \n" \
+  "psraw      $2,%%xmm3                                       \n" \
   "psraw      $2,%%xmm2                                       \n" \
-  "movdqa     %%xmm0,%%xmm1                                   \n" \
-  "punpcklwd  %%xmm2,%%xmm0                                   \n" \
+  "movdqa     %%xmm3,%%xmm1                                   \n" \
+  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
   "punpckhwd  %%xmm2,%%xmm1                                   \n" \
-  "packuswb   %%xmm1,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "psllw      $6,%%xmm4                                       \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n"
 
 // Read 8 UV from 444 10 bit.  With 8 Alpha.
 #define READYUVA410                                               \
-  "movdqu     (%[u_buf]),%%xmm0                               \n" \
+  "movdqu     (%[u_buf]),%%xmm3                               \n" \
   "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
-  "psraw      $2,%%xmm0                                       \n" \
+  "psraw      $2,%%xmm3                                       \n" \
   "psraw      $2,%%xmm2                                       \n" \
-  "movdqa     %%xmm0,%%xmm1                                   \n" \
-  "punpcklwd  %%xmm2,%%xmm0                                   \n" \
+  "movdqa     %%xmm3,%%xmm1                                   \n" \
+  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
   "punpckhwd  %%xmm2,%%xmm1                                   \n" \
-  "packuswb   %%xmm1,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "psllw      $0x6,%%xmm4                                     \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
@@ -2219,24 +2219,24 @@
 
 // Read 4 UV from 422 12 bit, upsample to 8 UV
 #define READYUV212                                                \
-  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       (%[u_buf]),%%xmm3                               \n" \
   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
-  "psraw      $0x4,%%xmm0                                     \n" \
-  "packuswb   %%xmm0,%%xmm0                                   \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
+  "psraw      $0x4,%%xmm3                                     \n" \
+  "packuswb   %%xmm3,%%xmm3                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "psllw      $0x4,%%xmm4                                     \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n"
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
 #define READYUVA422                                               \
-  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       (%[u_buf]),%%xmm3                               \n" \
   "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movq       (%[y_buf]),%%xmm4                               \n" \
   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
   "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
@@ -2245,10 +2245,10 @@
 
 // Read 8 UV from 444.  With 8 Alpha.
 #define READYUVA444                                               \
-  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       (%[u_buf]),%%xmm3                               \n" \
   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
-  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
   "movq       (%[y_buf]),%%xmm4                               \n" \
   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
   "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
@@ -2257,18 +2257,18 @@
 
 // Read 4 UV from NV12, upsample to 8 UV
 #define READNV12                                                  \
-  "movq       (%[uv_buf]),%%xmm0                              \n" \
+  "movq       (%[uv_buf]),%%xmm3                              \n" \
   "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movq       (%[y_buf]),%%xmm4                               \n" \
   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 VU from NV21, upsample to 8 UV
 #define READNV21                                                  \
-  "movq       (%[vu_buf]),%%xmm0                              \n" \
+  "movq       (%[vu_buf]),%%xmm3                              \n" \
   "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
-  "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
+  "pshufb     %[kShuffleNV21], %%xmm3                         \n" \
   "movq       (%[y_buf]),%%xmm4                               \n" \
   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
@@ -2277,36 +2277,36 @@
 #define READYUY2                                                  \
   "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
   "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
-  "movdqu     (%[yuy2_buf]),%%xmm0                            \n" \
-  "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
+  "movdqu     (%[yuy2_buf]),%%xmm3                            \n" \
+  "pshufb     %[kShuffleYUY2UV], %%xmm3                       \n" \
   "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"
 
 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
 #define READUYVY                                                  \
   "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
   "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
-  "movdqu     (%[uyvy_buf]),%%xmm0                            \n" \
-  "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
+  "movdqu     (%[uyvy_buf]),%%xmm3                            \n" \
+  "pshufb     %[kShuffleUYVYUV], %%xmm3                       \n" \
   "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
 
 // Read 4 UV from P210, upsample to 8 UV
 #define READP210                                                  \
-  "movdqu     (%[uv_buf]),%%xmm0                              \n" \
+  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
   "lea        0x10(%[uv_buf]),%[uv_buf]                       \n" \
-  "psrlw      $0x8,%%xmm0                                     \n" \
-  "packuswb   %%xmm0,%%xmm0                                   \n" \
-  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "psrlw      $0x8,%%xmm3                                     \n" \
+  "packuswb   %%xmm3,%%xmm3                                   \n" \
+  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n"
 
 // Read 8 UV from P410
 #define READP410                                                  \
-  "movdqu     (%[uv_buf]),%%xmm0                              \n" \
+  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
   "movdqu     0x10(%[uv_buf]),%%xmm1                          \n" \
   "lea        0x20(%[uv_buf]),%[uv_buf]                       \n" \
-  "psrlw      $0x8,%%xmm0                                     \n" \
+  "psrlw      $0x8,%%xmm3                                     \n" \
   "psrlw      $0x8,%%xmm1                                     \n" \
-  "packuswb   %%xmm1,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm3                                   \n" \
   "movdqu     (%[y_buf]),%%xmm4                               \n" \
   "lea        0x10(%[y_buf]),%[y_buf]                         \n"
 
@@ -2318,56 +2318,72 @@
   "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
   "movdqa     128(%[yuvconstants]),%%xmm12                    \n" \
   "movdqa     160(%[yuvconstants]),%%xmm13                    \n" \
-  "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
+  "movdqa     192(%[yuvconstants]),%%xmm14                    \n" \
+  "movdqa     256(%[yuvconstants]),%%xmm15                    \n" \
+  "movdqa     272(%[yuvconstants]),%%xmm7                     \n"
+
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants)                                  \
-  "movdqa     %%xmm0,%%xmm1                                   \n" \
-  "movdqa     %%xmm0,%%xmm2                                   \n" \
-  "movdqa     %%xmm0,%%xmm3                                   \n" \
-  "movdqa     %%xmm11,%%xmm0                                  \n" \
-  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
-  "psubw      %%xmm1,%%xmm0                                   \n" \
-  "movdqa     %%xmm12,%%xmm1                                  \n" \
-  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
-  "psubw      %%xmm2,%%xmm1                                   \n" \
-  "movdqa     %%xmm13,%%xmm2                                  \n" \
-  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
-  "psubw      %%xmm3,%%xmm2                                   \n" \
+#define YUVTORGB16(yuvconstants) \
+  "movdqa     %%xmm3,%%xmm0                                   \n" \
+  "movdqa     %%xmm3,%%xmm1                                   \n" \
+  "movdqa     %%xmm3,%%xmm2                                   \n" \
+  "pmaddubsw  %%xmm8,%%xmm0                                   \n" \
+  "pmaddubsw  %%xmm10,%%xmm2                                  \n" \
+  "psllw      $8,%%xmm1                                       \n" \
+  "pand       %%xmm15,%%xmm1                                  \n" \
+  "paddw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     %%xmm3,%%xmm1                                   \n" \
+  "pmaddubsw  %%xmm9,%%xmm1                                   \n" \
   "pmulhuw    %%xmm14,%%xmm4                                  \n" \
-  "paddsw     %%xmm4,%%xmm0                                   \n" \
-  "paddsw     %%xmm4,%%xmm1                                   \n" \
-  "paddsw     %%xmm4,%%xmm2                                   \n"
+  "pand       %%xmm7,%%xmm3                                   \n" \
+  "paddw      %%xmm3,%%xmm2                                   \n" \
+  "paddw      %%xmm4,%%xmm0                                   \n" \
+  "paddw      %%xmm4,%%xmm2                                   \n" \
+  "paddw      %%xmm12,%%xmm4                                  \n" \
+  "psubusw    %%xmm11,%%xmm0                                  \n" \
+  "psubusw    %%xmm1,%%xmm4                                   \n" \
+  "psubusw    %%xmm13,%%xmm2                                  \n" \
+  "movdqa     %%xmm4,%%xmm1                                   \n"
+
 #define YUVTORGB_REGS \
-  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants)                                  \
-  "movdqa     %%xmm0,%%xmm1                                   \n" \
-  "movdqa     %%xmm0,%%xmm2                                   \n" \
-  "movdqa     %%xmm0,%%xmm3                                   \n" \
-  "movdqa     96(%[yuvconstants]),%%xmm0                      \n" \
-  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n" \
-  "psubw      %%xmm1,%%xmm0                                   \n" \
-  "movdqa     128(%[yuvconstants]),%%xmm1                     \n" \
-  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n" \
-  "psubw      %%xmm2,%%xmm1                                   \n" \
-  "movdqa     160(%[yuvconstants]),%%xmm2                     \n" \
-  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n" \
-  "psubw      %%xmm3,%%xmm2                                   \n" \
+#define YUVTORGB16(yuvconstants) \
+  "movdqa     %%xmm3,%%xmm0                                   \n" \
+  "movdqa     %%xmm3,%%xmm1                                   \n" \
+  "movdqa     %%xmm3,%%xmm2                                   \n" \
+  "pmaddubsw  (%[yuvconstants]),%%xmm0                        \n" \
+  "pmaddubsw  64(%[yuvconstants]),%%xmm2                      \n" \
+  "psllw      $8,%%xmm1                                       \n" \
+  "pand       256(%[yuvconstants]),%%xmm1                     \n" \
+  "paddw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     %%xmm3,%%xmm1                                   \n" \
+  "pmaddubsw  32(%[yuvconstants]),%%xmm1                      \n" \
   "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n" \
-  "paddsw     %%xmm4,%%xmm0                                   \n" \
-  "paddsw     %%xmm4,%%xmm1                                   \n" \
-  "paddsw     %%xmm4,%%xmm2                                   \n"
-#define YUVTORGB_REGS
+  "pand       272(%[yuvconstants]),%%xmm3                     \n" \
+  "paddw      %%xmm3,%%xmm2                                   \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm7                     \n" \
+  "paddw      %%xmm4,%%xmm0                                   \n" \
+  "paddw      %%xmm4,%%xmm2                                   \n" \
+  "paddw      %%xmm7,%%xmm4                                   \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm7                      \n" \
+  "psubusw    %%xmm7,%%xmm0                                   \n" \
+  "psubusw    %%xmm1,%%xmm4                                   \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm7                     \n" \
+  "psubusw    %%xmm7,%%xmm2                                   \n" \
+  "movdqa     %%xmm4,%%xmm1                                   \n" \
+
+#define YUVTORGB_REGS "xmm7",
 #endif
 
 #define YUVTORGB(yuvconstants)                                    \
   YUVTORGB16(yuvconstants)                                        \
-  "psraw      $0x6,%%xmm0                                     \n" \
-  "psraw      $0x6,%%xmm1                                     \n" \
-  "psraw      $0x6,%%xmm2                                     \n" \
+  "psrlw      $0x6,%%xmm0                                     \n" \
+  "psrlw      $0x6,%%xmm1                                     \n" \
+  "psrlw      $0x6,%%xmm2                                     \n" \
   "packuswb   %%xmm0,%%xmm0                                   \n" \
   "packuswb   %%xmm1,%%xmm1                                   \n" \
   "packuswb   %%xmm2,%%xmm2                                   \n"
@@ -2400,12 +2416,9 @@
   "psraw      $0x4,%%xmm0                                      \n" \
   "psraw      $0x4,%%xmm1                                      \n" \
   "psraw      $0x4,%%xmm2                                      \n" \
-  "pminsw     %%xmm7,%%xmm0                                    \n" \
-  "pminsw     %%xmm7,%%xmm1                                    \n" \
-  "pminsw     %%xmm7,%%xmm2                                    \n" \
-  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
-  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
-  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
+  "pminuw     %%xmm6,%%xmm0                                    \n" \
+  "pminuw     %%xmm6,%%xmm1                                    \n" \
+  "pminuw     %%xmm6,%%xmm2                                    \n" \
   "psllw      $0x4,%%xmm2                                      \n" \
   "movdqa     %%xmm0,%%xmm3                                    \n" \
   "punpcklwd  %%xmm2,%%xmm0                                    \n" \
@@ -2575,9 +2588,8 @@
       "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
       "psrlw       $14,%%xmm5                    \n"
       "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
-      "pxor        %%xmm6,%%xmm6                 \n"
-      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
-      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $6,%%xmm6                     \n"  // 1023 for max
 
     LABELALIGN
       "1:                                        \n"
@@ -2593,7 +2605,7 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -2670,9 +2682,8 @@
       "pcmpeqb     %%xmm5,%%xmm5                 \n"
       "psrlw       $14,%%xmm5                    \n"
       "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
-      "pxor        %%xmm6,%%xmm6                 \n"
-      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
-      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $6,%%xmm6                     \n"  // 1023 for max
 
     LABELALIGN
       "1:                                        \n"
@@ -2688,7 +2699,7 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -2705,9 +2716,8 @@
       "pcmpeqb     %%xmm5,%%xmm5                 \n"
       "psrlw       $14,%%xmm5                    \n"
       "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
-      "pxor        %%xmm6,%%xmm6                 \n"
-      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
-      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $6,%%xmm6                     \n"  // 1023 for max
 
     LABELALIGN
       "1:                                        \n"
@@ -2723,7 +2733,7 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -2735,7 +2745,6 @@
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-
     YUVTORGB_SETUP(yuvconstants)
       "sub         %[u_buf],%[v_buf]             \n"
       "pcmpeqb     %%xmm5,%%xmm5                 \n"
@@ -2767,9 +2776,8 @@
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(
-
       YUVTORGB_SETUP(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+          yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
       LABELALIGN "1:                                        \n" READYUVA210
           YUVTORGB(yuvconstants) STOREARGB
@@ -2800,28 +2808,32 @@
                                      uint8_t* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
+  // clang-format off
   asm volatile(
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
 
-      YUVTORGB_SETUP(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
-
-      LABELALIGN "1:                                        \n" READYUVA410
-          YUVTORGB(yuvconstants) STOREARGB
+    LABELALIGN
+      "1:                                        \n"
+    READYUVA410
+    YUVTORGB(yuvconstants)
+    STOREARGB
       "subl        $0x8,%[width]                 \n"
       "jg          1b                            \n"
-      : [y_buf] "+r"(y_buf),  // %[y_buf]
-        [u_buf] "+r"(u_buf),  // %[u_buf]
-        [v_buf] "+r"(v_buf),  // %[v_buf]
-        [a_buf] "+r"(a_buf),
-        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
+    : [y_buf] "+r"(y_buf),  // %[y_buf]
+      [u_buf] "+r"(u_buf),  // %[u_buf]
+      [v_buf] "+r"(v_buf),  // %[v_buf]
+      [a_buf] "+r"(a_buf),
+      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)  // %[width]
+      [width] "+m"(width)  // %[width]
 #else
-        [width] "+rm"(width)  // %[width]
+      [width] "+rm"(width)  // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "xmm5");
+  // clang-format on
 }
 #endif
 
@@ -2833,15 +2845,13 @@
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-
     YUVTORGB_SETUP(yuvconstants)
       "sub         %[u_buf],%[v_buf]             \n"
       "pcmpeqb     %%xmm5,%%xmm5                 \n"
       "psrlw       $14,%%xmm5                    \n"
       "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
-      "pxor        %%xmm6,%%xmm6                 \n"
-      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
-      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $6,%%xmm6                     \n"  // 1023 for max
 
     LABELALIGN
       "1:                                        \n"
@@ -2857,7 +2867,7 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -3018,7 +3028,6 @@
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(
-
       YUVTORGB_SETUP(
       yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
@@ -3041,7 +3050,6 @@
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(
-
       YUVTORGB_SETUP(
       yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
@@ -3064,14 +3072,12 @@
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-
     YUVTORGB_SETUP(yuvconstants)
       "pcmpeqb     %%xmm5,%%xmm5                 \n"
       "psrlw       $14,%%xmm5                    \n"
       "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
-      "pxor        %%xmm6,%%xmm6                 \n"
-      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
-      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $6,%%xmm6                     \n"  // 1023 for max
 
     LABELALIGN
       "1:                                        \n"
@@ -3086,7 +3092,7 @@
     [width]"+rm"(width)              // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -3096,14 +3102,12 @@
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-
     YUVTORGB_SETUP(yuvconstants)
       "pcmpeqb     %%xmm5,%%xmm5                 \n"
       "psrlw       $14,%%xmm5                    \n"
       "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
-      "pxor        %%xmm6,%%xmm6                 \n"
-      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
-      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $6,%%xmm6                     \n"  // 1023 for max
 
     LABELALIGN
       "1:                                        \n"
@@ -3118,7 +3122,7 @@
     [width]"+rm"(width)              // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -3155,12 +3159,12 @@
 
 // Read 16 UV from 444
 #define READYUV444_AVX2                                               \
-  "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
+  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
   "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
-  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
@@ -3168,12 +3172,12 @@
 
 // Read 8 UV from 422, upsample to 16 UV.
 #define READYUV422_AVX2                                               \
-  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
   "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
-  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
@@ -3183,30 +3187,30 @@
 // TODO(fbarchard): Consider vshufb to replace pack/unpack
 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
 #define READYUV210_AVX2                                            \
-  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
+  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
   "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
-  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
-  "vpsraw     $2,%%ymm0,%%ymm0                                 \n" \
-  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
+  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
+  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
   "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                          \n"
 
 // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
 #define READYUVA210_AVX2                                           \
-  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
+  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
   "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
-  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
-  "vpsraw     $2,%%ymm0,%%ymm0                                 \n" \
-  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
+  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
+  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
   "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
@@ -3217,43 +3221,43 @@
 
 // Read 16 UV from 410
 #define READYUV410_AVX2                                            \
-  "vmovdqu    (%[u_buf]),%%ymm0                                \n" \
+  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
   "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
-  "vpsraw     $2,%%ymm0,%%ymm0                                 \n" \
+  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
   "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
-  "vpunpckhwd %%ymm2,%%ymm0,%%ymm1                             \n" \
-  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                             \n" \
-  "vpackuswb  %%ymm1,%%ymm0,%%ymm0                             \n" \
+  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
+  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
+  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
   "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                          \n"
 
 // Read 8 UV from 212 12 bit, upsample to 16 UV
 #define READYUV212_AVX2                                            \
-  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
+  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
   "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
-  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
-  "vpsraw     $0x4,%%ymm0,%%ymm0                               \n" \
-  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
+  "vpsraw     $0x4,%%ymm3,%%ymm3                               \n" \
+  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
   "vpsllw     $0x4,%%ymm4,%%ymm4                               \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                          \n"
 
 // Read 16 UV from 410. With 16 Alpha.
 #define READYUVA410_AVX2                                           \
-  "vmovdqu    (%[u_buf]),%%ymm0                                \n" \
+  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
   "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
-  "vpsraw     $2,%%ymm0,%%ymm0                                 \n" \
+  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
   "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
-  "vpunpckhwd %%ymm2,%%ymm0,%%ymm1                             \n" \
-  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                             \n" \
-  "vpackuswb  %%ymm1,%%ymm0,%%ymm0                             \n" \
+  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
+  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
+  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
   "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
@@ -3264,12 +3268,12 @@
 
 // Read 16 UV from 444.  With 16 Alpha.
 #define READYUVA444_AVX2                                              \
-  "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
+  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
   "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
   "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
-  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
@@ -3280,12 +3284,12 @@
 
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
 #define READYUVA422_AVX2                                              \
-  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
   "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
   "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
-  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
@@ -3296,10 +3300,10 @@
 
 // Read 8 UV from NV12, upsample to 16 UV.
 #define READNV12_AVX2                                                 \
-  "vmovdqu    (%[uv_buf]),%%xmm0                                  \n" \
+  "vmovdqu    (%[uv_buf]),%%xmm3                                  \n" \
   "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
@@ -3307,10 +3311,10 @@
 
 // Read 8 VU from NV21, upsample to 16 UV.
 #define READNV21_AVX2                                                 \
-  "vmovdqu    (%[vu_buf]),%%xmm0                                  \n" \
+  "vmovdqu    (%[vu_buf]),%%xmm3                                  \n" \
   "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
-  "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
+  "vpshufb     %[kShuffleNV21], %%ymm3, %%ymm3                    \n" \
   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
@@ -3318,23 +3322,23 @@
 
 // Read 4 UV from P210, upsample to 8 UV
 #define READP210_AVX2                                                 \
-  "vmovdqu    (%[uv_buf]),%%ymm0                                  \n" \
+  "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
   "lea        0x20(%[uv_buf]),%[uv_buf]                           \n" \
-  "vpsrlw     $0x8,%%ymm0,%%ymm0                                  \n" \
-  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                                \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
+  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                                \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 UV from P410
 #define READP410_AVX2                                                 \
-  "vmovdqu    (%[uv_buf]),%%ymm0                                  \n" \
+  "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
   "vmovdqu    0x20(%[uv_buf]),%%ymm1                              \n" \
   "lea        0x40(%[uv_buf]),%[uv_buf]                           \n" \
-  "vpsrlw     $0x8,%%ymm0,%%ymm0                                  \n" \
+  "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
   "vpsrlw     $0x8,%%ymm1,%%ymm1                                  \n" \
-  "vpackuswb  %%ymm1,%%ymm0,%%ymm0                                \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                                \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
   "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
   "lea        0x20(%[y_buf]),%[y_buf]                             \n"
 
@@ -3342,68 +3346,84 @@
 #define READYUY2_AVX2                                                 \
   "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
   "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
-  "vmovdqu    (%[yuy2_buf]),%%ymm0                                \n" \
-  "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n" \
+  "vmovdqu    (%[yuy2_buf]),%%ymm3                                \n" \
+  "vpshufb    %[kShuffleYUY2UV], %%ymm3, %%ymm3                   \n" \
   "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
 #define READUYVY_AVX2                                                 \
   "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
   "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
-  "vmovdqu    (%[uyvy_buf]),%%ymm0                                \n" \
-  "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0                   \n" \
+  "vmovdqu    (%[uyvy_buf]),%%ymm3                                \n" \
+  "vpshufb    %[kShuffleUYVYUV], %%ymm3, %%ymm3                   \n" \
   "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants)                            \
-  "vmovdqa     (%[yuvconstants]),%%ymm8                          \n" \
-  "vmovdqa     32(%[yuvconstants]),%%ymm9                        \n" \
-  "vmovdqa     64(%[yuvconstants]),%%ymm10                       \n" \
-  "vmovdqa     96(%[yuvconstants]),%%ymm11                       \n" \
-  "vmovdqa     128(%[yuvconstants]),%%ymm12                      \n" \
-  "vmovdqa     160(%[yuvconstants]),%%ymm13                      \n" \
-  "vmovdqa     192(%[yuvconstants]),%%ymm14                      \n"
+#define YUVTORGB_SETUP_AVX2(yuvconstants)                             \
+  "vmovdqa     (%[yuvconstants]),%%ymm8                           \n" \
+  "vmovdqa     32(%[yuvconstants]),%%ymm9                         \n" \
+  "vmovdqa     64(%[yuvconstants]),%%ymm10                        \n" \
+  "vmovdqa     96(%[yuvconstants]),%%ymm11                        \n" \
+  "vmovdqa     128(%[yuvconstants]),%%ymm12                       \n" \
+  "vmovdqa     160(%[yuvconstants]),%%ymm13                       \n" \
+  "vmovdqa     192(%[yuvconstants]),%%ymm14                       \n" \
+  "vbroadcastf128 256(%[yuvconstants]),%%ymm15                    \n" \
+  "vbroadcastf128 272(%[yuvconstants]),%%ymm7                     \n"
 
+// TODO(yuan): Consider signed UV and unsigned coefficient for vpmaddubsw.
 #define YUVTORGB16_AVX2(yuvconstants)                                 \
-  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
-  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
-  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
-  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
-  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
-  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
+  "vpmaddubsw  %%ymm8,%%ymm3,%%ymm0                               \n" \
+  "vpmaddubsw  %%ymm10,%%ymm3,%%ymm2                              \n" \
+  "vpsllw      $8,%%ymm3,%%ymm1                                   \n" \
+  "vpand       %%ymm1,%%ymm15,%%ymm1                              \n" \
+  "vpaddw      %%ymm1,%%ymm0,%%ymm0                               \n" \
+  "vpmaddubsw  %%ymm9,%%ymm3,%%ymm1                               \n" \
   "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
-  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
-  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
-  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
+  "vpand       %%ymm3,%%ymm7,%%ymm3                               \n" \
+  "vpaddw      %%ymm3,%%ymm2,%%ymm2                               \n" \
+  "vpaddw      %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddw      %%ymm4,%%ymm12,%%ymm3                              \n" \
+  "vpaddw      %%ymm4,%%ymm2,%%ymm2                               \n" \
+  "vpsubusw    %%ymm11,%%ymm0,%%ymm0                              \n" \
+  "vpsubusw    %%ymm1,%%ymm3,%%ymm1                               \n" \
+  "vpsubusw    %%ymm13,%%ymm2,%%ymm2                              \n"
 
 #define YUVTORGB_REGS_AVX2 \
-  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
 
 #else  // Convert 16 pixels: 16 UV and 16 Y.
 
 #define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB16_AVX2(yuvconstants)                                 \
-  "vpmaddubsw  64(%[yuvconstants]),%%ymm0,%%ymm2                  \n" \
-  "vpmaddubsw  32(%[yuvconstants]),%%ymm0,%%ymm1                  \n" \
-  "vpmaddubsw  (%[yuvconstants]),%%ymm0,%%ymm0                    \n" \
-  "vmovdqu     160(%[yuvconstants]),%%ymm3                        \n" \
-  "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n" \
-  "vmovdqu     128(%[yuvconstants]),%%ymm3                        \n" \
-  "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n" \
-  "vmovdqu     96(%[yuvconstants]),%%ymm3                         \n" \
-  "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n" \
+#define YUVTORGB16_AVX2(yuvconstants) \
+  "vpmaddubsw  (%[yuvconstants]),%%ymm3,%%ymm0                    \n" \
+  "vpmaddubsw  64(%[yuvconstants]),%%ymm3,%%ymm2                  \n" \
+  "vpsllw      $8,%%ymm3,%%ymm1                                   \n" \
+  "vbroadcastf128 256(%[yuvconstants]),%%ymm7                     \n" \
+  "vpand       %%ymm7,%%ymm1,%%ymm1                               \n" \
+  "vpaddw      %%ymm1,%%ymm0,%%ymm0                               \n" \
+  "vpmaddubsw  32(%[yuvconstants]),%%ymm3,%%ymm1                  \n" \
   "vpmulhuw    192(%[yuvconstants]),%%ymm4,%%ymm4                 \n" \
-  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
-  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
-  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
-#define YUVTORGB_REGS_AVX2
+  "vbroadcastf128 272(%[yuvconstants]),%%ymm7                     \n" \
+  "vpand       %%ymm7,%%ymm3,%%ymm3                               \n" \
+  "vpaddw      %%ymm3,%%ymm2,%%ymm2                               \n" \
+  "vpaddw      %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vmovdqu     128(%[yuvconstants]),%%ymm7                        \n" \
+  "vpaddw      %%ymm4,%%ymm7,%%ymm3                               \n" \
+  "vpaddw      %%ymm4,%%ymm2,%%ymm2                               \n" \
+  "vmovdqu     96(%[yuvconstants]),%%ymm7                         \n" \
+  "vpsubusw    %%ymm7,%%ymm0,%%ymm0                               \n" \
+  "vpsubusw    %%ymm1,%%ymm3,%%ymm1                               \n" \
+  "vmovdqu     160(%[yuvconstants]),%%ymm7                        \n" \
+  "vpsubusw    %%ymm7,%%ymm2,%%ymm2                               \n"
+
+#define YUVTORGB_REGS_AVX2 "xmm7",
 #endif
 
 #define YUVTORGB_AVX2(yuvconstants)                                   \
   YUVTORGB16_AVX2(yuvconstants)                                       \
-  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
-  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
-  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
+  "vpsrlw      $0x6,%%ymm0,%%ymm0                                 \n" \
+  "vpsrlw      $0x6,%%ymm1,%%ymm1                                 \n" \
+  "vpsrlw      $0x6,%%ymm2,%%ymm2                                 \n" \
   "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
   "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
   "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
@@ -3422,15 +3442,12 @@
 
 // Store 16 AR30 values.
 #define STOREAR30_AVX2                                                \
-  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
-  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
-  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
-  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
-  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
-  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
-  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
-  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
-  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
+  "vpsrlw     $0x4,%%ymm0,%%ymm0                                  \n" \
+  "vpsrlw     $0x4,%%ymm1,%%ymm1                                  \n" \
+  "vpsrlw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpminuw    %%ymm6,%%ymm0,%%ymm0                                \n" \
+  "vpminuw    %%ymm6,%%ymm1,%%ymm1                                \n" \
+  "vpminuw    %%ymm6,%%ymm2,%%ymm2                                \n" \
   "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
   "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
@@ -3531,9 +3548,8 @@
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
-      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
-      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm6,%%ymm6              \n"
 
     LABELALIGN
       "1:                                        \n"
@@ -3551,7 +3567,7 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 #endif  // HAS_I422TOAR30ROW_AVX2
@@ -3641,9 +3657,8 @@
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
-      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
-      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm6,%%ymm6              \n"
 
     LABELALIGN
       "1:                                        \n"
@@ -3681,9 +3696,8 @@
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
-      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
-      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm6,%%ymm6              \n"
 
     LABELALIGN
       "1:                                        \n"
@@ -3716,7 +3730,6 @@
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-
     YUVTORGB_SETUP_AVX2(yuvconstants)
       "sub         %[u_buf],%[v_buf]             \n"
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
@@ -3753,7 +3766,6 @@
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(
-
       YUVTORGB_SETUP_AVX2(
       yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
@@ -3790,7 +3802,6 @@
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(
-
       YUVTORGB_SETUP_AVX2(
       yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
@@ -3826,15 +3837,13 @@
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-
     YUVTORGB_SETUP_AVX2(yuvconstants)
       "sub         %[u_buf],%[v_buf]             \n"
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
-      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
-      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm6,%%ymm6              \n"
 
     LABELALIGN
       "1:                                        \n"
@@ -4191,14 +4200,12 @@
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-
     YUVTORGB_SETUP_AVX2(yuvconstants)
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
-      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
-      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm6,%%ymm6              \n"
 
     LABELALIGN
       "1:                                        \n"
@@ -4229,14 +4236,12 @@
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-
     YUVTORGB_SETUP_AVX2(yuvconstants)
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
-      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
-      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm6,%%ymm6              \n"
 
     LABELALIGN
       "1:                                        \n"
diff --git a/source/row_neon.cc b/source/row_neon.cc
index a7b0c51..ccc4af6 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -99,7 +99,6 @@
 // q1: G uint16x8_t
 // q2: R uint16x8_t
 
-#ifdef LIBYUV_UNLIMITED_DATA
 // Convert from YUV to 2.14 fixed point RGB
 #define YUVTORGB                                           \
   "vmull.u16  q2, d1, d31                    \n"           \
@@ -122,28 +121,6 @@
   "vqshrn.u16 d4, q2, #6                     \n" /* R */ \
   "vqshrn.u16 d2, q1, #6                     \n" /* G */ \
   "vqshrn.u16 d0, q0, #6                     \n" /* B */
-#else
-#define YUVTORGB                                          \
-  "vmull.u16  q2, d1, d31                    \n"          \
-  "vmull.u8   q8, d3, d29                    \n"          \
-  "vmull.u16  q0, d0, d31                    \n"          \
-  "vmlal.u8   q8, d2, d28                    \n" /* DG */ \
-  "vqshrun.s32 d0, q0, #16                   \n"          \
-  "vqshrun.s32 d1, q2, #16                   \n" /* Y */  \
-  "vmull.u8   q9, d2, d26                    \n" /* DB */ \
-  "vmull.u8   q1, d3, d27                    \n" /* DR */ \
-  "vqadd.s16  q2, q0, q12                    \n"          \
-  "vqadd.s16  q4, q0, q11                    \n"          \
-  "vqadd.s16  q0, q0, q10                    \n"          \
-  "vqadd.s16  q2, q2, q1                     \n" /* R */  \
-  "vqsub.s16  q1, q4, q8                     \n" /* G */  \
-  "vqadd.s16  q0, q0, q9                     \n" /* B */
-
-#define RGBTORGB8                                        \
-  "vqshrun.s16 d4, q2, #6                    \n" /* R */ \
-  "vqshrun.s16 d2, q1, #6                    \n" /* G */ \
-  "vqshrun.s16 d0, q0, #6                    \n" /* B */
-#endif
 
 #define YUVTORGB_REGS \
   "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 2bf2860..ba6ca5d 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -87,7 +87,6 @@
 // v17.8h: G
 // v18.8h: R
 
-#ifdef LIBYUV_UNLIMITED_DATA
 // Convert from YUV to 2.14 fixed point RGB
 #define YUVTORGB                                          \
   "umull2     v3.4s, v0.8h, v24.8h           \n"          \
@@ -110,28 +109,6 @@
   "uqshrn     v17.8b, v17.8h, #6             \n" \
   "uqshrn     v16.8b, v16.8h, #6             \n" \
   "uqshrn     v18.8b, v18.8h, #6             \n"
-#else
-#define YUVTORGB                                          \
-  "umull2     v3.4s, v0.8h, v24.8h           \n"          \
-  "umull      v6.8h, v1.8b, v30.8b           \n"          \
-  "umull      v0.4s, v0.4h, v24.4h           \n"          \
-  "umlal2     v6.8h, v1.16b, v31.16b         \n" /* DG */ \
-  "sqshrun    v0.4h, v0.4s, #16              \n"          \
-  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */  \
-  "umull      v4.8h, v1.8b, v28.8b           \n" /* DB */ \
-  "umull2     v5.8h, v1.16b, v29.16b         \n" /* DR */ \
-  "sqadd      v17.8h, v0.8h, v26.8h          \n"          \
-  "sqadd      v16.8h, v0.8h, v25.8h          \n"          \
-  "sqadd      v18.8h, v0.8h, v27.8h          \n"          \
-  "sqsub      v17.8h, v17.8h, v6.8h          \n" /* G */  \
-  "sqadd      v16.8h, v16.8h, v4.8h          \n" /* B */  \
-  "sqadd      v18.8h, v18.8h, v5.8h          \n" /* R */
-
-#define RGBTORGB8                                \
-  "sqshrun    v17.8b, v17.8h, #6             \n" \
-  "sqshrun    v16.8b, v16.8h, #6             \n" \
-  "sqshrun    v18.8b, v18.8h, #6             \n"
-#endif
 
 #define YUVTORGB_REGS                                                          \
   "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 10633c2..0b4e04a 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -2376,7 +2376,11 @@
 
   // Test result matches known hash value.
   uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
+#ifdef LIBYUV_UNLIMITED_DATA
+  EXPECT_EQ(dst_argb_hash, 3900633302u);
+#else
   EXPECT_EQ(dst_argb_hash, 2355976473u);
+#endif
 
   free_aligned_buffer_page_end(dst_argb);
 }
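
For reference, a scalar model of the unsigned fixed-point arithmetic the updated AVX2 sequence appears to perform (and which the new LIBYUV_UNLIMITED_DATA hash exercises). This is an illustrative sketch only: the coefficient names mirror the constant-generation macros, the bias terms stand in for the kUVBias{B,G,R} words the kernels subtract, the caller supplies placeholder values rather than libyuv's tables, and the luma byte is assumed to be replicated into both halves of a 16-bit word before the high multiply.

#include <stdint.h>

static uint8_t clamp255(uint32_t x) { return x > 255 ? 255 : (uint8_t)x; }
/* usub stands in for vpsubusw: it saturates at zero instead of wrapping. */
static uint32_t usub(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }

static void yuv_to_rgb_model(uint8_t y, uint8_t u, uint8_t v,
                             uint32_t UB, uint32_t UG, uint32_t VG, uint32_t VR,
                             uint32_t YG, uint32_t YB,
                             uint8_t* b, uint8_t* g, uint8_t* r) {
  /* Luma in 2.6 fixed point: replicated byte times YG, keeping the high word. */
  uint32_t y16 = ((uint32_t)y * 0x0101u * YG) >> 16;
  /* Chroma products stay unsigned; the +/-128 chroma offset is folded into
   * the bias words (assumes UB * 128 >= YB and VR * 128 >= YB, as for the
   * usual tables) and removed with a saturating subtract. */
  uint32_t bias_b = UB * 128 - YB;
  uint32_t bias_g = UG * 128 + VG * 128 + YB;
  uint32_t bias_r = VR * 128 - YB;
  *b = clamp255(usub(y16 + u * UB, bias_b) >> 6);
  *g = clamp255(usub(y16 + bias_g, u * UG + v * VG) >> 6);
  *r = clamp255(usub(y16 + v * VR, bias_r) >> 6);
}
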