libyuv:loongson Correct the optimization of mmi on loongson3a platform.

The aligned load/store instructions (ldc1/sdc1) greatly degrade performance
when the data address is not 8-byte aligned, so replace them with the
unaligned access instruction pairs (gsldrc1/gsldlc1 and gssdrc1/gssdlc1)
on the Loongson platform.

Also delete the optimized function ScaleARGBFilterCols_MMI (and its
ScaleARGBFilterCols_Any_MMI wrapper), because it degraded performance.

BUG=libyuv:804
R=fbarchard@chromium.org

Change-Id: If4c15886a21cdcbac7ae8b336292e4549acf1e47
Reviewed-on: https://chromium-review.googlesource.com/1164627
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 3042136..282d521 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -115,7 +115,6 @@
 #define HAS_FIXEDDIV_MIPS
 #define HAS_SCALEARGBCOLS_MMI
 #define HAS_SCALEARGBCOLSUP2_MMI
-#define HAS_SCALEARGBFILTERCOLS_MMI
 #define HAS_SCALEARGBROWDOWN2_MMI
 #define HAS_SCALEARGBROWDOWNEVEN_MMI
 #define HAS_SCALEROWDOWN2_MMI
@@ -592,21 +591,11 @@
                            int dst_width,
                            int x,
                            int dx);
-void ScaleARGBFilterCols_MMI(uint8_t* dst_argb,
-                             const uint8_t* src_argb,
-                             int dst_width,
-                             int x,
-                             int dx);
 void ScaleARGBCols_MMI(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx);
-void ScaleARGBFilterCols_Any_MMI(uint8_t* dst_ptr,
-                                 const uint8_t* src_ptr,
-                                 int dst_width,
-                                 int x,
-                                 int dx);
 void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
                            const uint8_t* src_ptr,
                            int dst_width,
diff --git a/source/row_mmi.cc b/source/row_mmi.cc
index dab8010..3649952 100644
--- a/source/row_mmi.cc
+++ b/source/row_mmi.cc
@@ -7,10 +7,8 @@
  *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <sys/time.h>
 #include "libyuv/row.h"
 
-#include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
@@ -4492,7 +4490,8 @@
       "psubh     %[y00],        %[y10],          %[y20]  \n\t"
 
       "packushb  %[sobel],      %[sobel],        %[y00]  \n\t"  // clamp255
-      "sdc1      %[sobel],      0(%[dst_sobelx])         \n\t"
+      "gssdrc1   %[sobel],      0(%[dst_sobelx])         \n\t"
+      "gssdlc1   %[sobel],      7(%[dst_sobelx])         \n\t"
 
       "daddiu    %[src_y0],     %[src_y0],      8        \n\t"
       "daddiu    %[src_y1],     %[src_y1],      8        \n\t"
@@ -4587,7 +4586,8 @@
       "psubh     %[y00],        %[y02],         %[y12]  \n\t"
 
       "packushb  %[sobel],      %[sobel],       %[y00]  \n\t"  // clamp255
-      "sdc1      %[sobel],      0(%[dst_sobely])        \n\t"
+      "gssdrc1   %[sobel],      0(%[dst_sobely])        \n\t"
+      "gssdlc1   %[sobel],      7(%[dst_sobely])        \n\t"
 
       "daddiu    %[src_y0],     %[src_y0],      8       \n\t"
       "daddiu    %[src_y1],     %[src_y1],      8       \n\t"
@@ -4624,13 +4624,15 @@
       "punpcklbh %[t1],         %[t0],              %[t0] \n\t"
       "or        %[t1],         %[t1],              %[c1] \n\t"
       // 255 s1 s1 s1 s55 s0 s0 s0
-      "sdc1      %[t1],         0x00(%[dst_argb])	  \n\t"
+      "gssdrc1   %[t1],         0x00(%[dst_argb])	  \n\t"
+      "gssdlc1   %[t1],         0x07(%[dst_argb])         \n\t"
 
       // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
       "punpckhbh %[t1],         %[t0],              %[t0] \n\t"
       "or        %[t1],         %[t1],              %[c1] \n\t"
       // 255 s3 s3 s3 255 s2 s2 s2
-      "sdc1      %[t1],         0x08(%[dst_argb])	  \n\t"
+      "gssdrc1   %[t1],         0x08(%[dst_argb])	  \n\t"
+      "gssdlc1   %[t1],         0x0f(%[dst_argb])         \n\t"
 
       // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
       "punpckhbh %[t0],         %[t2],              %[t2] \n\t"
@@ -4638,12 +4640,14 @@
       // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
       "punpcklbh %[t1],         %[t0],              %[t0] \n\t"
       "or        %[t1],         %[t1],              %[c1] \n\t"
-      "sdc1      %[t1],         0x10(%[dst_argb])	  \n\t"
+      "gssdrc1   %[t1],         0x10(%[dst_argb])	  \n\t"
+      "gssdlc1   %[t1],         0x17(%[dst_argb])         \n\t"
 
       // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
       "punpckhbh %[t1],         %[t0],              %[t0] \n\t"
       "or        %[t1],         %[t1],              %[c1] \n\t"
-      "sdc1      %[t1],         0x18(%[dst_argb])	  \n\t"
+      "gssdrc1   %[t1],         0x18(%[dst_argb])	  \n\t"
+      "gssdlc1   %[t1],         0x1f(%[dst_argb])         \n\t"
 
       "daddiu    %[dst_argb],   %[dst_argb],        32    \n\t"
       "daddiu    %[src_sobelx], %[src_sobelx],      8     \n\t"
@@ -4665,10 +4669,13 @@
   uint64_t tb = 0;
   __asm__ volatile(
       "1:	                                       \n\t"
-      "ldc1    %[tr],         0x0(%[src_sobelx])       \n\t"  // r=src_sobelx[i]
-      "ldc1    %[tb],         0x0(%[src_sobely])       \n\t"  // b=src_sobely[i]
+      "gsldrc1 %[tr],         0x0(%[src_sobelx])       \n\t"
+      "gsldlc1 %[tr],         0x7(%[src_sobelx])       \n\t"  // r=src_sobelx[i]
+      "gsldrc1 %[tb],         0x0(%[src_sobely])       \n\t"
+      "gsldlc1 %[tb],         0x7(%[src_sobely])       \n\t"  // b=src_sobely[i]
       "paddusb %[tr],         %[tr],             %[tb] \n\t"  // g
-      "sdc1    %[tr],         0x0(%[dst_y])	       \n\t"
+      "gssdrc1 %[tr],         0x0(%[dst_y])	       \n\t"
+      "gssdlc1 %[tr],         0x7(%[dst_y])            \n\t"
 
       "daddiu  %[dst_y],      %[dst_y],          8     \n\t"
       "daddiu  %[src_sobelx], %[src_sobelx],     8     \n\t"
@@ -4705,10 +4712,12 @@
       "punpcklbh %[cr],         %[tr],              %[c1] \n\t"
       // c1 r1 g1 b1 c0 r0 g0 b0
       "punpcklhw %[result],     %[gb],              %[cr] \n\t"
-      "sdc1      %[result],     0x00(%[dst_argb])	  \n\t"
+      "gssdrc1   %[result],     0x00(%[dst_argb])	  \n\t"
+      "gssdlc1   %[result],     0x07(%[dst_argb])         \n\t"
       // c3 r3 g3 b3 c2 r2 g2 b2
       "punpckhhw %[result],     %[gb],              %[cr] \n\t"
-      "sdc1      %[result],     0x08(%[dst_argb])	  \n\t"
+      "gssdrc1   %[result],     0x08(%[dst_argb])	  \n\t"
+      "gssdlc1   %[result],     0x0f(%[dst_argb])         \n\t"
 
       // g7 b7 g6 b6 g5 b5 g4 b4
       "punpckhbh %[gb],         %[tb],              %[tg] \n\t"
@@ -4716,10 +4725,12 @@
       "punpckhbh %[cr],         %[tr],              %[c1] \n\t"
       // c5 r5 g5 b5 c4 r4 g4 b4
       "punpcklhw %[result],     %[gb],              %[cr] \n\t"
-      "sdc1      %[result],     0x10(%[dst_argb])	  \n\t"
+      "gssdrc1   %[result],     0x10(%[dst_argb])	  \n\t"
+      "gssdlc1   %[result],     0x17(%[dst_argb])         \n\t"
       // c7 r7 g7 b7 c6 r6 g6 b6
       "punpckhhw %[result],     %[gb],              %[cr] \n\t"
-      "sdc1      %[result],     0x18(%[dst_argb])	  \n\t"
+      "gssdrc1   %[result],     0x18(%[dst_argb])	  \n\t"
+      "gssdlc1   %[result],     0x1f(%[dst_argb])         \n\t"
 
       "daddiu    %[dst_argb],   %[dst_argb],        32    \n\t"
       "daddiu    %[src_sobelx], %[src_sobelx],      8     \n\t"
@@ -4748,12 +4759,14 @@
       "punpcklhw  %[dest],         %[src],            %[src]        \n\t"
       "and        %[dest],         %[dest],           %[mask0]      \n\t"
       "or         %[dest],         %[dest],           %[mask1]      \n\t"
-      "sdc1       %[dest],         0x00(%[dst_ptr])                 \n\t"
+      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"
+      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
 
       "punpckhhw  %[dest],         %[src],            %[src]        \n\t"
       "and        %[dest],         %[dest],           %[mask0]      \n\t"
       "or         %[dest],         %[dest],           %[mask1]      \n\t"
-      "sdc1       %[dest],         0x08(%[dst_ptr])                 \n\t"
+      "gssdrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"
+      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
 
       "daddiu     %[src_ptr],      %[src_ptr],        0x04          \n\t"
       "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
@@ -4955,7 +4968,8 @@
       "gsldlc1 %[temp],  3(%[src])     	       \n\t"
       "gsldrc1 %[temp], -4(%[src])     	       \n\t"
       "pshufh  %[temp],  %[temp],    %[shuff]  \n\t"
-      "sdc1    %[temp],  0x0(%[dst])           \n\t"
+      "gssdrc1 %[temp],  0x0(%[dst])           \n\t"
+      "gssdlc1 %[temp],  0x7(%[dst])           \n\t"
 
       "daddiu  %[src],   %[src],    -0x08      \n\t"
       "daddiu  %[dst],   %[dst],     0x08      \n\t"
@@ -4975,18 +4989,22 @@
   uint64_t shift = 0x08;
   __asm__ volatile(
       "1:	                                    \n\t"
-      "ldc1     %[t0],     0x00(%[src_uv])          \n\t"
-      "ldc1     %[t1],     0x08(%[src_uv])          \n\t"
+      "gsldrc1  %[t0],     0x00(%[src_uv])          \n\t"
+      "gsldlc1  %[t0],     0x07(%[src_uv])          \n\t"
+      "gsldrc1  %[t1],     0x08(%[src_uv])          \n\t"
+      "gsldlc1  %[t1],     0x0f(%[src_uv])          \n\t"
 
       "and      %[t2],     %[t0],          %[c0]    \n\t"
       "and      %[t3],     %[t1],          %[c0]    \n\t"
       "packushb %[t2],     %[t2],          %[t3]    \n\t"
-      "sdc1     %[t2],     0x0(%[dst_u])	    \n\t"
+      "gssdrc1  %[t2],     0x0(%[dst_u])	    \n\t"
+      "gssdlc1  %[t2],     0x7(%[dst_u])            \n\t"
 
       "psrlh    %[t2],     %[t0],          %[shift] \n\t"
       "psrlh    %[t3],     %[t1],          %[shift] \n\t"
       "packushb %[t2],     %[t2],          %[t3]    \n\t"
-      "sdc1     %[t2],     0x0(%[dst_v])            \n\t"
+      "gssdrc1  %[t2],     0x0(%[dst_v])            \n\t"
+      "gssdlc1  %[t2],     0x7(%[dst_v])            \n\t"
 
       "daddiu   %[src_uv], %[src_uv],      16       \n\t"
       "daddiu   %[dst_u],  %[dst_u],       8        \n\t"
@@ -5008,12 +5026,16 @@
   uint64_t temp[3];
   __asm__ volatile(
       "1:	                                 \n\t"
-      "ldc1      %[t0],     0x0(%[src_u])        \n\t"
-      "ldc1      %[t1],     0x0(%[src_v])        \n\t"
+      "gsldrc1   %[t0],     0x0(%[src_u])        \n\t"
+      "gsldlc1   %[t0],     0x7(%[src_u])        \n\t"
+      "gsldrc1   %[t1],     0x0(%[src_v])        \n\t"
+      "gsldlc1   %[t1],     0x7(%[src_v])        \n\t"
       "punpcklbh %[t2],     %[t0],         %[t1] \n\t"
-      "sdc1      %[t2],     0x0(%[dst_uv])	 \n\t"
+      "gssdrc1   %[t2],     0x0(%[dst_uv])	 \n\t"
+      "gssdlc1   %[t2],     0x7(%[dst_uv])       \n\t"
       "punpckhbh %[t2],     %[t0],         %[t1] \n\t"
-      "sdc1      %[t2],     0x8(%[dst_uv])	 \n\t"
+      "gssdrc1   %[t2],     0x8(%[dst_uv])	 \n\t"
+      "gssdlc1   %[t2],     0xf(%[dst_uv])       \n\t"
 
       "daddiu    %[src_u],  %[src_u],      8     \n\t"
       "daddiu    %[src_v],  %[src_v],      8     \n\t"
@@ -5149,13 +5171,17 @@
   uint64_t src_stride = 0x0;
   __asm__ volatile(
       "1:	                                                     \n\t"
-      "ldc1     %[t0],         0x00(%[src_yuy2])                     \n\t"
+      "gsldrc1  %[t0],         0x00(%[src_yuy2])                     \n\t"
+      "gsldlc1  %[t0],         0x07(%[src_yuy2])                     \n\t"
       "daddu    %[src_stride], %[src_yuy2],       %[src_stride_yuy2] \n\t"
-      "ldc1     %[t1],         0x00(%[src_stride])                   \n\t"
-      "pavgb    %[t0],         %[t0], %[t1]                          \n\t"
+      "gsldrc1  %[t1],         0x00(%[src_stride])                   \n\t"
+      "gsldlc1  %[t1],         0x07(%[src_stride])                   \n\t"
+      "pavgb    %[t0],         %[t0],             %[t1]              \n\t"
 
-      "ldc1     %[t2],         0x08(%[src_yuy2])                     \n\t"
-      "ldc1     %[t1],         0x08(%[src_stride])                   \n\t"
+      "gsldrc1  %[t2],         0x08(%[src_yuy2])                     \n\t"
+      "gsldlc1  %[t2],         0x0f(%[src_yuy2])                     \n\t"
+      "gsldrc1  %[t1],         0x08(%[src_stride])                   \n\t"
+      "gsldlc1  %[t1],         0x0f(%[src_stride])                   \n\t"
       "pavgb    %[t1],         %[t2],             %[t1]              \n\t"
 
       "and      %[t0],         %[t0],             %[c0]              \n\t"
@@ -5167,12 +5193,16 @@
       "and      %[d0],         %[t0],             %[c1]              \n\t"
       "psrlh    %[d1],         %[t1],             %[shift]           \n\t"
 
-      "ldc1     %[t0],         0x10(%[src_yuy2])                     \n\t"
-      "ldc1     %[t1],         0x10(%[src_stride])                   \n\t"
+      "gsldrc1  %[t0],         0x10(%[src_yuy2])                     \n\t"
+      "gsldlc1  %[t0],         0x17(%[src_yuy2])                     \n\t"
+      "gsldrc1  %[t1],         0x10(%[src_stride])                   \n\t"
+      "gsldlc1  %[t1],         0x17(%[src_stride])                   \n\t"
       "pavgb    %[t0],         %[t0],              %[t1]             \n\t"
 
-      "ldc1     %[t2],         0x18(%[src_yuy2])                     \n\t"
-      "ldc1     %[t1],         0x18(%[src_stride])                   \n\t"
+      "gsldrc1  %[t2],         0x18(%[src_yuy2])                     \n\t"
+      "gsldlc1  %[t2],         0x1f(%[src_yuy2])                     \n\t"
+      "gsldrc1  %[t1],         0x18(%[src_stride])                   \n\t"
+      "gsldlc1  %[t1],         0x1f(%[src_stride])                   \n\t"
       "pavgb    %[t1],         %[t2],              %[t1]             \n\t"
 
       "and      %[t0],         %[t0],              %[c0]             \n\t"
@@ -5186,8 +5216,10 @@
 
       "packushb %[d0],         %[d0],              %[d2]             \n\t"
       "packushb %[d1],         %[d1],              %[d3]             \n\t"
-      "sdc1     %[d0],         0x0(%[dst_u])	                     \n\t"
-      "sdc1     %[d1],         0x0(%[dst_v])	                     \n\t"
+      "gssdrc1  %[d0],         0x0(%[dst_u])	                     \n\t"
+      "gssdlc1  %[d0],         0x7(%[dst_u])                         \n\t"
+      "gssdrc1  %[d1],         0x0(%[dst_v])	                     \n\t"
+      "gssdlc1  %[d1],         0x7(%[dst_v])                         \n\t"
       "daddiu   %[src_yuy2],   %[src_yuy2],        32                \n\t"
       "daddiu   %[dst_u],      %[dst_u],           8                 \n\t"
       "daddiu   %[dst_v],      %[dst_v],           8                 \n\t"
@@ -5215,8 +5247,10 @@
   uint64_t shift = 0x08;
   __asm__ volatile(
       "1:	                                        \n\t"
-      "ldc1     %[t0],       0x00(%[src_yuy2])          \n\t"
-      "ldc1     %[t1],       0x08(%[src_yuy2])          \n\t"
+      "gsldrc1  %[t0],       0x00(%[src_yuy2])          \n\t"
+      "gsldlc1  %[t0],       0x07(%[src_yuy2])          \n\t"
+      "gsldrc1  %[t1],       0x08(%[src_yuy2])          \n\t"
+      "gsldlc1  %[t1],       0x0f(%[src_yuy2])          \n\t"
       "and      %[t0],       %[t0],            %[c0]    \n\t"
       "and      %[t1],       %[t1],            %[c0]    \n\t"
       "psrlh    %[t0],       %[t0],            %[shift] \n\t"
@@ -5226,8 +5260,10 @@
       "and      %[d0],       %[t0],            %[c1]    \n\t"
       "psrlh    %[d1],       %[t1],            %[shift] \n\t"
 
-      "ldc1     %[t0],       0x10(%[src_yuy2])          \n\t"
-      "ldc1     %[t1],       0x18(%[src_yuy2])          \n\t"
+      "gsldrc1  %[t0],       0x10(%[src_yuy2])          \n\t"
+      "gsldlc1  %[t0],       0x17(%[src_yuy2])          \n\t"
+      "gsldrc1  %[t1],       0x18(%[src_yuy2])          \n\t"
+      "gsldlc1  %[t1],       0x1f(%[src_yuy2])          \n\t"
       "and      %[t0],       %[t0],            %[c0]    \n\t"
       "and      %[t1],       %[t1],            %[c0]    \n\t"
       "psrlh    %[t0],       %[t0],            %[shift] \n\t"
@@ -5239,8 +5275,10 @@
 
       "packushb %[d0],       %[d0],            %[d2]    \n\t"
       "packushb %[d1],       %[d1],            %[d3]    \n\t"
-      "sdc1     %[d0],       0x0(%[dst_u])	        \n\t"
-      "sdc1     %[d1],       0x0(%[dst_v])	        \n\t"
+      "gssdrc1  %[d0],       0x0(%[dst_u])	        \n\t"
+      "gssdlc1  %[d0],       0x7(%[dst_u])              \n\t"
+      "gssdrc1  %[d1],       0x0(%[dst_v])	        \n\t"
+      "gssdlc1  %[d1],       0x7(%[dst_v])              \n\t"
       "daddiu   %[src_yuy2], %[src_yuy2],      32       \n\t"
       "daddiu   %[dst_u],    %[dst_u],         8        \n\t"
       "daddiu   %[dst_v],    %[dst_v],         8        \n\t"
@@ -5256,17 +5294,19 @@
 
 // Copy row of YUY2 Y's (422) into Y (420/422).
 void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
-  // Output a row of UV values, filtering 2 rows of YUY2.
   uint64_t c0 = 0x00ff00ff00ff00ff;
   uint64_t temp[2];
   __asm__ volatile(
       "1:	                                     \n\t"
-      "ldc1     %[t0],       0x00(%[src_yuy2])       \n\t"
-      "ldc1     %[t1],       0x08(%[src_yuy2])       \n\t"
+      "gsldrc1  %[t0],       0x00(%[src_yuy2])       \n\t"
+      "gsldlc1  %[t0],       0x07(%[src_yuy2])       \n\t"
+      "gsldrc1  %[t1],       0x08(%[src_yuy2])       \n\t"
+      "gsldlc1  %[t1],       0x0f(%[src_yuy2])       \n\t"
       "and      %[t0],       %[t0],            %[c0] \n\t"
       "and      %[t1],       %[t1],            %[c0] \n\t"
       "packushb %[t0],       %[t0],            %[t1] \n\t"
-      "sdc1     %[t0],       0x0(%[dst_y])	     \n\t"
+      "gssdrc1  %[t0],       0x0(%[dst_y])	     \n\t"
+      "gssdlc1  %[t0],       0x7(%[dst_y])           \n\t"
       "daddiu   %[src_yuy2], %[src_yuy2],      16    \n\t"
       "daddiu   %[dst_y],    %[dst_y],         8     \n\t"
       "daddiu   %[width],    %[width],        -8     \n\t"
@@ -5292,13 +5332,17 @@
   uint64_t src_stride = 0x0;
   __asm__ volatile(
       "1:	                                                      \n\t"
-      "ldc1     %[t0],         0x00(%[src_uyvy])                      \n\t"
+      "gsldrc1  %[t0],         0x00(%[src_uyvy])                      \n\t"
+      "gsldlc1  %[t0],         0x07(%[src_uyvy])                      \n\t"
       "daddu    %[src_stride], %[src_uyvy],        %[src_stride_uyvy] \n\t"
-      "ldc1     %[t1],         0x00(%[src_stride])                    \n\t"
+      "gsldrc1  %[t1],         0x00(%[src_stride])                    \n\t"
+      "gsldlc1  %[t1],         0x07(%[src_stride])                    \n\t"
       "pavgb    %[t0],         %[t0],              %[t1]              \n\t"
 
-      "ldc1     %[t2],         0x08(%[src_uyvy])                      \n\t"
-      "ldc1     %[t1],         0x08(%[src_stride])                    \n\t"
+      "gsldrc1  %[t2],         0x08(%[src_uyvy])                      \n\t"
+      "gsldlc1  %[t2],         0x0f(%[src_uyvy])                      \n\t"
+      "gsldrc1  %[t1],         0x08(%[src_stride])                    \n\t"
+      "gsldlc1  %[t1],         0x0f(%[src_stride])                    \n\t"
       "pavgb    %[t1],         %[t2],              %[t1]              \n\t"
 
       "and      %[t0],         %[t0],              %[c0]              \n\t"
@@ -5308,12 +5352,16 @@
       "and      %[d0],         %[t0],              %[c0]              \n\t"
       "psrlh    %[d1],         %[t1],              %[shift]           \n\t"
 
-      "ldc1     %[t0],         0x10(%[src_uyvy])                      \n\t"
-      "ldc1     %[t1],         0x10(%[src_stride])                    \n\t"
+      "gsldrc1  %[t0],         0x10(%[src_uyvy])                      \n\t"
+      "gsldlc1  %[t0],         0x17(%[src_uyvy])                      \n\t"
+      "gsldrc1  %[t1],         0x10(%[src_stride])                    \n\t"
+      "gsldlc1  %[t1],         0x17(%[src_stride])                    \n\t"
       "pavgb    %[t0],         %[t0],              %[t1]              \n\t"
 
-      "ldc1     %[t2],         0x18(%[src_uyvy])                      \n\t"
-      "ldc1     %[t1],         0x18(%[src_stride])                    \n\t"
+      "gsldrc1  %[t2],         0x18(%[src_uyvy])                      \n\t"
+      "gsldlc1  %[t2],         0x1f(%[src_uyvy])                      \n\t"
+      "gsldrc1  %[t1],         0x18(%[src_stride])                    \n\t"
+      "gsldlc1  %[t1],         0x1f(%[src_stride])                    \n\t"
       "pavgb    %[t1],         %[t2],              %[t1]              \n\t"
 
       "and      %[t0],         %[t0],              %[c0]              \n\t"
@@ -5325,8 +5373,10 @@
 
       "packushb %[d0],         %[d0],              %[d2]              \n\t"
       "packushb %[d1],         %[d1],              %[d3]              \n\t"
-      "sdc1     %[d0],         0x0(%[dst_u])	                      \n\t"
-      "sdc1     %[d1],         0x0(%[dst_v])	                      \n\t"
+      "gssdrc1  %[d0],         0x0(%[dst_u])	                      \n\t"
+      "gssdlc1  %[d0],         0x7(%[dst_u])                          \n\t"
+      "gssdrc1  %[d1],         0x0(%[dst_v])	                      \n\t"
+      "gssdlc1  %[d1],         0x7(%[dst_v])                          \n\t"
       "daddiu   %[src_uyvy],   %[src_uyvy],        32                 \n\t"
       "daddiu   %[dst_u],      %[dst_u],           8                  \n\t"
       "daddiu   %[dst_v],      %[dst_v],           8                  \n\t"
@@ -5354,8 +5404,10 @@
   uint64_t shift = 0x08;
   __asm__ volatile(
       "1:	                                        \n\t"
-      "ldc1     %[t0],       0x00(%[src_uyvy])          \n\t"
-      "ldc1     %[t1],       0x08(%[src_uyvy])          \n\t"
+      "gsldrc1  %[t0],       0x00(%[src_uyvy])          \n\t"
+      "gsldlc1  %[t0],       0x07(%[src_uyvy])          \n\t"
+      "gsldrc1  %[t1],       0x08(%[src_uyvy])          \n\t"
+      "gsldlc1  %[t1],       0x0f(%[src_uyvy])          \n\t"
       "and      %[t0],       %[t0],            %[c0]    \n\t"
       "and      %[t1],       %[t1],            %[c0]    \n\t"
       "packushb %[t0],       %[t0],            %[t1]    \n\t"
@@ -5363,8 +5415,10 @@
       "and      %[d0],       %[t0],            %[c0]    \n\t"
       "psrlh    %[d1],       %[t1],            %[shift] \n\t"
 
-      "ldc1     %[t0],       0x10(%[src_uyvy])          \n\t"
-      "ldc1     %[t1],       0x18(%[src_uyvy])          \n\t"
+      "gsldrc1  %[t0],       0x10(%[src_uyvy])          \n\t"
+      "gsldlc1  %[t0],       0x17(%[src_uyvy])          \n\t"
+      "gsldrc1  %[t1],       0x18(%[src_uyvy])          \n\t"
+      "gsldlc1  %[t1],       0x1f(%[src_uyvy])          \n\t"
       "and      %[t0],       %[t0],            %[c0]    \n\t"
       "and      %[t1],       %[t1],            %[c0]    \n\t"
       "packushb %[t0],       %[t0],            %[t1]    \n\t"
@@ -5374,8 +5428,10 @@
 
       "packushb %[d0],       %[d0],            %[d2]    \n\t"
       "packushb %[d1],       %[d1],            %[d3]    \n\t"
-      "sdc1     %[d0],       0x0(%[dst_u])	        \n\t"
-      "sdc1     %[d1],       0x0(%[dst_v])	        \n\t"
+      "gssdrc1  %[d0],       0x0(%[dst_u])	        \n\t"
+      "gssdlc1  %[d0],       0x7(%[dst_u])              \n\t"
+      "gssdrc1  %[d1],       0x0(%[dst_v])	        \n\t"
+      "gssdlc1  %[d1],       0x7(%[dst_v])              \n\t"
       "daddiu   %[src_uyvy], %[src_uyvy],      32       \n\t"
       "daddiu   %[dst_u],    %[dst_u],         8        \n\t"
       "daddiu   %[dst_v],    %[dst_v],         8        \n\t"
@@ -5397,15 +5453,18 @@
   uint64_t temp[2];
   __asm__ volatile(
       "1:	                                        \n\t"
-      "ldc1     %[t0],       0x00(%[src_uyvy])          \n\t"
-      "ldc1     %[t1],       0x08(%[src_uyvy])          \n\t"
+      "gsldrc1  %[t0],       0x00(%[src_uyvy])          \n\t"
+      "gsldlc1  %[t0],       0x07(%[src_uyvy])          \n\t"
+      "gsldrc1  %[t1],       0x08(%[src_uyvy])          \n\t"
+      "gsldlc1  %[t1],       0x0f(%[src_uyvy])          \n\t"
       "dsrl     %[t0],       %[t0],            %[shift] \n\t"
       "dsrl     %[t1],       %[t1],            %[shift] \n\t"
       "and      %[t0],       %[t0],            %[c0]    \n\t"
       "and      %[t1],       %[t1],            %[c0]    \n\t"
       "and      %[t1],       %[t1],            %[c0]    \n\t"
       "packushb %[t0],       %[t0],            %[t1]    \n\t"
-      "sdc1     %[t0],       0x0(%[dst_y])	        \n\t"
+      "gssdrc1  %[t0],       0x0(%[dst_y])	        \n\t"
+      "gssdlc1  %[t0],       0x7(%[dst_y])              \n\t"
       "daddiu   %[src_uyvy], %[src_uyvy],      16       \n\t"
       "daddiu   %[dst_y],    %[dst_y],         8        \n\t"
       "daddiu   %[width],    %[width],        -8        \n\t"
@@ -5670,19 +5729,22 @@
     uint64_t uv = 0x0;
     uint64_t uv_stride = 0x0;
     __asm__ volatile(
-        "1:	                                           \n\t"
-        "ldc1   %[uv],        0x0(%[src_ptr])              \n\t"
-        "daddu  $t0,          %[src_ptr],     %[stride]    \n\t"
-        "ldc1   %[uv_stride], 0x0($t0)                     \n\t"
+        "1:	                                            \n\t"
+        "gsldrc1 %[uv],        0x0(%[src_ptr])              \n\t"
+        "gsldlc1 %[uv],        0x7(%[src_ptr])              \n\t"
+        "daddu   $t0,          %[src_ptr],     %[stride]    \n\t"
+        "gsldrc1 %[uv_stride], 0x0($t0)                     \n\t"
+        "gsldlc1 %[uv_stride], 0x7($t0)                     \n\t"
 
-        "pavgb  %[uv],        %[uv],          %[uv_stride] \n\t"
-        "sdc1   %[uv],        0x0(%[dst_ptr])              \n\t"
+        "pavgb   %[uv],        %[uv],          %[uv_stride] \n\t"
+        "gssdrc1 %[uv],        0x0(%[dst_ptr])              \n\t"
+        "gssdlc1 %[uv],        0x7(%[dst_ptr])              \n\t"
 
-        "daddiu %[src_ptr],   %[src_ptr],     8            \n\t"
-        "daddiu %[dst_ptr],   %[dst_ptr],     8            \n\t"
-        "daddiu %[width],     %[width],      -8            \n\t"
-        "bgtz   %[width],     1b                           \n\t"
-        "nop                                               \n\t"
+        "daddiu  %[src_ptr],   %[src_ptr],     8            \n\t"
+        "daddiu  %[dst_ptr],   %[dst_ptr],     8            \n\t"
+        "daddiu  %[width],     %[width],      -8            \n\t"
+        "bgtz    %[width],     1b                           \n\t"
+        "nop                                                \n\t"
         : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
         : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
           [stride] "r"((int64_t)src_stride)
@@ -5700,10 +5762,12 @@
       "pshufh    %[fy1],      %[fy1],          %[zero]  \n\t"
       "psubh     %[fy0],      %[fy0],          %[fy1]   \n\t"
       "1:	                                        \n\t"
-      "ldc1      %[t0],       0x0(%[src_ptr])           \n\t"
+      "gsldrc1   %[t0],       0x0(%[src_ptr])           \n\t"
+      "gsldlc1   %[t0],       0x7(%[src_ptr])           \n\t"
       "punpcklbh %[d0],       %[t0],           %[zero]  \n\t"
       "punpckhbh %[d1],       %[t0],           %[zero]  \n\t"
-      "ldc1      %[t0],       0x0(%[src_ptr1])          \n\t"
+      "gsldrc1   %[t0],       0x0(%[src_ptr1])          \n\t"
+      "gsldlc1   %[t0],       0x7(%[src_ptr1])          \n\t"
       "punpcklbh %[d2],       %[t0],           %[zero]  \n\t"
       "punpckhbh %[d3],       %[t0],           %[zero]  \n\t"
 
@@ -5720,7 +5784,8 @@
       "psrlh     %[d1],       %[d1],           %[shift] \n\t"
 
       "packushb  %[d0],       %[d0],           %[d1]    \n\t"
-      "sdc1      %[d0],       0x0(%[dst_ptr])           \n\t"
+      "gssdrc1   %[d0],       0x0(%[dst_ptr])           \n\t"
+      "gssdlc1   %[d0],       0x7(%[dst_ptr])           \n\t"
       "daddiu    %[src_ptr],  %[src_ptr],      8        \n\t"
       "daddiu    %[src_ptr1], %[src_ptr1],     8        \n\t"
       "daddiu    %[dst_ptr],  %[dst_ptr],      8        \n\t"
diff --git a/source/scale_mmi.cc b/source/scale_mmi.cc
index 4757d89..e12c6bb 100644
--- a/source/scale_mmi.cc
+++ b/source/scale_mmi.cc
@@ -38,10 +38,12 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x07(%[src_ptr])                 \n\t"
       "psrlh      %[src0],         %[src0],           %[shift]      \n\t"
 
-      "ldc1       %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x0f(%[src_ptr])                 \n\t"
       "psrlh      %[src1],         %[src1],           %[shift]      \n\t"
 
       "packushb   %[dest],         %[src0],           %[src1]       \n\t"
@@ -72,9 +74,11 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],          0x00(%[src_ptr])                \n\t"
+      "gsldrc1    %[src0],          0x00(%[src_ptr])                \n\t"
+      "gsldlc1    %[src0],          0x07(%[src_ptr])                \n\t"
       "and        %[dest0],         %[src0],          %[mask]       \n\t"
-      "ldc1       %[src1],          0x08(%[src_ptr])                \n\t"
+      "gsldrc1    %[src1],          0x08(%[src_ptr])                \n\t"
+      "gsldlc1    %[src1],          0x0f(%[src_ptr])                \n\t"
       "and        %[dest1],         %[src1],          %[mask]       \n\t"
       "packushb   %[dest0],         %[dest0],         %[dest1]      \n\t"
 
@@ -114,11 +118,13 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[s0],            0x00(%[s])                      \n\t"
+      "gsldrc1    %[s0],            0x00(%[s])                      \n\t"
+      "gsldlc1    %[s0],            0x07(%[s])                      \n\t"
       "psrlh      %[s1],            %[s0],            %[shift1]     \n\t"
       "and        %[s0],            %[s0],            %[mask]       \n\t"
 
-      "ldc1       %[t0],            0x00(%[t])                      \n\t"
+      "gsldrc1    %[t0],            0x00(%[t])                      \n\t"
+      "gsldlc1    %[t0],            0x07(%[t])                      \n\t"
       "psrlh      %[t1],            %[t0],            %[shift1]     \n\t"
       "and        %[t0],            %[t0],            %[mask]       \n\t"
 
@@ -128,11 +134,13 @@
       "paddh      %[dest0],         %[dest0],         %[ph]         \n\t"
       "psrlh      %[dest0],         %[dest0],         %[shift0]     \n\t"
 
-      "ldc1       %[s0],            0x08(%[s])                      \n\t"
+      "gsldrc1    %[s0],            0x08(%[s])                      \n\t"
+      "gsldlc1    %[s0],            0x0f(%[s])                      \n\t"
       "psrlh      %[s1],            %[s0],            %[shift1]     \n\t"
       "and        %[s0],            %[s0],            %[mask]       \n\t"
 
-      "ldc1       %[t0],            0x08(%[t])                      \n\t"
+      "gsldrc1    %[t0],            0x08(%[t])                      \n\t"
+      "gsldlc1    %[t0],            0x0f(%[t])                      \n\t"
       "psrlh      %[t1],            %[t0],            %[shift1]     \n\t"
       "and        %[t0],            %[t0],            %[mask]       \n\t"
 
@@ -172,8 +180,10 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],         0x00(%[src_ptr])                 \n\t"
-      "ldc1       %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x07(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x0f(%[src_ptr])                 \n\t"
       "punpckhwd  %[dest],         %[src0],           %[src1]       \n\t"
 
       "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
@@ -237,12 +247,14 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[s0],            0x00(%[s])                      \n\t"
+      "gsldrc1    %[s0],            0x00(%[s])                      \n\t"
+      "gsldlc1    %[s0],            0x07(%[s])                      \n\t"
       "punpcklbh  %[s_lo],          %[s0],           %[mask]        \n\t"
       "punpckhbh  %[s_hi],          %[s0],           %[mask]        \n\t"
       "paddh      %[dest_lo],       %[s_lo],         %[s_hi]        \n\t"
 
-      "ldc1       %[t0],            0x00(%[t])                      \n\t"
+      "gsldrc1    %[t0],            0x00(%[t])                      \n\t"
+      "gsldlc1    %[t0],            0x07(%[t])                      \n\t"
       "punpcklbh  %[t_lo],          %[t0],           %[mask]        \n\t"
       "punpckhbh  %[t_hi],          %[t0],           %[mask]        \n\t"
       "paddh      %[dest_lo],       %[dest_lo],      %[t_lo]        \n\t"
@@ -251,12 +263,14 @@
       "paddh      %[dest_lo],      %[dest_lo],       %[ph]          \n\t"
       "psrlh      %[dest_lo],      %[dest_lo],       %[shfit]       \n\t"
 
-      "ldc1       %[s0],            0x08(%[s])                      \n\t"
+      "gsldrc1    %[s0],            0x08(%[s])                      \n\t"
+      "gsldlc1    %[s0],            0x0f(%[s])                      \n\t"
       "punpcklbh  %[s_lo],          %[s0],           %[mask]        \n\t"
       "punpckhbh  %[s_hi],          %[s0],           %[mask]        \n\t"
       "paddh      %[dest_hi],       %[s_lo],         %[s_hi]        \n\t"
 
-      "ldc1       %[t0],            0x08(%[t])                      \n\t"
+      "gsldrc1    %[t0],            0x08(%[t])                      \n\t"
+      "gsldlc1    %[t0],            0x0f(%[t])                      \n\t"
       "punpcklbh  %[t_lo],          %[t0],           %[mask]        \n\t"
       "punpckhbh  %[t_hi],          %[t0],           %[mask]        \n\t"
       "paddh      %[dest_hi],       %[dest_hi],      %[t_lo]        \n\t"
@@ -293,10 +307,12 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x07(%[src_ptr])                 \n\t"
       "psrlw      %[src0],         %[src0],           %[shift]      \n\t"
 
-      "ldc1       %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x0f(%[src_ptr])                 \n\t"
       "psrlw      %[src1],         %[src1],           %[shift]      \n\t"
 
       "packsswh   %[dest],         %[src0],           %[src1]       \n\t"
@@ -324,8 +340,10 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],         0x00(%[src_ptr])                 \n\t"
-      "ldc1       %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x07(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x0f(%[src_ptr])                 \n\t"
       "punpcklhw  %[dest_lo],      %[src0],           %[src1]       \n\t"
       "punpckhhw  %[dest_hi],      %[src0],           %[src1]       \n\t"
 
@@ -364,11 +382,13 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[s0],            0x00(%[s])                      \n\t"
+      "gsldrc1    %[s0],            0x00(%[s])                      \n\t"
+      "gsldlc1    %[s0],            0x07(%[s])                      \n\t"
       "psrlw      %[s1],            %[s0],            %[shift0]     \n\t"
       "and        %[s0],            %[s0],            %[mask]       \n\t"
 
-      "ldc1       %[t0],            0x00(%[t])                      \n\t"
+      "gsldrc1    %[t0],            0x00(%[t])                      \n\t"
+      "gsldlc1    %[t0],            0x07(%[t])                      \n\t"
       "psrlw      %[t1],            %[t0],            %[shift0]     \n\t"
       "and        %[t0],            %[t0],            %[mask]       \n\t"
 
@@ -378,11 +398,13 @@
       "paddw      %[dest0],         %[dest0],         %[ph]         \n\t"
       "psrlw      %[dest0],         %[dest0],         %[shift1]     \n\t"
 
-      "ldc1       %[s0],            0x08(%[s])                      \n\t"
+      "gsldrc1    %[s0],            0x08(%[s])                      \n\t"
+      "gsldlc1    %[s0],            0x0f(%[s])                      \n\t"
       "psrlw      %[s1],            %[s0],            %[shift0]     \n\t"
       "and        %[s0],            %[s0],            %[mask]       \n\t"
 
-      "ldc1       %[t0],            0x08(%[t])                      \n\t"
+      "gsldrc1    %[t0],            0x08(%[t])                      \n\t"
+      "gsldlc1    %[t0],            0x0f(%[t])                      \n\t"
       "psrlw      %[t1],            %[t0],            %[shift0]     \n\t"
       "and        %[t0],            %[t0],            %[mask]       \n\t"
 
@@ -425,18 +447,22 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x07(%[src_ptr])                 \n\t"
       "psrlw      %[src0],         %[src0],           %[shift]      \n\t"
       "and        %[src0],         %[src0],           %[mask]       \n\t"
-      "ldc1       %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x0f(%[src_ptr])                 \n\t"
       "psrlw      %[src1],         %[src1],           %[shift]      \n\t"
       "and        %[src1],         %[src1],           %[mask]       \n\t"
       "packsswh   %[dest_lo],      %[src0],           %[src1]       \n\t"
 
-      "ldc1       %[src0],         0x10(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x10(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x17(%[src_ptr])                 \n\t"
       "psrlw      %[src0],         %[src0],           %[shift]      \n\t"
       "and        %[src0],         %[src0],           %[mask]       \n\t"
-      "ldc1       %[src1],         0x18(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x18(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x1f(%[src_ptr])                 \n\t"
       "psrlw      %[src1],         %[src1],           %[shift]      \n\t"
       "and        %[src1],         %[src1],           %[mask]       \n\t"
       "packsswh   %[dest_hi],      %[src0],           %[src1]       \n\t"
@@ -469,13 +495,17 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],         0x00(%[src_ptr])                 \n\t"
-      "ldc1       %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x07(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x08(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x0f(%[src_ptr])                 \n\t"
       "punpckhhw  %[dest_lo],      %[src0],           %[src1]       \n\t"
       "punpcklhw  %[dest_lo],      %[dest_lo],        %[mask]       \n\t"
 
-      "ldc1       %[src0],         0x10(%[src_ptr])                 \n\t"
-      "ldc1       %[src1],         0x18(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src0],         0x10(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src0],         0x17(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src1],         0x18(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src1],         0x1f(%[src_ptr])                 \n\t"
       "punpckhhw  %[dest_hi],      %[src0],           %[src1]       \n\t"
       "punpcklhw  %[dest_hi],      %[dest_hi],        %[mask]       \n\t"
 
@@ -691,7 +721,8 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src],          0x00(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
 
       "punpcklhw  %[dest],         %[src],            %[src]        \n\t"
       "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
@@ -721,9 +752,11 @@
       "punpcklbh  %[src_lo],       %[src],            %[mask]       \n\t"
       "punpckhbh  %[src_hi],       %[src],            %[mask]       \n\t"
 
-      "ldc1       %[dest0],        0x00(%[dst_ptr])                 \n\t"
+      "gsldrc1    %[dest0],        0x00(%[dst_ptr])                 \n\t"
+      "gsldlc1    %[dest0],        0x07(%[dst_ptr])                 \n\t"
       "paddush    %[dest0],        %[dest0],          %[src_lo]     \n\t"
-      "ldc1       %[dest1],        0x08(%[dst_ptr])                 \n\t"
+      "gsldrc1    %[dest1],        0x08(%[dst_ptr])                 \n\t"
+      "gsldlc1    %[dest1],        0x0f(%[dst_ptr])                 \n\t"
       "paddush    %[dest1],        %[dest1],          %[src_hi]     \n\t"
 
       "gssdlc1    %[dest0],        0x07(%[dst_ptr])                 \n\t"
@@ -750,16 +783,19 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src],          0x00(%[src_ptr])                 \n\t"
+      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
+      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
       "punpcklhw  %[src_lo],       %[src],            %[mask]       \n\t"
       "punpckhhw  %[src_hi],       %[src],            %[mask]       \n\t"
 
-      "ldc1       %[dest0],        0x00(%[dst_ptr])                 \n\t"
+      "gsldrc1    %[dest0],        0x00(%[dst_ptr])                 \n\t"
+      "gsldlc1    %[dest0],        0x07(%[dst_ptr])                 \n\t"
       "paddw      %[dest0],        %[dest0],          %[src_lo]     \n\t"
       "gssdlc1    %[dest0],        0x07(%[dst_ptr])                 \n\t"
       "gssdrc1    %[dest0],        0x00(%[dst_ptr])                 \n\t"
 
-      "ldc1       %[dest1],        0x08(%[dst_ptr])                 \n\t"
+      "gsldrc1    %[dest1],        0x08(%[dst_ptr])                 \n\t"
+      "gsldlc1    %[dest1],        0x0f(%[dst_ptr])                 \n\t"
       "paddw      %[dest1],        %[dest1],          %[src_hi]     \n\t"
       "gssdlc1    %[dest1],        0x0f(%[dst_ptr])                 \n\t"
       "gssdrc1    %[dest1],        0x08(%[dst_ptr])                 \n\t"
@@ -922,7 +958,8 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src],           0x00(%[src_ptr])                \n\t"
+      "gsldrc1    %[src],           0x00(%[src_ptr])                \n\t"
+      "gsldlc1    %[src],           0x07(%[src_ptr])                \n\t"
       "punpcklwd  %[dest0],         %[src],           %[src]        \n\t"
       "gssdlc1    %[dest0],         0x07(%[dst_ptr])                \n\t"
       "gssdrc1    %[dest0],         0x00(%[dst_ptr])                \n\t"
@@ -939,67 +976,6 @@
       : "memory");
 }
 
-void ScaleARGBFilterCols_MMI(uint8_t* dst_argb,
-                             const uint8_t* src_argb,
-                             int dst_width,
-                             int x,
-                             int dx) {
-  uint64_t dest, src, src_hi, src_lo;
-  int xi, xf, nxf;
-  int64_t fxf, fnxf;
-
-  const uint8_t* src_ptr = src_argb;
-
-  const uint64_t mask0 = 0;
-  const uint64_t mask1 = 0x7fULL;
-
-  const uint64_t shift2 = 2;
-  const uint64_t shift9 = 9;
-  const uint64_t shift7 = 7;
-  const uint64_t shift16 = 16;
-
-  __asm__ volatile(
-      "1:                                                           \n\t"
-      "dsrl       %[xi],            %[x],             %[shift16]    \n\t"
-      "dsll       %[xi],            %[xi],            %[shift2]     \n\t"
-
-      "dadd       %[src_ptr],       %[src_argb],      %[xi]         \n\t"
-      "ldc1       %[src],           0x00(%[src_ptr])                \n\t"
-      "punpcklbh  %[src_lo],        %[src],           %[mask0]      \n\t"
-      "punpckhbh  %[src_hi],        %[src],           %[mask0]      \n\t"
-
-      "dsrl       %[xf],            %[x],             %[shift9]     \n\t"
-      "andi       %[xf],            %[xf],            0x7f          \n\t"
-      "xori       %[nxf],           %[xf],            0x7f          \n\t"
-      "dmtc1      %[xf],            %[fxf]                          \n\t"
-      "pshufh     %[fxf],           %[fxf],           %[mask0]      \n\t"
-      "dmtc1      %[nxf],           %[fnxf]                         \n\t"
-      "pshufh     %[fnxf],          %[fnxf],          %[mask0]      \n\t"
-
-      "pmullh     %[src_lo],        %[src_lo],        %[fnxf]       \n\t"
-      "pmullh     %[src_hi],        %[src_hi],        %[fxf]        \n\t"
-      "paddh      %[dest],          %[src_lo],        %[src_hi]     \n\t"
-      "psrlh      %[dest],          %[dest],          %[shift7]     \n\t"
-      "packushb   %[dest],          %[dest],          %[mask0]      \n\t"
-
-      "dadd       %[x],             %[x],             %[dx]         \n\t"
-
-      "swc1       %[dest],          0x00(%[dst_ptr])                \n\t"
-
-      "daddiu     %[dst_ptr],       %[dst_ptr],       0x04          \n\t"
-      "daddi      %[width],         %[width],        -0x01          \n\t"
-      "bnez       %[width],         1b                              \n\t"
-      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
-        [src_lo] "=&f"(src_lo), [fxf] "=&f"(fxf), [fnxf] "=&f"(fnxf),
-        [xi] "=&r"(xi), [xf] "=&r"(xf), [nxf] "=&r"(nxf)
-      : [src_argb] "r"(src_argb), [src_ptr] "r"(src_ptr),
-        [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), [x] "r"(x),
-        [dx] "r"(dx), [mask0] "f"(mask0), [mask1] "f"(mask1),
-        [shift2] "r"(shift2), [shift7] "f"(shift7), [shift9] "r"(shift9),
-        [shift16] "r"(shift16)
-      : "memory");
-}
-
 // Divide num by div and return as 16.16 fixed point result.
 /* LibYUVBaseTest.TestFixedDiv */
 int FixedDiv_MIPS(int num, int div) {
@@ -1058,9 +1034,11 @@
 
   __asm__ volatile(
       "1:                                                           \n\t"
-      "ldc1       %[src0],          0x00(%[src1_ptr])               \n\t"
+      "gsldrc1    %[src0],          0x00(%[src1_ptr])               \n\t"
+      "gsldlc1    %[src0],          0x07(%[src1_ptr])               \n\t"
       "pmaddhw    %[dest04],        %[src0],          %[mask0]      \n\t"
-      "ldc1       %[src1],          0x00(%[src2_ptr])               \n\t"
+      "gsldrc1    %[src1],          0x00(%[src2_ptr])               \n\t"
+      "gsldlc1    %[src1],          0x07(%[src2_ptr])               \n\t"
       "pmaddhw    %[dest],          %[src1],          %[mask1]      \n\t"
       "paddw      %[dest04],        %[dest04],        %[dest]       \n\t"
       "paddw      %[dest04],        %[dest04],        %[ph]         \n\t"
@@ -1072,9 +1050,11 @@
       "paddw      %[dest15],        %[dest15],        %[ph]         \n\t"
       "psrlw      %[dest15],        %[dest15],        %[shift]      \n\t"
 
-      "ldc1       %[src0],          0x02(%[src1_ptr])               \n\t"
+      "gsldrc1    %[src0],          0x02(%[src1_ptr])               \n\t"
+      "gsldlc1    %[src0],          0x09(%[src1_ptr])               \n\t"
       "pmaddhw    %[dest26],        %[src0],          %[mask0]      \n\t"
-      "ldc1       %[src1],          0x02(%[src2_ptr])               \n\t"
+      "gsldrc1    %[src1],          0x02(%[src2_ptr])               \n\t"
+      "gsldlc1    %[src1],          0x09(%[src2_ptr])               \n\t"
       "pmaddhw    %[dest],          %[src1],          %[mask1]      \n\t"
       "paddw      %[dest26],        %[dest26],        %[dest]       \n\t"
       "paddw      %[dest26],        %[dest26],        %[ph]         \n\t"