// vpx_dsp/mips/sad_mmi.c - webm/libvpx - Git at Google

 /*
  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/asmdefs_mmi.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"

 // Asm fragment: accumulate the sum of absolute differences (SAD) of one
 // 64-byte row.
 // Bytes 0x00..0x3f at %[src] and %[ref] are handled in four 16-byte groups.
 // Each group is fetched as two 8-byte halves with gsldlc1/gsldrc1 pairs
 // (left/right partial loads, so the addresses need not be aligned), turned
 // into per-byte absolute differences with pasubub, reduced with biadd and
 // added to the running total in %[ftmp5].
 // Operands the enclosing asm statement must provide: %[src], %[ref] and
 // %[ftmp1]..%[ftmp5]. %[ftmp1]..%[ftmp4] are overwritten; the pointers are
 // not advanced here.
 #define SAD_SRC_REF_ABS_SUB_64                                      \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x27(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x20(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x2f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x28(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x27(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x20(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x2f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x28(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x37(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x30(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x3f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x38(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x37(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x30(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x3f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x38(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

 // Asm fragment: accumulate the SAD of one 32-byte row.
 // Bytes 0x00..0x1f at %[src] and %[ref] are handled in two 16-byte groups:
 // unaligned 8-byte loads (gsldlc1/gsldrc1), per-byte absolute difference
 // (pasubub), byte sum (biadd), then paddw into the running total %[ftmp5].
 // Overwrites %[ftmp1]..%[ftmp4]; does not advance %[src] or %[ref].
 #define SAD_SRC_REF_ABS_SUB_32                                      \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

 // Asm fragment: accumulate the SAD of one 16-byte row.
 // Loads bytes 0x00..0x0f of %[src] and %[ref] as two unaligned 8-byte halves,
 // takes the per-byte absolute difference (pasubub), sums the bytes of each
 // half (biadd) and adds both sums to the running total %[ftmp5].
 // Overwrites %[ftmp1]..%[ftmp4]; does not advance %[src] or %[ref].
 #define SAD_SRC_REF_ABS_SUB_16                                      \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
   "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

 // Asm fragment: accumulate the SAD of one 8-byte row.
 // Loads 8 bytes from %[src] and %[ref] (unaligned gsldlc1/gsldrc1 pairs),
 // takes the per-byte absolute difference and adds its byte sum to the
 // running total. Note that the accumulator here is %[ftmp3], not %[ftmp5] as
 // in the wider fragments; %[ftmp1] and %[ftmp2] are scratch.
 #define SAD_SRC_REF_ABS_SUB_8                                       \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"

 // Asm fragment: accumulate the SAD of one 4-byte row into %[ftmp3].
 // The two ABI variants differ only in how the 4 bytes are loaded: the o32
 // build goes through the integer scratch operand %[tmp0] (ulw + mtc1), the
 // other ABIs load straight into the FP register with gslwlc1/gslwrc1.
 // In both variants "mthc1 $0" zeroes the upper half of %[ftmp1] after
 // pasubub, so that biadd only sums the four valid byte differences.
 // %[ftmp1] and %[ftmp2] are scratch.
 #if _MIPS_SIM == _ABIO32
 #define SAD_SRC_REF_ABS_SUB_4                                       \
   "ulw        %[tmp0],    0x00(%[src])                        \n\t" \
   "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
   "ulw        %[tmp0],    0x00(%[ref])                        \n\t" \
   "mtc1       %[tmp0],    %[ftmp2]                            \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
   "mthc1      $0,         %[ftmp1]                            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
 #else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
 #define SAD_SRC_REF_ABS_SUB_4                                       \
   "gslwlc1    %[ftmp1],   0x03(%[src])                        \n\t" \
   "gslwrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gslwlc1    %[ftmp2],   0x03(%[ref])                        \n\t" \
   "gslwrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
   "mthc1      $0,         %[ftmp1]                            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
 #endif /* _MIPS_SIM == _ABIO32 */

 // Asm fragment: accumulate one 64-byte row of the "averaged prediction" SAD.
 // For each of four 16-byte groups (offsets 0x00..0x3f) it:
 //   1. loads %[second_pred] and %[ref] and combines them byte-wise with pavgb
 //      (result kept in %[ftmp3]/%[ftmp4]),
 //   2. loads the same bytes of %[src] and takes the per-byte absolute
 //      difference against that average (pasubub),
 //   3. sums the difference bytes (biadd) and adds them to %[ftmp5] (paddw).
 // All loads are unaligned 8-byte gsldlc1/gsldrc1 pairs.
 // Overwrites %[ftmp1]..%[ftmp4]; none of the three pointers is advanced here.
 #define SAD_SRC_AVGREF_ABS_SUB_64                                   \
   "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x17(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x10(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x1f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x18(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x27(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x20(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x2f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x28(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x27(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x20(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x2f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x28(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x27(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x20(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x2f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x28(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x37(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x30(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x3f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x38(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x37(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x30(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x3f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x38(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x37(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x30(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x3f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x38(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

 // Asm fragment: accumulate one 32-byte row of the "averaged prediction" SAD.
 // For each of two 16-byte groups (offsets 0x00..0x1f) it averages
 // %[second_pred] with %[ref] byte-wise (pavgb), takes the per-byte absolute
 // difference between %[src] and that average (pasubub), sums the difference
 // bytes (biadd) and adds them to the running total %[ftmp5].
 // Overwrites %[ftmp1]..%[ftmp4]; none of the three pointers is advanced here.
 #define SAD_SRC_AVGREF_ABS_SUB_32                                   \
   "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x17(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x10(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x1f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x18(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

 // Asm fragment: accumulate one 16-byte row of the "averaged prediction" SAD.
 // Averages bytes 0x00..0x0f of %[second_pred] and %[ref] (pavgb), takes the
 // per-byte absolute difference between %[src] and that average (pasubub),
 // sums the difference bytes of each 8-byte half (biadd) and adds both sums to
 // the running total %[ftmp5].
 // Overwrites %[ftmp1]..%[ftmp4]; none of the three pointers is advanced here.
 #define SAD_SRC_AVGREF_ABS_SUB_16                                   \
   "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
   "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
   "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
   "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
   "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

 // Asm fragment: accumulate one 8-byte row of the "averaged prediction" SAD.
 // Averages 8 bytes of %[second_pred] and %[ref] (pavgb, result in %[ftmp2]),
 // takes the per-byte absolute difference between %[src] and that average and
 // adds its byte sum to the running total. The accumulator here is %[ftmp3];
 // %[ftmp1] and %[ftmp2] are scratch. No pointer is advanced.
 #define SAD_SRC_AVGREF_ABS_SUB_8                                    \
   "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
   "gsldlc1    %[ftmp2],   0x07(%[ref])                        \n\t" \
   "gsldrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
   "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
   "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
   "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"

 // Asm fragment: accumulate one 4-byte row of the "averaged prediction" SAD
 // into %[ftmp3]. The comparison target is the byte-wise pavgb average of
 // %[second_pred] and %[ref]; %[src] is measured against it.
 // All three inputs are fetched with 4-byte loads, in the same way as
 // SAD_SRC_REF_ABS_SUB_4, so nothing beyond the 4-byte row is read from any
 // buffer. "mthc1 $0" clears the upper half of %[ftmp1] after pasubub so that
 // biadd only sums the four valid byte differences.
 // The o32 variant goes through the integer scratch operand %[tmp0]; the
 // other ABIs load directly into the FP registers.
 #if _MIPS_SIM == _ABIO32
 #define SAD_SRC_AVGREF_ABS_SUB_4                                    \
   "ulw        %[tmp0],    0x00(%[second_pred])                \n\t" \
   "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
   "ulw        %[tmp0],    0x00(%[ref])                        \n\t" \
   "mtc1       %[tmp0],    %[ftmp2]                            \n\t" \
   "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
   "ulw        %[tmp0],    0x00(%[src])                        \n\t" \
   "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
   "mthc1      $0,         %[ftmp1]                            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
 #else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
 #define SAD_SRC_AVGREF_ABS_SUB_4                                    \
   "gslwlc1    %[ftmp1],   0x03(%[second_pred])                \n\t" \
   "gslwrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
   "gslwlc1    %[ftmp2],   0x03(%[ref])                        \n\t" \
   "gslwrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
   "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
   "gslwlc1    %[ftmp1],   0x03(%[src])                        \n\t" \
   "gslwrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
   "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
   "mthc1      $0,         %[ftmp1]                            \n\t" \
   "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
 #endif /* _MIPS_SIM == _ABIO32 */

 // Generates vpx_sad<m>x<n>x<k>_mmi: fills sad_array[0..k-1] with the MxN SAD
 // of src against ref_array shifted by 0..k-1 bytes.
 // If call sites allow it, this could take a pointer to the reference pointer
 // instead, which would drop the pointer offset below and let it share an
 // implementation with the 4D macro that follows.
 #define sadMxNxK_mmi(m, n, k)                                                 \
   void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride,       \
                                     const uint8_t *ref_array, int ref_stride, \
                                     uint32_t *sad_array) {                    \
     int shift = 0;                                                            \
     while (shift < (k)) {                                                     \
       sad_array[shift] = vpx_sad##m##x##n##_mmi(                              \
           src, src_stride, ref_array + shift, ref_stride);                    \
       ++shift;                                                                \
     }                                                                         \
   }

 // Generates vpx_sad<m>x<n>x4d_mmi: fills sad_array[0..3] with the MxN SAD of
 // src against each of the four reference blocks in ref_array.
 // Apart from taking an array of const pointers, this looks equivalent to the
 // K-offset macro above with k == 4.
 #define sadMxNx4D_mmi(m, n)                                                  \
   void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride,         \
                                  const uint8_t *const ref_array[],           \
                                  int ref_stride, uint32_t *sad_array) {      \
     int which = 0;                                                           \
     while (which < 4) {                                                      \
       sad_array[which] = vpx_sad##m##x##n##_mmi(                             \
           src, src_stride, ref_array[which], ref_stride);                    \
       ++which;                                                               \
     }                                                                        \
   }

 // Returns the sum of absolute differences between a 64-byte-wide block of src
 // and the co-located block of ref.
 //
 //   src, ref                 first row of each block; the local copies are
 //                            advanced by the stride after every row
 //   src_stride, ref_stride   byte distance between consecutive rows
 //   counter                  number of rows. Must be a positive even number:
 //                            the loop consumes two rows per pass and only
 //                            exits when the count reaches exactly zero.
 static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // %[ftmp5] holds the running total; clear it before the loop.
     "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Two rows per pass, to halve the loop overhead.
     SAD_SRC_REF_ABS_SUB_64
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_REF_ABS_SUB_64
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp5]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The template dereferences src and ref, which the compiler only sees as
     // register values, so the memory dependency has to be declared.
     : "memory"
   );

   return sad;
 }

 // Generates the public entry point vpx_sad64x<H>_mmi, which runs the shared
 // 64-wide SAD kernel over H rows.
 #define vpx_sad64xN(H)                                                   \
   unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride,   \
                                    const uint8_t *ref, int ref_stride) { \
     const unsigned int block_sad =                                       \
         vpx_sad64x(src, src_stride, ref, ref_stride, H);                 \
     return block_sad;                                                    \
   }

 // Instantiate the 64-wide SAD entry points and their 4-reference variants.
 // Each invocation expands to a complete function definition, so it is not
 // followed by ';' (a lone ';' at file scope is not valid ISO C and warns
 // under -Wpedantic).
 /* clang-format off */
 vpx_sad64xN(64)
 vpx_sad64xN(32)
 sadMxNx4D_mmi(64, 64)
 sadMxNx4D_mmi(64, 32)
 /* clang-format on */

 // Returns the sum of absolute differences between a 64-byte-wide block of src
 // and the byte-wise average (pavgb) of ref and second_pred.
 //
 //   src, ref                 first row of each block; the local copies are
 //                            advanced by the stride after every row
 //   src_stride, ref_stride   byte distance between consecutive rows
 //   second_pred              second prediction block; advanced by 0x40 bytes
 //                            per row, i.e. read as contiguous 64-byte rows
 //   counter                  number of rows. Must be a positive even number:
 //                            the loop consumes two rows per pass and only
 //                            exits when the count reaches exactly zero.
 static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           const uint8_t *second_pred,
                                           int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // %[ftmp5] holds the running total; clear it before the loop.
     "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Two rows per pass, to halve the loop overhead.
     SAD_SRC_AVGREF_ABS_SUB_64
     MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_AVGREF_ABS_SUB_64
     MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp5]                            \n\t"
     // second_pred is passed as a plain lvalue, like src and ref: an output
     // operand must be an lvalue, which a cast expression is not.
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
       [second_pred]"+&r"(second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The template dereferences src, ref and second_pred, which the compiler
     // only sees as register values, so the memory dependency is declared.
     : "memory"
   );

   return sad;
 }

 // Generates the public entry point vpx_sad64x<H>_avg_mmi, which runs the
 // shared 64-wide averaged-prediction SAD kernel over H rows.
 #define vpx_sad_avg64xN(H)                                                 \
   unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        const uint8_t *second_pred) {       \
     const unsigned int block_sad = vpx_sad_avg64x(                         \
         src, src_stride, ref, ref_stride, second_pred, H);                 \
     return block_sad;                                                      \
   }

 // Instantiate the 64-wide averaged-prediction SAD entry points.
 // Each invocation expands to a complete function definition, so it is not
 // followed by ';' (a lone ';' at file scope is not valid ISO C and warns
 // under -Wpedantic).
 /* clang-format off */
 vpx_sad_avg64xN(64)
 vpx_sad_avg64xN(32)
 /* clang-format on */

 // Returns the sum of absolute differences between a 32-byte-wide block of src
 // and the co-located block of ref.
 //
 //   src, ref                 first row of each block; the local copies are
 //                            advanced by the stride after every row
 //   src_stride, ref_stride   byte distance between consecutive rows
 //   counter                  number of rows. Must be a positive even number:
 //                            the loop consumes two rows per pass and only
 //                            exits when the count reaches exactly zero.
 static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // %[ftmp5] holds the running total; clear it before the loop.
     "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Two rows per pass, to halve the loop overhead.
     SAD_SRC_REF_ABS_SUB_32
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_REF_ABS_SUB_32
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp5]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The template dereferences src and ref, which the compiler only sees as
     // register values, so the memory dependency has to be declared.
     : "memory"
   );

   return sad;
 }

 // Generates the public entry point vpx_sad32x<H>_mmi, which runs the shared
 // 32-wide SAD kernel over H rows.
 #define vpx_sad32xN(H)                                                   \
   unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride,   \
                                    const uint8_t *ref, int ref_stride) { \
     const unsigned int block_sad =                                       \
         vpx_sad32x(src, src_stride, ref, ref_stride, H);                 \
     return block_sad;                                                    \
   }

 // Instantiate the 32-wide SAD entry points and their 4-reference variants.
 // Each invocation expands to a complete function definition, so it is not
 // followed by ';' (a lone ';' at file scope is not valid ISO C and warns
 // under -Wpedantic).
 /* clang-format off */
 vpx_sad32xN(64)
 vpx_sad32xN(32)
 vpx_sad32xN(16)
 sadMxNx4D_mmi(32, 64)
 sadMxNx4D_mmi(32, 32)
 sadMxNx4D_mmi(32, 16)
 /* clang-format on */

 // Returns the sum of absolute differences between a 32-byte-wide block of src
 // and the byte-wise average (pavgb) of ref and second_pred.
 //
 //   src, ref                 first row of each block; the local copies are
 //                            advanced by the stride after every row
 //   src_stride, ref_stride   byte distance between consecutive rows
 //   second_pred              second prediction block; advanced by 0x20 bytes
 //                            per row, i.e. read as contiguous 32-byte rows
 //   counter                  number of rows. Must be a positive even number:
 //                            the loop consumes two rows per pass and only
 //                            exits when the count reaches exactly zero.
 static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           const uint8_t *second_pred,
                                           int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // %[ftmp5] holds the running total; clear it before the loop.
     "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Two rows per pass, to halve the loop overhead.
     SAD_SRC_AVGREF_ABS_SUB_32
     MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_AVGREF_ABS_SUB_32
     MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp5]                            \n\t"
     // second_pred is passed as a plain lvalue, like src and ref: an output
     // operand must be an lvalue, which a cast expression is not.
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
       [second_pred]"+&r"(second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The template dereferences src, ref and second_pred, which the compiler
     // only sees as register values, so the memory dependency is declared.
     : "memory"
   );

   return sad;
 }

 // Generates the public entry point vpx_sad32x<H>_avg_mmi, which runs the
 // shared 32-wide averaged-prediction SAD kernel over H rows.
 #define vpx_sad_avg32xN(H)                                                 \
   unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        const uint8_t *second_pred) {       \
     const unsigned int block_sad = vpx_sad_avg32x(                         \
         src, src_stride, ref, ref_stride, second_pred, H);                 \
     return block_sad;                                                      \
   }

 // Instantiate the 32-wide averaged-prediction SAD entry points.
 // Each invocation expands to a complete function definition, so it is not
 // followed by ';' (a lone ';' at file scope is not valid ISO C and warns
 // under -Wpedantic).
 /* clang-format off */
 vpx_sad_avg32xN(64)
 vpx_sad_avg32xN(32)
 vpx_sad_avg32xN(16)
 /* clang-format on */

 // Returns the sum of absolute differences between a 16-byte-wide block of src
 // and the co-located block of ref.
 //
 //   src, ref                 first row of each block; the local copies are
 //                            advanced by the stride after every row
 //   src_stride, ref_stride   byte distance between consecutive rows
 //   counter                  number of rows. Must be a positive even number:
 //                            the loop consumes two rows per pass and only
 //                            exits when the count reaches exactly zero.
 static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // %[ftmp5] holds the running total; clear it before the loop.
     "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Two rows per pass, to halve the loop overhead.
     SAD_SRC_REF_ABS_SUB_16
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_REF_ABS_SUB_16
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp5]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The template dereferences src and ref, which the compiler only sees as
     // register values, so the memory dependency has to be declared.
     : "memory"
   );

   return sad;
 }

 // Defines the non-static entry point vpx_sad16x<rows>_mmi(), a thin wrapper
 // that forwards its arguments to vpx_sad16x() with the row count fixed to
 // `rows`.
 #define vpx_sad16xN(rows)                                             \
   unsigned int vpx_sad16x##rows##_mmi(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref,         \
       int ref_stride) {                                               \
     return vpx_sad16x(src, src_stride, ref, ref_stride, rows);        \
   }

 // Instantiate vpx_sad16x32_mmi, vpx_sad16x16_mmi and vpx_sad16x8_mmi.
 vpx_sad16xN(32);
 vpx_sad16xN(16);
 vpx_sad16xN(8);
 // sadMxNxK_mmi and sadMxNx4D_mmi are defined outside this part of the file;
 // presumably they generate the multi-reference (x3/x8 and x4d) variants for
 // the given block sizes -- confirm against their definitions.
 sadMxNxK_mmi(16, 16, 3);
 sadMxNxK_mmi(16, 16, 8);
 sadMxNxK_mmi(16, 8, 3);
 sadMxNxK_mmi(16, 8, 8);
 sadMxNx4D_mmi(16, 32);
 sadMxNx4D_mmi(16, 16);
 sadMxNx4D_mmi(16, 8);

 static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           const uint8_t *second_pred,
                                           int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;

   __asm__ volatile (
     "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_16
     MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_AVGREF_ABS_SUB_16
     MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp5]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
       [second_pred]"+&r"((mips_reg)second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );

   return sad;
 }

 // Defines the non-static entry point vpx_sad16x<rows>_avg_mmi(), a thin
 // wrapper that forwards its arguments to vpx_sad_avg16x() with the row count
 // fixed to `rows`.
 #define vpx_sad_avg16xN(rows)                                              \
   unsigned int vpx_sad16x##rows##_avg_mmi(                                 \
       const uint8_t *src, int src_stride, const uint8_t *ref,              \
       int ref_stride, const uint8_t *second_pred) {                        \
     return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred,   \
                           rows);                                           \
   }

 // Instantiate vpx_sad16x32_avg_mmi, vpx_sad16x16_avg_mmi and
 // vpx_sad16x8_avg_mmi.
 vpx_sad_avg16xN(32);
 vpx_sad_avg16xN(16);
 vpx_sad_avg16xN(8);

 // Accumulates the sum of absolute differences between `src` and `ref` over
 // `counter` rows, using SAD_SRC_REF_ABS_SUB_8 (defined earlier in this file)
 // for the per-row work.
 //
 // src/ref are advanced by src_stride/ref_stride bytes after every row.
 // `counter` must be a positive even number: it is decremented by 2 per loop
 // iteration and the loop only terminates when it reaches exactly zero.
 //
 // Returns the low 32 bits of the accumulator register ftmp3.
 static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // Clear the accumulator.
     "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Two rows per iteration (unrolled by two) to reduce loop overhead.
     SAD_SRC_REF_ABS_SUB_8
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_REF_ABS_SUB_8
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     // Move the accumulated sum to the integer result register.
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The pixel data is reached only through pointer values held in
     // registers, so the compiler must be told that memory is accessed.
     : "memory"
   );

   return sad;
 }

 // Defines the non-static entry point vpx_sad8x<rows>_mmi(), a thin wrapper
 // that forwards its arguments to vpx_sad8x() with the row count fixed to
 // `rows`.
 #define vpx_sad8xN(rows)                                              \
   unsigned int vpx_sad8x##rows##_mmi(                                 \
       const uint8_t *src, int src_stride, const uint8_t *ref,         \
       int ref_stride) {                                               \
     return vpx_sad8x(src, src_stride, ref, ref_stride, rows);         \
   }

 // Instantiate vpx_sad8x16_mmi, vpx_sad8x8_mmi and vpx_sad8x4_mmi.
 vpx_sad8xN(16);
 vpx_sad8xN(8);
 vpx_sad8xN(4);
 // sadMxNxK_mmi and sadMxNx4D_mmi are defined outside this part of the file;
 // presumably they generate the multi-reference (x3/x8 and x4d) variants for
 // the given block sizes -- confirm against their definitions.
 sadMxNxK_mmi(8, 16, 3);
 sadMxNxK_mmi(8, 16, 8);
 sadMxNxK_mmi(8, 8, 3);
 sadMxNxK_mmi(8, 8, 8);
 sadMxNx4D_mmi(8, 16);
 sadMxNx4D_mmi(8, 8);
 sadMxNx4D_mmi(8, 4);

 static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred,
                                          int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;

   __asm__ volatile (
     "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_8
     MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_AVGREF_ABS_SUB_8
     MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
       [second_pred]"+&r"((mips_reg)second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );

   return sad;
 }

 // Defines the non-static entry point vpx_sad8x<rows>_avg_mmi(), a thin
 // wrapper that forwards its arguments to vpx_sad_avg8x() with the row count
 // fixed to `rows`.
 #define vpx_sad_avg8xN(rows)                                               \
   unsigned int vpx_sad8x##rows##_avg_mmi(                                  \
       const uint8_t *src, int src_stride, const uint8_t *ref,              \
       int ref_stride, const uint8_t *second_pred) {                        \
     return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred,    \
                          rows);                                            \
   }

 // Instantiate vpx_sad8x16_avg_mmi, vpx_sad8x8_avg_mmi and
 // vpx_sad8x4_avg_mmi.
 vpx_sad_avg8xN(16);
 vpx_sad_avg8xN(8);
 vpx_sad_avg8xN(4);

 // Accumulates the sum of absolute differences between `src` and `ref` over
 // `counter` rows, using SAD_SRC_REF_ABS_SUB_4 (defined earlier in this file)
 // for the per-row work.
 //
 // src/ref are advanced by src_stride/ref_stride bytes after every row.
 // `counter` must be a positive even number: it is decremented by 2 per loop
 // iteration and the loop only terminates when it reaches exactly zero.
 //
 // Returns the low 32 bits of the accumulator register ftmp3.
 static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;

   __asm__ volatile (
     // Clear the accumulator.
     "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Two rows per iteration (unrolled by two) to reduce loop overhead.
     SAD_SRC_REF_ABS_SUB_4
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_REF_ABS_SUB_4
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     // Move the accumulated sum to the integer result register.
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
     // The pixel data is reached only through pointer values held in
     // registers, so the compiler must be told that memory is accessed.
     : "memory"
   );

   return sad;
 }

 // Defines the non-static entry point vpx_sad4x<rows>_mmi(), a thin wrapper
 // that forwards its arguments to vpx_sad4x() with the row count fixed to
 // `rows`.
 #define vpx_sad4xN(rows)                                              \
   unsigned int vpx_sad4x##rows##_mmi(                                 \
       const uint8_t *src, int src_stride, const uint8_t *ref,         \
       int ref_stride) {                                               \
     return vpx_sad4x(src, src_stride, ref, ref_stride, rows);         \
   }

 // Instantiate vpx_sad4x8_mmi and vpx_sad4x4_mmi.
 vpx_sad4xN(8);
 vpx_sad4xN(4);
 // sadMxNxK_mmi and sadMxNx4D_mmi are defined outside this part of the file;
 // presumably they generate the multi-reference (x3/x8 and x4d) variants for
 // the given block sizes -- confirm against their definitions.
 sadMxNxK_mmi(4, 4, 3);
 sadMxNxK_mmi(4, 4, 8);
 sadMxNx4D_mmi(4, 8);
 sadMxNx4D_mmi(4, 4);

 static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred,
                                          int counter) {
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;

   __asm__ volatile (
     "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_4
     MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     SAD_SRC_AVGREF_ABS_SUB_4
     MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
     MMI_ADDU(%[src],     %[src],         %[src_stride])
     MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
     MMI_ADDIU(%[counter], %[counter], -0x02)
     "bnez       %[counter], 1b                                  \n\t"
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
       [second_pred]"+&r"((mips_reg)second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );

   return sad;
 }

 // Defines the non-static entry point vpx_sad4x<rows>_avg_mmi(), a thin
 // wrapper that forwards its arguments to vpx_sad_avg4x() with the row count
 // fixed to `rows`.
 #define vpx_sad_avg4xN(rows)                                               \
   unsigned int vpx_sad4x##rows##_avg_mmi(                                  \
       const uint8_t *src, int src_stride, const uint8_t *ref,              \
       int ref_stride, const uint8_t *second_pred) {                        \
     return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred,    \
                          rows);                                            \
   }

 // Instantiate vpx_sad4x8_avg_mmi and vpx_sad4x4_avg_mmi.
 vpx_sad_avg4xN(8);
 vpx_sad_avg4xN(4);