libyuv:loongson Correct the MMI optimizations on the Loongson 3A platform.
When loading or storing data, unaligned addresses greatly degrade the
performance of the optimized code, so unaligned-access instructions are
required on the Loongson platform.
Also delete the optimized function ScaleARGBFilterCols_MMI,
because it degraded performance.
BUG=libyuv:804
R=fbarchard@chromium.org
Change-Id: If4c15886a21cdcbac7ae8b336292e4549acf1e47
Reviewed-on: https://chromium-review.googlesource.com/1164627
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 3042136..282d521 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -115,7 +115,6 @@
#define HAS_FIXEDDIV_MIPS
#define HAS_SCALEARGBCOLS_MMI
#define HAS_SCALEARGBCOLSUP2_MMI
-#define HAS_SCALEARGBFILTERCOLS_MMI
#define HAS_SCALEARGBROWDOWN2_MMI
#define HAS_SCALEARGBROWDOWNEVEN_MMI
#define HAS_SCALEROWDOWN2_MMI
@@ -592,21 +591,11 @@
int dst_width,
int x,
int dx);
-void ScaleARGBFilterCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx);
void ScaleARGBCols_MMI(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBFilterCols_Any_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx);
void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
diff --git a/source/row_mmi.cc b/source/row_mmi.cc
index dab8010..3649952 100644
--- a/source/row_mmi.cc
+++ b/source/row_mmi.cc
@@ -7,10 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <sys/time.h>
#include "libyuv/row.h"
-#include <stdio.h>
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
@@ -4492,7 +4490,8 @@
"psubh %[y00], %[y10], %[y20] \n\t"
"packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "sdc1 %[sobel], 0(%[dst_sobelx]) \n\t"
+ "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
"daddiu %[src_y0], %[src_y0], 8 \n\t"
"daddiu %[src_y1], %[src_y1], 8 \n\t"
@@ -4587,7 +4586,8 @@
"psubh %[y00], %[y02], %[y12] \n\t"
"packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "sdc1 %[sobel], 0(%[dst_sobely]) \n\t"
+ "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
"daddiu %[src_y0], %[src_y0], 8 \n\t"
"daddiu %[src_y1], %[src_y1], 8 \n\t"
@@ -4624,13 +4624,15 @@
"punpcklbh %[t1], %[t0], %[t0] \n\t"
"or %[t1], %[t1], %[c1] \n\t"
// 255 s1 s1 s1 s55 s0 s0 s0
- "sdc1 %[t1], 0x00(%[dst_argb]) \n\t"
+ "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
// s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
"punpckhbh %[t1], %[t0], %[t0] \n\t"
"or %[t1], %[t1], %[c1] \n\t"
// 255 s3 s3 s3 255 s2 s2 s2
- "sdc1 %[t1], 0x08(%[dst_argb]) \n\t"
+ "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
// s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
"punpckhbh %[t0], %[t2], %[t2] \n\t"
@@ -4638,12 +4640,14 @@
// s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
"punpcklbh %[t1], %[t0], %[t0] \n\t"
"or %[t1], %[t1], %[c1] \n\t"
- "sdc1 %[t1], 0x10(%[dst_argb]) \n\t"
+ "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
// s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
"punpckhbh %[t1], %[t0], %[t0] \n\t"
"or %[t1], %[t1], %[c1] \n\t"
- "sdc1 %[t1], 0x18(%[dst_argb]) \n\t"
+ "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
"daddiu %[dst_argb], %[dst_argb], 32 \n\t"
"daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
@@ -4665,10 +4669,13 @@
uint64_t tb = 0;
__asm__ volatile(
"1: \n\t"
- "ldc1 %[tr], 0x0(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "ldc1 %[tb], 0x0(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
+ "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
"paddusb %[tr], %[tr], %[tb] \n\t" // g
- "sdc1 %[tr], 0x0(%[dst_y]) \n\t"
+ "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
"daddiu %[dst_y], %[dst_y], 8 \n\t"
"daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
@@ -4705,10 +4712,12 @@
"punpcklbh %[cr], %[tr], %[c1] \n\t"
// c1 r1 g1 b1 c0 r0 g0 b0
"punpcklhw %[result], %[gb], %[cr] \n\t"
- "sdc1 %[result], 0x00(%[dst_argb]) \n\t"
+ "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
// c3 r3 g3 b3 c2 r2 g2 b2
"punpckhhw %[result], %[gb], %[cr] \n\t"
- "sdc1 %[result], 0x08(%[dst_argb]) \n\t"
+ "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
// g7 b7 g6 b6 g5 b5 g4 b4
"punpckhbh %[gb], %[tb], %[tg] \n\t"
@@ -4716,10 +4725,12 @@
"punpckhbh %[cr], %[tr], %[c1] \n\t"
// c5 r5 g5 b5 c4 r4 g4 b4
"punpcklhw %[result], %[gb], %[cr] \n\t"
- "sdc1 %[result], 0x10(%[dst_argb]) \n\t"
+ "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
// c7 r7 g7 b7 c6 r6 g6 b6
"punpckhhw %[result], %[gb], %[cr] \n\t"
- "sdc1 %[result], 0x18(%[dst_argb]) \n\t"
+ "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
"daddiu %[dst_argb], %[dst_argb], 32 \n\t"
"daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
@@ -4748,12 +4759,14 @@
"punpcklhw %[dest], %[src], %[src] \n\t"
"and %[dest], %[dest], %[mask0] \n\t"
"or %[dest], %[dest], %[mask1] \n\t"
- "sdc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
"punpckhhw %[dest], %[src], %[src] \n\t"
"and %[dest], %[dest], %[mask0] \n\t"
"or %[dest], %[dest], %[mask1] \n\t"
- "sdc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
"daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
"daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
@@ -4955,7 +4968,8 @@
"gsldlc1 %[temp], 3(%[src]) \n\t"
"gsldrc1 %[temp], -4(%[src]) \n\t"
"pshufh %[temp], %[temp], %[shuff] \n\t"
- "sdc1 %[temp], 0x0(%[dst]) \n\t"
+ "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
+ "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
"daddiu %[src], %[src], -0x08 \n\t"
"daddiu %[dst], %[dst], 0x08 \n\t"
@@ -4975,18 +4989,22 @@
uint64_t shift = 0x08;
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_uv]) \n\t"
- "ldc1 %[t1], 0x08(%[src_uv]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
"and %[t2], %[t0], %[c0] \n\t"
"and %[t3], %[t1], %[c0] \n\t"
"packushb %[t2], %[t2], %[t3] \n\t"
- "sdc1 %[t2], 0x0(%[dst_u]) \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
"psrlh %[t2], %[t0], %[shift] \n\t"
"psrlh %[t3], %[t1], %[shift] \n\t"
"packushb %[t2], %[t2], %[t3] \n\t"
- "sdc1 %[t2], 0x0(%[dst_v]) \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
"daddiu %[src_uv], %[src_uv], 16 \n\t"
"daddiu %[dst_u], %[dst_u], 8 \n\t"
@@ -5008,12 +5026,16 @@
uint64_t temp[3];
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x0(%[src_u]) \n\t"
- "ldc1 %[t1], 0x0(%[src_v]) \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
+ "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
+ "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
"punpcklbh %[t2], %[t0], %[t1] \n\t"
- "sdc1 %[t2], 0x0(%[dst_uv]) \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
"punpckhbh %[t2], %[t0], %[t1] \n\t"
- "sdc1 %[t2], 0x8(%[dst_uv]) \n\t"
+ "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
"daddiu %[src_u], %[src_u], 8 \n\t"
"daddiu %[src_v], %[src_v], 8 \n\t"
@@ -5149,13 +5171,17 @@
uint64_t src_stride = 0x0;
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
"daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
- "ldc1 %[t1], 0x00(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
- "ldc1 %[t2], 0x08(%[src_yuy2]) \n\t"
- "ldc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
"pavgb %[t1], %[t2], %[t1] \n\t"
"and %[t0], %[t0], %[c0] \n\t"
@@ -5167,12 +5193,16 @@
"and %[d0], %[t0], %[c1] \n\t"
"psrlh %[d1], %[t1], %[shift] \n\t"
- "ldc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "ldc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
"pavgb %[t0], %[t0], %[t1] \n\t"
- "ldc1 %[t2], 0x18(%[src_yuy2]) \n\t"
- "ldc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
"pavgb %[t1], %[t2], %[t1] \n\t"
"and %[t0], %[t0], %[c0] \n\t"
@@ -5186,8 +5216,10 @@
"packushb %[d0], %[d0], %[d2] \n\t"
"packushb %[d1], %[d1], %[d3] \n\t"
- "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
- "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
"daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
"daddiu %[dst_u], %[dst_u], 8 \n\t"
"daddiu %[dst_v], %[dst_v], 8 \n\t"
@@ -5215,8 +5247,10 @@
uint64_t shift = 0x08;
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "ldc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
"and %[t0], %[t0], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"psrlh %[t0], %[t0], %[shift] \n\t"
@@ -5226,8 +5260,10 @@
"and %[d0], %[t0], %[c1] \n\t"
"psrlh %[d1], %[t1], %[shift] \n\t"
- "ldc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "ldc1 %[t1], 0x18(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
"and %[t0], %[t0], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"psrlh %[t0], %[t0], %[shift] \n\t"
@@ -5239,8 +5275,10 @@
"packushb %[d0], %[d0], %[d2] \n\t"
"packushb %[d1], %[d1], %[d3] \n\t"
- "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
- "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
"daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
"daddiu %[dst_u], %[dst_u], 8 \n\t"
"daddiu %[dst_v], %[dst_v], 8 \n\t"
@@ -5256,17 +5294,19 @@
// Copy row of YUY2 Y's (422) into Y (420/422).
void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- // Output a row of UV values, filtering 2 rows of YUY2.
uint64_t c0 = 0x00ff00ff00ff00ff;
uint64_t temp[2];
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "ldc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
"and %[t0], %[t0], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"packushb %[t0], %[t0], %[t1] \n\t"
- "sdc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
"daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
"daddiu %[dst_y], %[dst_y], 8 \n\t"
"daddiu %[width], %[width], -8 \n\t"
@@ -5292,13 +5332,17 @@
uint64_t src_stride = 0x0;
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
"daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
- "ldc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
"pavgb %[t0], %[t0], %[t1] \n\t"
- "ldc1 %[t2], 0x08(%[src_uyvy]) \n\t"
- "ldc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
"pavgb %[t1], %[t2], %[t1] \n\t"
"and %[t0], %[t0], %[c0] \n\t"
@@ -5308,12 +5352,16 @@
"and %[d0], %[t0], %[c0] \n\t"
"psrlh %[d1], %[t1], %[shift] \n\t"
- "ldc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "ldc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
"pavgb %[t0], %[t0], %[t1] \n\t"
- "ldc1 %[t2], 0x18(%[src_uyvy]) \n\t"
- "ldc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
"pavgb %[t1], %[t2], %[t1] \n\t"
"and %[t0], %[t0], %[c0] \n\t"
@@ -5325,8 +5373,10 @@
"packushb %[d0], %[d0], %[d2] \n\t"
"packushb %[d1], %[d1], %[d3] \n\t"
- "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
- "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
"daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
"daddiu %[dst_u], %[dst_u], 8 \n\t"
"daddiu %[dst_v], %[dst_v], 8 \n\t"
@@ -5354,8 +5404,10 @@
uint64_t shift = 0x08;
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "ldc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
"and %[t0], %[t0], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"packushb %[t0], %[t0], %[t1] \n\t"
@@ -5363,8 +5415,10 @@
"and %[d0], %[t0], %[c0] \n\t"
"psrlh %[d1], %[t1], %[shift] \n\t"
- "ldc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "ldc1 %[t1], 0x18(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
"and %[t0], %[t0], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"packushb %[t0], %[t0], %[t1] \n\t"
@@ -5374,8 +5428,10 @@
"packushb %[d0], %[d0], %[d2] \n\t"
"packushb %[d1], %[d1], %[d3] \n\t"
- "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
- "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
"daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
"daddiu %[dst_u], %[dst_u], 8 \n\t"
"daddiu %[dst_v], %[dst_v], 8 \n\t"
@@ -5397,15 +5453,18 @@
uint64_t temp[2];
__asm__ volatile(
"1: \n\t"
- "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "ldc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
"dsrl %[t0], %[t0], %[shift] \n\t"
"dsrl %[t1], %[t1], %[shift] \n\t"
"and %[t0], %[t0], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"and %[t1], %[t1], %[c0] \n\t"
"packushb %[t0], %[t0], %[t1] \n\t"
- "sdc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
"daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
"daddiu %[dst_y], %[dst_y], 8 \n\t"
"daddiu %[width], %[width], -8 \n\t"
@@ -5670,19 +5729,22 @@
uint64_t uv = 0x0;
uint64_t uv_stride = 0x0;
__asm__ volatile(
- "1: \n\t"
- "ldc1 %[uv], 0x0(%[src_ptr]) \n\t"
- "daddu $t0, %[src_ptr], %[stride] \n\t"
- "ldc1 %[uv_stride], 0x0($t0) \n\t"
+ "1: \n\t"
+ "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
+ "daddu $t0, %[src_ptr], %[stride] \n\t"
+ "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
+ "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
- "pavgb %[uv], %[uv], %[uv_stride] \n\t"
- "sdc1 %[uv], 0x0(%[dst_ptr]) \n\t"
+ "pavgb %[uv], %[uv], %[uv_stride] \n\t"
+ "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
: [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
: [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
[stride] "r"((int64_t)src_stride)
@@ -5700,10 +5762,12 @@
"pshufh %[fy1], %[fy1], %[zero] \n\t"
"psubh %[fy0], %[fy0], %[fy1] \n\t"
"1: \n\t"
- "ldc1 %[t0], 0x0(%[src_ptr]) \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
"punpcklbh %[d0], %[t0], %[zero] \n\t"
"punpckhbh %[d1], %[t0], %[zero] \n\t"
- "ldc1 %[t0], 0x0(%[src_ptr1]) \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
"punpcklbh %[d2], %[t0], %[zero] \n\t"
"punpckhbh %[d3], %[t0], %[zero] \n\t"
@@ -5720,7 +5784,8 @@
"psrlh %[d1], %[d1], %[shift] \n\t"
"packushb %[d0], %[d0], %[d1] \n\t"
- "sdc1 %[d0], 0x0(%[dst_ptr]) \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
"daddiu %[src_ptr], %[src_ptr], 8 \n\t"
"daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
"daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
diff --git a/source/scale_mmi.cc b/source/scale_mmi.cc
index 4757d89..e12c6bb 100644
--- a/source/scale_mmi.cc
+++ b/source/scale_mmi.cc
@@ -38,10 +38,12 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
"psrlh %[src0], %[src0], %[shift] \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"psrlh %[src1], %[src1], %[shift] \n\t"
"packushb %[dest], %[src0], %[src1] \n\t"
@@ -72,9 +74,11 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
"and %[dest0], %[src0], %[mask] \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"and %[dest1], %[src1], %[mask] \n\t"
"packushb %[dest0], %[dest0], %[dest1] \n\t"
@@ -114,11 +118,13 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
"psrlh %[s1], %[s0], %[shift1] \n\t"
"and %[s0], %[s0], %[mask] \n\t"
- "ldc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
"psrlh %[t1], %[t0], %[shift1] \n\t"
"and %[t0], %[t0], %[mask] \n\t"
@@ -128,11 +134,13 @@
"paddh %[dest0], %[dest0], %[ph] \n\t"
"psrlh %[dest0], %[dest0], %[shift0] \n\t"
- "ldc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
"psrlh %[s1], %[s0], %[shift1] \n\t"
"and %[s0], %[s0], %[mask] \n\t"
- "ldc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
"psrlh %[t1], %[t0], %[shift1] \n\t"
"and %[t0], %[t0], %[mask] \n\t"
@@ -172,8 +180,10 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"punpckhwd %[dest], %[src0], %[src1] \n\t"
"gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
@@ -237,12 +247,14 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
"punpcklbh %[s_lo], %[s0], %[mask] \n\t"
"punpckhbh %[s_hi], %[s0], %[mask] \n\t"
"paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
- "ldc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
"punpcklbh %[t_lo], %[t0], %[mask] \n\t"
"punpckhbh %[t_hi], %[t0], %[mask] \n\t"
"paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
@@ -251,12 +263,14 @@
"paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
"psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t"
- "ldc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
"punpcklbh %[s_lo], %[s0], %[mask] \n\t"
"punpckhbh %[s_hi], %[s0], %[mask] \n\t"
"paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
- "ldc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
"punpcklbh %[t_lo], %[t0], %[mask] \n\t"
"punpckhbh %[t_hi], %[t0], %[mask] \n\t"
"paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
@@ -293,10 +307,12 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
"psrlw %[src0], %[src0], %[shift] \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"psrlw %[src1], %[src1], %[shift] \n\t"
"packsswh %[dest], %[src0], %[src1] \n\t"
@@ -324,8 +340,10 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
"punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
@@ -364,11 +382,13 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
"psrlw %[s1], %[s0], %[shift0] \n\t"
"and %[s0], %[s0], %[mask] \n\t"
- "ldc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
"psrlw %[t1], %[t0], %[shift0] \n\t"
"and %[t0], %[t0], %[mask] \n\t"
@@ -378,11 +398,13 @@
"paddw %[dest0], %[dest0], %[ph] \n\t"
"psrlw %[dest0], %[dest0], %[shift1] \n\t"
- "ldc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
"psrlw %[s1], %[s0], %[shift0] \n\t"
"and %[s0], %[s0], %[mask] \n\t"
- "ldc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
"psrlw %[t1], %[t0], %[shift0] \n\t"
"and %[t0], %[t0], %[mask] \n\t"
@@ -425,18 +447,22 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
"psrlw %[src0], %[src0], %[shift] \n\t"
"and %[src0], %[src0], %[mask] \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"psrlw %[src1], %[src1], %[shift] \n\t"
"and %[src1], %[src1], %[mask] \n\t"
"packsswh %[dest_lo], %[src0], %[src1] \n\t"
- "ldc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
"psrlw %[src0], %[src0], %[shift] \n\t"
"and %[src0], %[src0], %[mask] \n\t"
- "ldc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
"psrlw %[src1], %[src1], %[shift] \n\t"
"and %[src1], %[src1], %[mask] \n\t"
"packsswh %[dest_hi], %[src0], %[src1] \n\t"
@@ -469,13 +495,17 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "ldc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
"punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
"punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
- "ldc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "ldc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
"punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
"punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
@@ -691,7 +721,8 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
"punpcklhw %[dest], %[src], %[src] \n\t"
"gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
@@ -721,9 +752,11 @@
"punpcklbh %[src_lo], %[src], %[mask] \n\t"
"punpckhbh %[src_hi], %[src], %[mask] \n\t"
- "ldc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
"paddush %[dest0], %[dest0], %[src_lo] \n\t"
- "ldc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
"paddush %[dest1], %[dest1], %[src_hi] \n\t"
"gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
@@ -750,16 +783,19 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
"punpcklhw %[src_lo], %[src], %[mask] \n\t"
"punpckhhw %[src_hi], %[src], %[mask] \n\t"
- "ldc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
"paddw %[dest0], %[dest0], %[src_lo] \n\t"
"gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "ldc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
"paddw %[dest1], %[dest1], %[src_hi] \n\t"
"gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
"gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
@@ -922,7 +958,8 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
"punpcklwd %[dest0], %[src], %[src] \n\t"
"gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
@@ -939,67 +976,6 @@
: "memory");
}
-void ScaleARGBFilterCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- uint64_t dest, src, src_hi, src_lo;
- int xi, xf, nxf;
- int64_t fxf, fnxf;
-
- const uint8_t* src_ptr = src_argb;
-
- const uint64_t mask0 = 0;
- const uint64_t mask1 = 0x7fULL;
-
- const uint64_t shift2 = 2;
- const uint64_t shift9 = 9;
- const uint64_t shift7 = 7;
- const uint64_t shift16 = 16;
-
- __asm__ volatile(
- "1: \n\t"
- "dsrl %[xi], %[x], %[shift16] \n\t"
- "dsll %[xi], %[xi], %[shift2] \n\t"
-
- "dadd %[src_ptr], %[src_argb], %[xi] \n\t"
- "ldc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "dsrl %[xf], %[x], %[shift9] \n\t"
- "andi %[xf], %[xf], 0x7f \n\t"
- "xori %[nxf], %[xf], 0x7f \n\t"
- "dmtc1 %[xf], %[fxf] \n\t"
- "pshufh %[fxf], %[fxf], %[mask0] \n\t"
- "dmtc1 %[nxf], %[fnxf] \n\t"
- "pshufh %[fnxf], %[fnxf], %[mask0] \n\t"
-
- "pmullh %[src_lo], %[src_lo], %[fnxf] \n\t"
- "pmullh %[src_hi], %[src_hi], %[fxf] \n\t"
- "paddh %[dest], %[src_lo], %[src_hi] \n\t"
- "psrlh %[dest], %[dest], %[shift7] \n\t"
- "packushb %[dest], %[dest], %[mask0] \n\t"
-
- "dadd %[x], %[x], %[dx] \n\t"
-
- "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [fxf] "=&f"(fxf), [fnxf] "=&f"(fnxf),
- [xi] "=&r"(xi), [xf] "=&r"(xf), [nxf] "=&r"(nxf)
- : [src_argb] "r"(src_argb), [src_ptr] "r"(src_ptr),
- [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), [x] "r"(x),
- [dx] "r"(dx), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [shift2] "r"(shift2), [shift7] "f"(shift7), [shift9] "r"(shift9),
- [shift16] "r"(shift16)
- : "memory");
-}
-
// Divide num by div and return as 16.16 fixed point result.
/* LibYUVBaseTest.TestFixedDiv */
int FixedDiv_MIPS(int num, int div) {
@@ -1058,9 +1034,11 @@
__asm__ volatile(
"1: \n\t"
- "ldc1 %[src0], 0x00(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
"pmaddhw %[dest04], %[src0], %[mask0] \n\t"
- "ldc1 %[src1], 0x00(%[src2_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
"pmaddhw %[dest], %[src1], %[mask1] \n\t"
"paddw %[dest04], %[dest04], %[dest] \n\t"
"paddw %[dest04], %[dest04], %[ph] \n\t"
@@ -1072,9 +1050,11 @@
"paddw %[dest15], %[dest15], %[ph] \n\t"
"psrlw %[dest15], %[dest15], %[shift] \n\t"
- "ldc1 %[src0], 0x02(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
"pmaddhw %[dest26], %[src0], %[mask0] \n\t"
- "ldc1 %[src1], 0x02(%[src2_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
"pmaddhw %[dest], %[src1], %[mask1] \n\t"
"paddw %[dest26], %[dest26], %[dest] \n\t"
"paddw %[dest26], %[dest26], %[ph] \n\t"