Merge "Remove tile param"
diff --git a/build/make/Makefile b/build/make/Makefile
index b56b490..f1b1cca 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -22,8 +22,10 @@
exampletest: .DEFAULT
install:: .DEFAULT
test:: .DEFAULT
+test-no-data-check:: .DEFAULT
testdata:: .DEFAULT
utiltest: .DEFAULT
+exampletest-no-data-check utiltest-no-data-check: .DEFAULT
# Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
@@ -113,6 +115,9 @@
testdata::
.PHONY: utiltest
utiltest:
+.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
+test-no-data-check::
+exampletest-no-data-check utiltest-no-data-check:
# Add compiler flags for intrinsic files
ifeq ($(TOOLCHAIN), x86-os2-gcc)
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c5bed61..688fa12 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -728,6 +728,13 @@
# Handle darwin variants. Newer SDKs allow targeting older
# platforms, so use the newest one available.
case ${toolchain} in
+ arm*-darwin*)
+ ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+ if [ -d "${ios_sdk_dir}" ]; then
+ add_cflags "-isysroot ${ios_sdk_dir}"
+ add_ldflags "-isysroot ${ios_sdk_dir}"
+ fi
+ ;;
*-darwin*)
osx_sdk_dir="$(show_darwin_sdk_path macosx)"
if [ -d "${osx_sdk_dir}" ]; then
@@ -803,7 +810,14 @@
if disabled neon && enabled neon_asm; then
die "Disabling neon while keeping neon-asm is not supported"
fi
- soft_enable media
+ case ${toolchain} in
+ *-darwin*)
+ # Neon is guaranteed on iOS 6+ devices, while old media extensions
+ # no longer assemble with iOS 9 SDK
+ ;;
+ *)
+ soft_enable media
+ esac
;;
armv6)
soft_enable media
diff --git a/libs.mk b/libs.mk
index 0ca8379..6215990 100644
--- a/libs.mk
+++ b/libs.mk
@@ -508,11 +508,13 @@
define test_shard_template
test:: test_shard.$(1)
-test_shard.$(1): $(LIBVPX_TEST_BIN) testdata
+test-no-data-check:: test_shard_ndc.$(1)
+test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
@set -e; \
export GTEST_SHARD_INDEX=$(1); \
export GTEST_TOTAL_SHARDS=$(2); \
$(LIBVPX_TEST_BIN)
+test_shard.$(1): testdata
.PHONY: test_shard.$(1)
endef
@@ -557,15 +559,16 @@
# TODO(tomfinegan): Support running the debug versions of tools?
TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
endif
-utiltest: testdata
+utiltest utiltest-no-data-check:
$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
--test-data-path $(LIBVPX_TEST_DATA_PATH) \
--bin-path $(TEST_BIN_PATH)
$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
--test-data-path $(LIBVPX_TEST_DATA_PATH) \
--bin-path $(TEST_BIN_PATH)
+utiltest: testdata
else
-utiltest:
+utiltest utiltest-no-data-check:
@echo Unit tests must be enabled to make the utiltest target.
endif
@@ -583,11 +586,12 @@
# TODO(tomfinegan): Support running the debug versions of tools?
EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
endif
-exampletest: examples testdata
+exampletest exampletest-no-data-check: examples
$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
--test-data-path $(LIBVPX_TEST_DATA_PATH) \
--bin-path $(EXAMPLES_BIN_PATH)
+exampletest: testdata
else
-exampletest:
+exampletest exampletest-no-data-check:
@echo Unit tests must be enabled to make the exampletest target.
endif
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index f685fb4..6294af1 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -541,13 +541,13 @@
INSTANTIATE_TEST_CASE_P(
MSA, Trans4x4DCT,
::testing::Values(
- make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_msa, 1, VPX_BITS_8)));
+ make_tuple(&vp9_fdct4x4_msa, &vp9_idct4x4_16_add_msa, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
MSA, Trans4x4HT,
::testing::Values(
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
+ make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 352cde2..46d4a25 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -264,8 +264,8 @@
INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon,
vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon,
vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon,
- vp9_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
- vp9_tm_predictor_8x8_neon)
+ vp9_h_predictor_8x8_neon, vp9_d45_predictor_8x8_neon, NULL,
+ NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon)
#endif // HAVE_NEON
@@ -316,8 +316,8 @@
vp9_dc_left_predictor_16x16_neon,
vp9_dc_top_predictor_16x16_neon,
vp9_dc_128_predictor_16x16_neon, vp9_v_predictor_16x16_neon,
- vp9_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
- vp9_tm_predictor_16x16_neon)
+ vp9_h_predictor_16x16_neon, vp9_d45_predictor_16x16_neon, NULL,
+ NULL, NULL, NULL, NULL, vp9_tm_predictor_16x16_neon)
#endif // HAVE_NEON
#if HAVE_MSA
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 60424ed..0bdcc08 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -409,6 +409,7 @@
YUV_RAW_INPUT_HEIGHT=288
Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
# Setup a trap function to clean up after tests complete.
trap cleanup EXIT
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 56b5250..139c412 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -288,4 +288,16 @@
#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+ MSA, AverageTest,
+ ::testing::Values(
+ make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
+ make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
+ make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
+ make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
+ make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
+ make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+#endif
+
} // namespace
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index 1faa145..e899499 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -60,6 +60,10 @@
echo ""${Y4M_NOSQ_PAR_INPUT}""
}
+y4m_input_720p() {
+ echo ""${Y4M_720P_INPUT}""
+}
+
# Echo default vpxenc real time encoding params. $1 is the codec, which defaults
# to vp8 if unspecified.
vpxenc_rt_params() {
@@ -68,7 +72,7 @@
--buf-initial-sz=500
--buf-optimal-sz=600
--buf-sz=1000
- --cpu-used=-5
+ --cpu-used=-6
--end-usage=cbr
--error-resilient=1
--kf-max-dist=90000
@@ -258,6 +262,63 @@
fi
}
+vpxenc_vp9_webm_rt_multithread_tiled() {
+ if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
+ local readonly tilethread_min=2
+ local readonly tilethread_max=4
+ local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+ local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+ for threads in ${num_threads}; do
+ for tile_cols in ${num_tile_cols}; do
+ vpxenc $(y4m_input_720p) \
+ $(vpxenc_rt_params vp9) \
+ --threads=${threads} \
+ --tile-columns=${tile_cols} \
+ --output="${output}"
+ done
+ done
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+
+ rm "${output}"
+ fi
+}
+
+vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
+ if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
+ local readonly tilethread_min=2
+ local readonly tilethread_max=4
+ local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+ local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+ for threads in ${num_threads}; do
+ for tile_cols in ${num_tile_cols}; do
+ vpxenc $(y4m_input_720p) \
+ $(vpxenc_rt_params vp9) \
+ --threads=${threads} \
+ --tile-columns=${tile_cols} \
+ --frame-parallel=1 \
+ --output="${output}"
+ done
+ done
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+
+ rm "${output}"
+ fi
+}
+
vpxenc_vp9_webm_2pass() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
@@ -357,6 +418,8 @@
vpxenc_vp9_ivf
vpxenc_vp9_webm
vpxenc_vp9_webm_rt
+ vpxenc_vp9_webm_rt_multithread_tiled
+ vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
vpxenc_vp9_webm_2pass
vpxenc_vp9_ivf_lossless
vpxenc_vp9_ivf_minq0_maxq0
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
index 974d3b6..3c8ed11 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -12,7 +12,7 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
-static const uint16_t bilinear_taps_coeff[8][2] = {
+static const uint8_t bilinear_taps_coeff[8][2] = {
{128, 0},
{112, 16},
{ 96, 32},
@@ -972,9 +972,9 @@
int pixel_step,
unsigned int output_height,
unsigned int output_width,
- const uint16_t *vpx_filter) {
- const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
- const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+ const uint8_t *vpx_filter) {
+ const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]);
unsigned int i;
for (i = 0; i < output_height; ++i) {
const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 13c46a5..92706bf 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -338,6 +338,43 @@
dst[3 * stride + 3] = above[7];
}
+void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
+ static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
+ const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
+ const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
+ const uint8x8_t A0 = vld1_u8(above); // top row
+ const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
+ const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
+ const uint8x8_t avg1 = vhadd_u8(A0, A2);
+ uint8x8_t row = vrhadd_u8(avg1, A1);
+ int i;
+ (void)left;
+ for (i = 0; i < 7; ++i) {
+ vst1_u8(dst + i * stride, row);
+ row = vtbl1_u8(row, sh_12345677);
+ }
+ vst1_u8(dst + i * stride, row);
+}
+
+void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t A0 = vld1q_u8(above); // top row
+ const uint8x16_t above_right = vld1q_dup_u8(above + 15);
+ const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
+ const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
+ const uint8x16_t avg1 = vhaddq_u8(A0, A2);
+ uint8x16_t row = vrhaddq_u8(avg1, A1);
+ int i;
+ (void)left;
+ for (i = 0; i < 15; ++i) {
+ vst1q_u8(dst + i * stride, row);
+ row = vextq_u8(row, above_right, 1);
+ }
+ vst1q_u8(dst + i * stride, row);
+}
+
// -----------------------------------------------------------------------------
void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index 23b281f..4385075 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -440,6 +440,17 @@
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+/* Description : Store vectors of word elements with stride
+ Arguments : Inputs - in0, in1, stride
+ - pdst (destination pointer to store to)
+ Details : Store 4 word elements from 'in0' to (pdst)
+ Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) { \
+ ST_SW(in0, (pdst)); \
+ ST_SW(in1, (pdst) + stride); \
+}
+
/* Description : Store as 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride
Return Type - unsigned byte
@@ -467,6 +478,24 @@
SH(out3_m, pblk_2x4_m + 3 * stride); \
}
+/* Description : Store 4x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 word element from 'in' vector is copied to a GP
+ register and stored to (pdst)
+ Index 1 word element from 'in' vector is copied to a GP
+ register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+}
+
/* Description : Store as 4x4 byte block to destination memory from input vector
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
@@ -763,6 +792,39 @@
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+/* Description : Dot product & addition of halfword vector elements
+ Arguments : Inputs - mult0, mult1
+ cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+ Arguments : Inputs - mult0, mult1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed word element from 'mult0' is multiplied with itself
+ producing an intermediate result twice the size of input
+ i.e. signed double word
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \
+ out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
+ out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
+}
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
/* Description : Minimum values between unsigned elements of
either vector are copied to the output vector
Arguments : Inputs - in0, in1, min_vec
@@ -844,6 +906,34 @@
}
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is subtracted from
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+}
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed odd halfword element from 'in0' is subtracted from
+ even signed halfword element from 'in0' (pairwise) and the
+ word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
/* Description : Insert specified word elements from input vectors to 1
destination vector
Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
@@ -1472,6 +1562,22 @@
}
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+}
+
/* Description : Addition of 2 pairs of vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index dac1423..22a5efd 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -99,7 +99,7 @@
specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_8x8 neon/, "$ssse3_x86inc";
add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
@@ -138,7 +138,7 @@
specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_16x16 neon/, "$ssse3_x86inc";
add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
@@ -878,10 +878,10 @@
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2 neon/;
+specialize qw/vp9_avg_8x8 sse2 neon msa/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2/;
+specialize qw/vp9_avg_4x4 sse2 msa/;
add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vp9_minmax_8x8 sse2/;
@@ -916,7 +916,7 @@
# ENCODEMB INVOKE
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
+specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc";
#
# Denoiser
@@ -948,7 +948,7 @@
specialize qw/vp9_fdct8x8_quant/;
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
+ specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc";
add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp sse2/;
@@ -1023,7 +1023,7 @@
specialize qw/vp9_fdct32x32_rd sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4 sse2/;
+ specialize qw/vp9_fht4x4 sse2 msa/;
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht8x8 sse2 msa/;
@@ -1032,13 +1032,13 @@
specialize qw/vp9_fht16x16 sse2 msa/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+ specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4 sse2/;
+ specialize qw/vp9_fdct4x4 sse2 msa/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8_1 sse2 neon msa/;
@@ -1077,7 +1077,7 @@
specialize qw/vp9_full_range_search/;
add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2/;
+specialize qw/vp9_temporal_filter_apply sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vp9/encoder/mips/msa/vp9_avg_msa.c
new file mode 100644
index 0000000..f2e8b27
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_avg_msa.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+ uint32_t sum_out;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+ v4u32 sum = { 0 };
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
+ HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
+ ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
+ ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
+ sum0 += sum4;
+
+ sum = __msa_hadd_u_w(sum0, sum0);
+ sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
+ sum = __msa_hadd_u_w(sum0, sum0);
+ sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
+ sum_out = __msa_copy_u_w((v4i32)sum, 0);
+
+ return sum_out;
+}
+
+uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+ uint32_t sum_out;
+ uint32_t src0, src1, src2, src3;
+ v16u8 vec = { 0 };
+ v8u16 sum0;
+ v4u32 sum1;
+ v2u64 sum2;
+
+ LW4(src, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, vec);
+
+ sum0 = __msa_hadd_u_h(vec, vec);
+ sum1 = __msa_hadd_u_w(sum0, sum0);
+ sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
+ sum1 = __msa_hadd_u_w(sum0, sum0);
+ sum2 = __msa_hadd_u_d(sum1, sum1);
+ sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
+ sum_out = __msa_copy_u_w((v4i32)sum1, 0);
+
+ return sum_out;
+}
diff --git a/vp9/encoder/mips/msa/vp9_error_msa.c b/vp9/encoder/mips/msa/vp9_error_msa.c
new file mode 100644
index 0000000..9709092
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
+static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \
+ const int16_t *dq_coeff_ptr, \
+ int64_t *ssz) { \
+ int64_t err = 0; \
+ uint32_t loop_cnt; \
+ v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
+ v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
+ v2i64 sq_coeff_r, sq_coeff_l; \
+ v2i64 err0, err_dup0, err1, err_dup1; \
+ \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, \
+ sq_coeff_r, sq_coeff_l); \
+ DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ \
+ for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ } \
+ \
+ err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
+ err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
+ sq_coeff_r += err_dup0; \
+ sq_coeff_l += err_dup1; \
+ *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
+ *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
+ \
+ err_dup0 = __msa_splati_d(err0, 1); \
+ err_dup1 = __msa_splati_d(err1, 1); \
+ err0 += err_dup0; \
+ err1 += err_dup1; \
+ err = __msa_copy_s_d(err0, 0); \
+ err += __msa_copy_s_d(err1, 0); \
+ \
+ return err; \
+}
+
+BLOCK_ERROR_BLOCKSIZE_MSA(16);
+BLOCK_ERROR_BLOCKSIZE_MSA(64);
+BLOCK_ERROR_BLOCKSIZE_MSA(256);
+BLOCK_ERROR_BLOCKSIZE_MSA(1024);
+
+int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,
+ const tran_low_t *dq_coeff_ptr,
+ intptr_t blk_size, int64_t *ssz) {
+ int64_t err;
+ const int16_t *coeff = (const int16_t *)coeff_ptr;
+ const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+ switch (blk_size) {
+ case 16:
+ err = block_error_16size_msa(coeff, dq_coeff, ssz);
+ break;
+ case 64:
+ err = block_error_64size_msa(coeff, dq_coeff, ssz);
+ break;
+ case 256:
+ err = block_error_256size_msa(coeff, dq_coeff, ssz);
+ break;
+ case 1024:
+ err = block_error_1024size_msa(coeff, dq_coeff, ssz);
+ break;
+ default:
+ err = vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+ break;
+ }
+
+ return err;
+}
diff --git a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
new file mode 100644
index 0000000..790b4fb
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ in0 += in1;
+ in3 -= in2;
+ in4 = (in0 - in3) >> 1;
+ SUB2(in4, in1, in4, in2, in1, in2);
+ in0 -= in2;
+ in3 += in1;
+
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+ in0 += in2;
+ in1 -= in3;
+ in4 = (in0 - in1) >> 1;
+ SUB2(in4, in2, in4, in3, in2, in3);
+ in0 -= in3;
+ in1 += in2;
+
+ SLLI_4V(in0, in1, in2, in3, 2);
+
+ TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+ ST4x2_UB(in0, output, 4);
+ ST4x2_UB(in3, output + 4, 4);
+ ST4x2_UB(in1, output + 8, 4);
+ ST4x2_UB(in2, output + 12, 4);
+}
+
+void vp9_fdct4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 vec, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15);
+ SLLI_4V(in0, in1, in2, in3, 4);
+ vec = __msa_ceqi_h(in0, 0);
+ vec = vec ^ 255;
+ vec = mask & vec;
+ in0 += vec;
+ }
+
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ SRA_4V(in0, in1, in2, in3, 2);
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
+
+void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 temp, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15);
+ SLLI_4V(in0, in1, in2, in3, 4);
+ temp = __msa_ceqi_h(in0, 0);
+ temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
+ temp = mask & temp;
+ in0 += temp;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ SRA_4V(in0, in1, in2, in3, 2);
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h
index 3523a3d..ad66576 100644
--- a/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -190,6 +190,67 @@
vec1 >>= 2; \
}
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \
+ cospi_24_64, -cospi_8_64, 0, 0, 0 }; \
+ \
+ BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
+ vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
+ cnst2_m = __msa_splati_h(coeff_m, 2); \
+ cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
+ vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \
+ vec7_m, vec7_m, out0, out2, out1, out3); \
+}
+
+#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \
+ v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \
+ \
+ UNPCK_R_SH_SW(in0, in0_r_m); \
+ UNPCK_R_SH_SW(in1, in1_r_m); \
+ UNPCK_R_SH_SW(in2, in2_r_m); \
+ UNPCK_R_SH_SW(in3, in3_r_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_4_9); \
+ MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_1_9); \
+ s0_m += in0_r_m * constant_m; \
+ s1_m -= in1_r_m * constant_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_2_9); \
+ s0_m += in1_r_m * constant_m; \
+ s1_m += in3_r_m * constant_m; \
+ \
+ s2_m = in0_r_m + in1_r_m - in3_r_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_3_9); \
+ MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \
+ \
+ in0_r_m = s0_m + s3_m; \
+ s2_m = s1_m - s3_m; \
+ s3_m = s1_m - s0_m + s3_m; \
+ \
+ SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, \
+ s3_m, s3_m, out0, out1, out2, out3); \
+}
+
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
diff --git a/vp9/encoder/mips/msa/vp9_subtract_msa.c b/vp9/encoder/mips/msa/vp9_subtract_msa.c
new file mode 100644
index 0000000..1b8b694
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_subtract_msa.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t pred0, pred1, pred2, pred3;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ uint64_t src0, src1, pred0, pred1;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ LD2(src_ptr, src_stride, src0, src1);
+ src_ptr += (2 * src_stride);
+ LD2(pred_ptr, pred_stride, pred0, pred1);
+ pred_ptr += (2 * pred_stride);
+
+ INSERT_D2_SB(src0, src1, src);
+ INSERT_D2_SB(pred0, pred1, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+ diff_ptr += (2 * diff_stride);
+ }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ int8_t count;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (count = 2; count--;) {
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LD_SB8(pred, pred_stride,
+ pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
+ pred += (8 * pred_stride);
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ LD_SB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_SB2(src, 16, src2, src3);
+ src += src_stride;
+ LD_SB2(src, 16, src4, src5);
+ src += src_stride;
+ LD_SB2(src, 16, src6, src7);
+ src += src_stride;
+
+ LD_SB2(pred, 16, pred0, pred1);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred2, pred3);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred4, pred5);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_SB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+
+ LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+ pred += pred_stride;
+ LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+ }
+}
+
+void vp9_subtract_block_msa(int32_t rows, int32_t cols,
+ int16_t *diff_ptr, ptrdiff_t diff_stride,
+ const uint8_t *src_ptr, ptrdiff_t src_stride,
+ const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ default:
+ vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
new file mode 100644
index 0000000..4053bff
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
+ uint32_t stride,
+ uint8_t *frm2_ptr,
+ int32_t filt_sth,
+ int32_t filt_wgt,
+ uint32_t *acc,
+ uint16_t *cnt) {
+ uint32_t row;
+ uint64_t f0, f1, f2, f3;
+ v16i8 frm2, frm1 = { 0 };
+ v16i8 frm4, frm3 = { 0 };
+ v16u8 frm_r, frm_l;
+ v8i16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 2; row--;) {
+ LD4(frm1_ptr, stride, f0, f1, f2, f3);
+ frm1_ptr += (4 * stride);
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 32;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ INSERT_D2_SB(f0, f1, frm1);
+ INSERT_D2_SB(f2, f3, frm3);
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+ diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+ diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+ UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+ }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
+ uint32_t stride,
+ uint8_t *frm2_ptr,
+ int32_t filt_sth,
+ int32_t filt_wgt,
+ uint32_t *acc,
+ uint16_t *cnt) {
+ uint32_t row;
+ v16i8 frm1, frm2, frm3, frm4;
+ v16u8 frm_r, frm_l;
+ v16i8 zero = { 0 };
+ v8u16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 8; row--;) {
+ LD_SB2(frm1_ptr, stride, frm1, frm3);
+ frm1_ptr += stride;
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 16;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ frm1_ptr += stride;
+ frm2_ptr += 16;
+ }
+}
+
+void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+ uint8_t *frame2_ptr, uint32_t blk_w,
+ uint32_t blk_h, int32_t strength,
+ int32_t filt_wgt, uint32_t *accu,
+ uint16_t *cnt) {
+ if (8 == (blk_w * blk_h)) {
+ temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
+ strength, filt_wgt, accu, cnt);
+ } else if (16 == (blk_w * blk_h)) {
+ temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+ strength, filt_wgt, accu, cnt);
+ } else {
+ vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+ strength, filt_wgt, accu, cnt);
+ }
+}
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 0e4d863..6270bf4 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -51,7 +51,7 @@
// Rate target ratio to set q delta.
double rate_ratio_qdelta;
// Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
- double rate_boost_fac;
+ int rate_boost_fac;
double low_content_avg;
int qindex_delta[3];
};
@@ -129,7 +129,8 @@
else if (bsize >= BLOCK_16X16 &&
rate < cr->thresh_rate_sb &&
is_inter_block(mbmi) &&
- mbmi->mv[0].as_int == 0)
+ mbmi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10)
// More aggressive delta-q for bigger blocks with zero motion.
return CR_SEGMENT_ID_BOOST2;
else
@@ -464,10 +465,10 @@
cm->height <= 288 &&
rc->avg_frame_bandwidth < 3400) {
cr->motion_thresh = 4;
- cr->rate_boost_fac = 1.25;
+ cr->rate_boost_fac = 10;
} else {
cr->motion_thresh = 32;
- cr->rate_boost_fac = 1.7;
+ cr->rate_boost_fac = 17;
}
}
@@ -541,9 +542,9 @@
vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
// Set a more aggressive (higher) q delta for segment BOOST2.
- qindex_delta = compute_deltaq(cpi, cm->base_qindex,
- MIN(CR_MAX_RATE_TARGET_RATIO,
- cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ qindex_delta = compute_deltaq(
+ cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
cr->qindex_delta[2] = qindex_delta;
vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d1289fb..cd8c4e1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2888,8 +2888,7 @@
if (xd->lossless)
return ONLY_4X4;
if (cpi->common.frame_type == KEY_FRAME &&
- cpi->sf.use_nonrd_pick_mode &&
- cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ cpi->sf.use_nonrd_pick_mode)
return ALLOW_16X16;
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
return ALLOW_32X32;
@@ -3575,15 +3574,26 @@
set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
xd->mi[0]->mbmi.segment_id) {
- x->max_partition_size = BLOCK_64X64;
+ // Use lower max_partition_size for low resoultions.
+ if (cm->width <= 352 && cm->height <= 288)
+ x->max_partition_size = BLOCK_32X32;
+ else
+ x->max_partition_size = BLOCK_64X64;
x->min_partition_size = BLOCK_8X8;
nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
BLOCK_64X64, &dummy_rdc, 1,
INT64_MAX, td->pc_root);
} else {
choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
- nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
- BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ // TODO(marpan): Seems like nonrd_select_partition does not support
+ // 4x4 partition. Since 4x4 is used on key frame, use this switch
+ // for now.
+ if (cm->frame_type == KEY_FRAME)
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ else
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
}
break;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 425073f..85003f6 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1596,7 +1596,10 @@
target = calc_pframe_target_size_one_pass_cbr(cpi);
vp9_rc_set_frame_target(cpi, target);
- cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+ cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
+ else
+ cpi->resize_state = 0;
}
int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1781,7 +1784,7 @@
++cpi->resize_buffer_underflow;
++cpi->resize_count;
// Check for resize action every "window" frames.
- if (cpi->resize_count == window) {
+ if (cpi->resize_count >= window) {
int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
// Resize down if buffer level has underflowed sufficent amount in past
// window, and we are at original resolution.
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 4743c7c..e78c111 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,9 +152,14 @@
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))