Merge "examples.mk,vpxdec: rm libwebm muxer dependency" into main
diff --git a/README b/README
index e360df0..9fa50038 100644
--- a/README
+++ b/README
@@ -64,9 +64,15 @@
arm64-android-gcc
arm64-darwin-gcc
arm64-darwin20-gcc
+ arm64-darwin21-gcc
+ arm64-darwin22-gcc
arm64-linux-gcc
arm64-win64-gcc
arm64-win64-vs15
+ arm64-win64-vs16
+ arm64-win64-vs16-clangcl
+ arm64-win64-vs17
+ arm64-win64-vs17-clangcl
armv7-android-gcc
armv7-darwin-gcc
armv7-linux-rvct
@@ -77,6 +83,8 @@
armv7-win32-vs15
armv7s-darwin-gcc
armv8-linux-gcc
+ loongarch32-linux-gcc
+ loongarch64-linux-gcc
mips32-linux-gcc
mips64-linux-gcc
ppc64le-linux-gcc
@@ -117,6 +125,8 @@
x86_64-darwin18-gcc
x86_64-darwin19-gcc
x86_64-darwin20-gcc
+ x86_64-darwin21-gcc
+ x86_64-darwin22-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
diff --git a/build/make/Makefile b/build/make/Makefile
index 5c38c18..65ac229 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -304,6 +304,19 @@
$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
endef
+# Don't use -Wl,-z,defs with Clang's sanitizers.
+#
+# Clang's AddressSanitizer documentation says "When linking shared libraries,
+# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link
+# errors (don't use it with AddressSanitizer)." See
+# https://clang.llvm.org/docs/AddressSanitizer.html#usage.
+NO_UNDEFINED := -Wl,-z,defs
+ifeq ($(findstring clang,$(CC)),clang)
+ ifneq ($(filter -fsanitize=%,$(LDFLAGS)),)
+ NO_UNDEFINED :=
+ endif
+endif
+
define so_template
# Not using a pattern rule here because we don't want to generate empty
# archives when they are listed as a dependency in files not responsible
@@ -313,7 +326,8 @@
$(1):
$(if $(quiet),@echo " [LD] $$@")
$(qexec)$$(LD) -shared $$(LDFLAGS) \
- -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+ $(NO_UNDEFINED) \
+ -Wl,-soname,$$(SONAME) \
-Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
$$(filter %.o,$$^) $$(extralibs)
endef
diff --git a/build/make/configure.sh b/build/make/configure.sh
index ec9af5e..6fd67f1 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -521,6 +521,7 @@
EXE_SFX = ${EXE_SFX}
VCPROJ_SFX = ${VCPROJ_SFX}
RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
EOF
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 58bb66b..1e1db05 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -141,7 +141,17 @@
case "$opt" in
--help|-h) show_help
;;
- --target=*) target="${optval}"
+ --target=*)
+ target="${optval}"
+ platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}')
+ case "$platform_toolset" in
+ clangcl) platform_toolset="ClangCl"
+ ;;
+ "")
+ ;;
+ *) die Unrecognized Visual Studio Platform Toolset in $opt
+ ;;
+ esac
;;
--out=*) outfile="$optval"
;;
@@ -259,6 +269,10 @@
;;
arm64*)
platforms[0]="ARM64"
+ # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC.
+ if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then
+ platforms[1]="ARM64EC"
+ fi
asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)""
asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)""
;;
@@ -335,17 +349,21 @@
else
tag_content ConfigurationType StaticLibrary
fi
- if [ "$vs_ver" = "14" ]; then
- tag_content PlatformToolset v140
- fi
- if [ "$vs_ver" = "15" ]; then
- tag_content PlatformToolset v141
- fi
- if [ "$vs_ver" = "16" ]; then
- tag_content PlatformToolset v142
- fi
- if [ "$vs_ver" = "17" ]; then
- tag_content PlatformToolset v143
+ if [ -n "$platform_toolset" ]; then
+ tag_content PlatformToolset "$platform_toolset"
+ else
+ if [ "$vs_ver" = "14" ]; then
+ tag_content PlatformToolset v140
+ fi
+ if [ "$vs_ver" = "15" ]; then
+ tag_content PlatformToolset v141
+ fi
+ if [ "$vs_ver" = "16" ]; then
+ tag_content PlatformToolset v142
+ fi
+ if [ "$vs_ver" = "17" ]; then
+ tag_content PlatformToolset v143
+ fi
fi
tag_content CharacterSet Unicode
if [ "$config" = "Release" ]; then
diff --git a/configure b/configure
index 2070772..b73436b 100755
--- a/configure
+++ b/configure
@@ -106,7 +106,9 @@
all_platforms="${all_platforms} arm64-win64-gcc"
all_platforms="${all_platforms} arm64-win64-vs15"
all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs16-clangcl"
all_platforms="${all_platforms} arm64-win64-vs17"
+all_platforms="${all_platforms} arm64-win64-vs17-clangcl"
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
@@ -647,6 +649,7 @@
check_add_cflags -Wimplicit-function-declaration
check_add_cflags -Wmissing-declarations
check_add_cflags -Wmissing-prototypes
+ check_add_cflags -Wshadow
check_add_cflags -Wuninitialized
check_add_cflags -Wunreachable-code-loop-increment
check_add_cflags -Wunused
@@ -677,13 +680,16 @@
check_add_cxxflags -Wc++17-extensions
check_add_cxxflags -Wc++20-extensions
- # disable some warnings specific to libyuv.
+ # disable some warnings specific to libyuv / libwebm.
check_cxxflags -Wno-missing-declarations \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
check_cxxflags -Wno-missing-prototypes \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
check_cxxflags -Wno-pass-failed \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
+ check_cxxflags -Wno-shadow \
+ && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \
+ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow"
check_cxxflags -Wno-unused-parameter \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
fi
diff --git a/examples.mk b/examples.mk
index 9f83230..22726a3 100644
--- a/examples.mk
+++ b/examples.mk
@@ -57,6 +57,7 @@
# Add compile flags and include path for libwebm sources.
ifeq ($(CONFIG_WEBM_IO),yes)
CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+ $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
endif
diff --git a/libs.mk b/libs.mk
index 1411fee..f6f6cc9 100644
--- a/libs.mk
+++ b/libs.mk
@@ -178,6 +178,7 @@
INSTALL-LIBS-yes += include/vpx/vpx_integer.h
INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index f747c35..d8fabd5 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -81,11 +81,11 @@
// Only the reference buffer may have a stride not equal to width.
Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
ASSERT_TRUE(ref.Init());
- Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(pred.Init());
- Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(avg_ref.Init());
- Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(avg_chk.Init());
const int bitdepth_mask = (1 << bitdepth) - 1;
for (int h = 0; h < height; ++h) {
@@ -121,11 +121,11 @@
const int height = 32;
Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
ASSERT_TRUE(ref.Init());
- Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(pred.Init());
- Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(avg_ref.Init());
- Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(avg_chk.Init());
for (int i = 0; i < 500; ++i) {
@@ -167,9 +167,9 @@
const int height = 1 << height_pow;
Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
ASSERT_TRUE(ref.Init());
- Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(pred.Init());
- Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
+ Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
ASSERT_TRUE(avg.Init());
const int bitdepth_mask = (1 << bitdepth) - 1;
for (int h = 0; h < height; ++h) {
@@ -217,6 +217,11 @@
::testing::Values(&vpx_comp_avg_pred_sse2));
#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+ ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif // HAVE_AVX2
+
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_neon));
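The test buffers above move from 16- to 32-byte alignment because the new AVX2 instantiation works on 256-bit registers, and aligned 256-bit loads require 32-byte-aligned addresses. A minimal illustrative sketch of why the alignment matters (the helper name and buffer size are assumptions, not part of the patch):

  #include <immintrin.h>
  #include <stdint.h>
  #include <string.h>
  #include "vpx_mem/vpx_mem.h" /* vpx_memalign()/vpx_free() */

  static void demo_avx2_aligned_load(void) {
    /* 32-byte alignment keeps the aligned load _mm256_load_si256() legal;
     * a pointer that is only 16-byte aligned could fault here. */
    uint8_t *buf = (uint8_t *)vpx_memalign(32, 64);
    if (buf == NULL) return;
    memset(buf, 0, 64);
    const __m256i v = _mm256_load_si256((const __m256i *)buf);
    (void)v;
    vpx_free(buf);
  }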
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index e435ed8..e8a044a 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -11,6 +11,7 @@
#include <climits>
#include <cstring>
#include <initializer_list>
+#include <new>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
@@ -20,7 +21,7 @@
#include "./vpx_config.h"
#include "vpx/vp8cx.h"
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
namespace {
@@ -368,7 +369,7 @@
: public ::libvpx_test::EncoderTest,
public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
public:
- EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {}
+ EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {}
~EncodeApiGetTplStatsTest() override {}
protected:
@@ -384,36 +385,82 @@
}
}
- vpx_codec_err_t AllocateTplList(VpxTplFrameStats **data) {
- // Allocate MAX_ARF_GOP_SIZE * sizeof(VpxTplFrameStats) that will be filled
- // by VP9E_GET_TPL_STATS
- *data =
+ vpx_codec_err_t AllocateTplList(VpxTplGopStats *data) {
+ // Allocate MAX_ARF_GOP_SIZE (50) * sizeof(VpxTplFrameStats) that will be
+ // filled by VP9E_GET_TPL_STATS.
+ // MAX_ARF_GOP_SIZE is used here because the test doesn't know the size of
+ // each GOP before getting TPL stats from the encoder.
+ data->size = 50;
+ data->frame_stats_list =
static_cast<VpxTplFrameStats *>(calloc(50, sizeof(VpxTplFrameStats)));
- if (*data == nullptr) return VPX_CODEC_MEM_ERROR;
+ if (data->frame_stats_list == nullptr) return VPX_CODEC_MEM_ERROR;
return VPX_CODEC_OK;
}
+ void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats,
+ const VpxTplGopStats &test_gop_stats) {
+ ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size);
+ for (int frame = 0; frame < ref_gop_stats.size; frame++) {
+ const VpxTplFrameStats &ref_frame_stats =
+ ref_gop_stats.frame_stats_list[frame];
+ const VpxTplFrameStats &test_frame_stats =
+ test_gop_stats.frame_stats_list[frame];
+ ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks);
+ ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width);
+ ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height);
+ for (int block = 0; block < ref_frame_stats.num_blocks; block++) {
+ const VpxTplBlockStats &ref_block_stats =
+ ref_frame_stats.block_stats_list[block];
+ const VpxTplBlockStats &test_block_stats =
+ test_frame_stats.block_stats_list[block];
+ ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost);
+ ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost);
+ ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c);
+ ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r);
+ ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist);
+ ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate);
+ ASSERT_EQ(ref_block_stats.ref_frame_index,
+ test_block_stats.ref_frame_index);
+ }
+ }
+ }
+
void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
::libvpx_test::CxDataIterator iter = encoder->GetCxData();
while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
switch (pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT: {
- VpxTplFrameStats *tpl_stats = NULL;
+ VpxTplGopStats tpl_stats;
EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK);
- encoder->Control(VP9E_GET_TPL_STATS, tpl_stats);
+ encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats);
bool stats_not_all_zero = false;
- for (unsigned int i = 0; i < cfg_.g_lag_in_frames; i++) {
- if (tpl_stats[i].frame_width != 0) {
- ASSERT_EQ(tpl_stats[i].frame_width, width_);
- ASSERT_EQ(tpl_stats[i].frame_height, height_);
- ASSERT_GT(tpl_stats[i].num_blocks, 0);
- ASSERT_NE(tpl_stats[i].block_stats_list, nullptr);
+ for (int i = 0; i < tpl_stats.size; i++) {
+ VpxTplFrameStats *frame_stats_list = tpl_stats.frame_stats_list;
+ if (frame_stats_list[i].frame_width != 0) {
+ ASSERT_EQ(frame_stats_list[i].frame_width, width_);
+ ASSERT_EQ(frame_stats_list[i].frame_height, height_);
+ ASSERT_GT(frame_stats_list[i].num_blocks, 0);
+ ASSERT_NE(frame_stats_list[i].block_stats_list, nullptr);
stats_not_all_zero = true;
}
}
ASSERT_TRUE(stats_not_all_zero);
- // Free the memory right away now as this is only a test.
- free(tpl_stats);
+ if (test_io_ && tpl_stats.size > 0) {
+ libvpx_test::TempOutFile *temp_out_file =
+ new (std::nothrow) libvpx_test::TempOutFile("w+");
+ ASSERT_NE(temp_out_file, nullptr);
+ ASSERT_NE(temp_out_file->file(), nullptr);
+ vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats);
+ rewind(temp_out_file->file());
+ VpxTplGopStats gop_stats_io;
+ ASSERT_EQ(
+ vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io),
+ VPX_CODEC_OK);
+ CompareTplGopStats(gop_stats_io, tpl_stats);
+ vpx_free_tpl_gop_stats(&gop_stats_io);
+ delete temp_out_file;
+ }
+ free(tpl_stats.frame_stats_list);
break;
}
default: break;
@@ -423,6 +470,7 @@
int width_;
int height_;
+ bool test_io_;
};
TEST_P(EncodeApiGetTplStatsTest, GetTplStats) {
@@ -430,7 +478,17 @@
width_ = 352;
height_ = 288;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_,
- height_, 30, 1, 0, 150);
+ height_, 30, 1, 0, 50);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) {
+ cfg_.g_lag_in_frames = 25;
+ width_ = 352;
+ height_ = 288;
+ test_io_ = true;
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_,
+ height_, 30, 1, 0, 50);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
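The new GetTplStatsIO test round-trips the per-GOP TPL stats through a temporary file. Condensed into a stand-alone helper, the calls it exercises look roughly like this (the helper name and stream handling are illustrative assumptions; the vpx_*_tpl_gop_stats calls follow the usage in the test above):

  #include <stdio.h>
  #include "vpx/vpx_tpl.h"

  /* Write the GOP stats to a read/write stream, rewind, read them back,
   * then release the copy the reader allocated. */
  static vpx_codec_err_t tpl_stats_round_trip(FILE *f, VpxTplGopStats *stats) {
    VpxTplGopStats copy;
    vpx_write_tpl_gop_stats(f, stats); /* return value ignored, as in the test */
    rewind(f);
    const vpx_codec_err_t err = vpx_read_tpl_gop_stats(f, &copy);
    if (err == VPX_CODEC_OK) vpx_free_tpl_gop_stats(&copy);
    return err;
  }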
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index a5cd830..165fcfa 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -19,7 +19,7 @@
#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
#include "vpx/vp8cx.h"
#endif
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
namespace libvpx_test {
@@ -154,7 +154,7 @@
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
- void Control(int ctrl_id, VpxTplFrameStats *arg) {
+ void Control(int ctrl_id, VpxTplGopStats *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
diff --git a/test/test.mk b/test/test.mk
index bbcdd0c..b64e89b 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -85,6 +85,7 @@
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
endif
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc
diff --git a/test/video_source.h b/test/video_source.h
index a10ff6f..5ed99d0 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -64,7 +64,7 @@
return fopen(path_to_source.c_str(), "rb");
}
-static FILE *GetTempOutFile(std::string *file_name) {
+static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) {
file_name->clear();
#if defined(_WIN32)
char fname[MAX_PATH];
@@ -73,7 +73,7 @@
// Assume for now that the filename generated is unique per process
if (GetTempFileNameA(tmppath, "lvx", 0, fname)) {
file_name->assign(fname);
- return fopen(fname, "wb+");
+ return fopen(fname, io_mode);
}
}
return nullptr;
@@ -94,13 +94,16 @@
const int fd = mkstemp(temp_file_name.get());
if (fd == -1) return nullptr;
*file_name = temp_file_name.get();
- return fdopen(fd, "wb+");
+ return fdopen(fd, io_mode);
#endif
}
class TempOutFile {
public:
- TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+ TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); }
+ TempOutFile(const char *io_mode) {
+ file_ = GetTempOutFile(&file_name_, io_mode);
+ }
~TempOutFile() {
CloseFile();
if (!file_name_.empty()) {
diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index 7cb3c98..cc85b9a 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -40,160 +40,160 @@
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
#if (__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- asm volatile("lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+ uint32_t lw_val_m; \
+ \
+ asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \
+ \
+ : [lw_val_m] "=r"(lw_val_m) \
+ : [lw_psrc_m] "m"(*lw_psrc_m)); \
+ \
+ lw_val_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- asm volatile("ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+ uint64_t ld_val_m = 0; \
+ \
+ asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \
+ \
+ : [ld_val_m] "=r"(ld_val_m) \
+ : [ld_psrc_m] "m"(*ld_psrc_m)); \
+ \
+ ld_val_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_ld = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_ld); \
- val1_m = LW(psrc_ld + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+ uint32_t ld_val0_m, ld_val1_m; \
+ uint64_t ld_val_m = 0; \
+ \
+ ld_val0_m = LW(ld_psrc_m); \
+ ld_val1_m = LW(ld_psrc_m + 4); \
+ \
+ ld_val_m = (uint64_t)(ld_val1_m); \
+ ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+ ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \
+ \
+ ld_val_m; \
})
#endif // (__mips == 64)
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- asm volatile("sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SH(val, pdst) \
+ { \
+ uint8_t *sh_pdst_m = (uint8_t *)(pdst); \
+ const uint16_t sh_val_m = (val); \
+ \
+ asm volatile("sh %[sh_val_m], %[sh_pdst_m] \n\t" \
+ \
+ : [sh_pdst_m] "=m"(*sh_pdst_m) \
+ : [sh_val_m] "r"(sh_val_m)); \
}
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- asm volatile("sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SW(val, pdst) \
+ { \
+ uint8_t *sw_pdst_m = (uint8_t *)(pdst); \
+ const uint32_t sw_val_m = (val); \
+ \
+ asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \
+ \
+ : [sw_pdst_m] "=m"(*sw_pdst_m) \
+ : [sw_val_m] "r"(sw_val_m)); \
}
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- asm volatile("sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SD(val, pdst) \
+ { \
+ uint8_t *sd_pdst_m = (uint8_t *)(pdst); \
+ const uint64_t sd_val_m = (val); \
+ \
+ asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \
+ \
+ : [sd_pdst_m] "=m"(*sd_pdst_m) \
+ : [sd_val_m] "r"(sd_val_m)); \
}
#else // !(__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- asm volatile( \
- "lwr %[val_m], 0(%[psrc_m]) \n\t" \
- "lwl %[val_m], 3(%[psrc_m]) \n\t" \
- : [val_m] "=&r"(val_m) \
- : [psrc_m] "r"(psrc_m)); \
- \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+ uint32_t lw_val_m; \
+ \
+ asm volatile( \
+ "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \
+ "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \
+ : [lw_val_m] "=&r"(lw_val_m) \
+ : [lw_psrc_m] "r"(lw_psrc_m)); \
+ \
+ lw_val_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- asm volatile( \
- "ldr %[val_m], 0(%[psrc_m]) \n\t" \
- "ldl %[val_m], 7(%[psrc_m]) \n\t" \
- : [val_m] "=&r"(val_m) \
- : [psrc_m] "r"(psrc_m)); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+ uint64_t ld_val_m = 0; \
+ \
+ asm volatile( \
+ "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \
+ "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \
+ : [ld_val_m] "=&r"(ld_val_m) \
+ : [ld_psrc_m] "r"(ld_psrc_m)); \
+ \
+ ld_val_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t ld_val0_m, ld_val1_m; \
+ uint64_t ld_val_m = 0; \
+ \
+ ld_val0_m = LW(ld_psrc_m1); \
+ ld_val1_m = LW(ld_psrc_m1 + 4); \
+ \
+ ld_val_m = (uint64_t)(ld_val1_m); \
+ ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+ ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \
+ \
+ ld_val_m; \
})
#endif // (__mips == 64)
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- asm volatile("ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SH(val, pdst) \
+ { \
+ uint8_t *sh_pdst_m = (uint8_t *)(pdst); \
+ const uint16_t sh_val_m = (val); \
+ \
+ asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \
+ \
+ : [sh_pdst_m] "=m"(*sh_pdst_m) \
+ : [sh_val_m] "r"(sh_val_m)); \
}
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- asm volatile("usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SW(val, pdst) \
+ { \
+ uint8_t *sw_pdst_m = (uint8_t *)(pdst); \
+ const uint32_t sw_val_m = (val); \
+ \
+ asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \
+ \
+ : [sw_pdst_m] "=m"(*sw_pdst_m) \
+ : [sw_val_m] "r"(sw_val_m)); \
}
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
- \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- \
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
+#define SD(val, pdst) \
+ { \
+ uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \
+ uint32_t sd_val0_m, sd_val1_m; \
+ \
+ sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(sd_val0_m, sd_pdst_m1); \
+ SW(sd_val1_m, sd_pdst_m1 + 4); \
}
#endif // (__mips_isa_rev >= 6)
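The rename above gives every macro-local a per-macro prefix (lw_, ld_, sh_, sw_, sd_) because these load/store macros nest (on 32-bit targets LD() is built from two LW() calls), and identically named locals in nested statement expressions trip the -Wshadow warning newly enabled in configure. A reduced illustration of the problem, with made-up macro names (GNU statement expressions, as in the real macros):

  #include <stdint.h>

  /* Both macros use a local called val_m, so expanding LOAD32() inside
   * LOAD64() shadows the outer val_m and -Wshadow fires. */
  #define LOAD32(p)                  \
    ({                               \
      uint32_t val_m;                \
      val_m = *(const uint8_t *)(p); \
      val_m;                         \
    })
  #define LOAD64(p)                                              \
    ({                                                           \
      uint64_t val_m; /* shadowed by LOAD32()'s val_m */         \
      val_m = LOAD32(p);                                         \
      val_m |= (uint64_t)LOAD32((const uint8_t *)(p) + 4) << 32; \
      val_m;                                                     \
    })

  static uint64_t demo(const uint8_t *src) { return LOAD64(src); }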
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index a6bedc4..56500a8 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -135,27 +135,6 @@
int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
int vp8_remove_decoder_instances(struct frame_buffers *fb);
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr) \
- do { \
- assert(pbi->common.error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval " at %s:%d", __FILE__, \
- __LINE__); \
- } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr) \
- do { \
- assert(pbi->common.error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval); \
- } while (0)
-#endif
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 490f62d..9ea6a4f 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -30,11 +30,13 @@
#include "error_concealment.h"
#endif
-#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
-#define CALLOC_ARRAY_ALIGNED(p, n, algn) \
- do { \
- CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
- memset((p), 0, (n) * sizeof(*(p))); \
+#define CALLOC_ARRAY(p, n) \
+ CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) \
+ do { \
+ CHECK_MEM_ERROR(&pbi->common.error, (p), \
+ vpx_memalign((algn), sizeof(*(p)) * (n))); \
+ memset((p), 0, (n) * sizeof(*(p))); \
} while (0)
static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
@@ -754,7 +756,7 @@
uv_width = width >> 1;
/* Allocate a vpx_atomic_int for each mb row. */
- CHECK_MEM_ERROR(pbi->mt_current_mb_col,
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col,
vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
for (i = 0; i < pc->mb_rows; ++i)
vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
@@ -762,7 +764,7 @@
/* Allocate memory for above_row buffers. */
CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i) {
- CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i],
vpx_memalign(16, sizeof(unsigned char) *
(width + (VP8BORDERINPIXELS << 1))));
vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1));
@@ -770,7 +772,7 @@
CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i) {
- CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i],
vpx_memalign(16, sizeof(unsigned char) *
(uv_width + VP8BORDERINPIXELS)));
vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS);
@@ -778,7 +780,7 @@
CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i) {
- CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i],
vpx_memalign(16, sizeof(unsigned char) *
(uv_width + VP8BORDERINPIXELS)));
vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS);
@@ -787,17 +789,17 @@
/* Allocate memory for left_col buffers. */
CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
- CHECK_MEM_ERROR(pbi->mt_yleft_col[i],
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i],
vpx_calloc(sizeof(unsigned char) * 16, 1));
CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
- CHECK_MEM_ERROR(pbi->mt_uleft_col[i],
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i],
vpx_calloc(sizeof(unsigned char) * 8, 1));
CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
- CHECK_MEM_ERROR(pbi->mt_vleft_col[i],
+ CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i],
vpx_calloc(sizeof(unsigned char) * 8, 1));
}
}
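Here and in the hunks below, the codec-private CHECK_MEM_ERROR macros (deleted from vp8/decoder/onyxd_int.h above, and from vp8/encoder/onyx_int.h and vp9/common/vp9_common.h later in this change) are replaced by a single shared variant whose first argument is the vpx_internal_error_info to report through, so every call site now names the error context explicitly. A minimal sketch of that shared macro, assuming it keeps the semantics of the removed definitions (the real one presumably lives with the common codec internals):

  #define CHECK_MEM_ERROR(error, lval, expr)                \
    do {                                                    \
      assert((error)->setjmp);                              \
      (lval) = (expr);                                      \
      if (!(lval))                                          \
        vpx_internal_error((error), VPX_CODEC_MEM_ERROR,    \
                           "Failed to allocate " #lval);    \
    } while (0)

  /* Call sites then read, for example:
   *   CHECK_MEM_ERROR(&pbi->common.error, pbi->mt_current_mb_col,
   *                   vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
   */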
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 6201075..dc29945 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -123,7 +123,7 @@
unsigned int tmp;
/* Create a list to sort to */
- CHECK_MEM_ERROR(sortlist,
+ CHECK_MEM_ERROR(&cpi->common.error, sortlist,
vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
/* Copy map to sort list */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index cb35f4f..2583cb0 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -510,16 +510,16 @@
if (th_count == 0) return 0;
- CHECK_MEM_ERROR(cpi->h_encoding_thread,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
vpx_malloc(sizeof(pthread_t) * th_count));
- CHECK_MEM_ERROR(cpi->h_event_start_encoding,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
vpx_malloc(sizeof(sem_t) * th_count));
- CHECK_MEM_ERROR(cpi->h_event_end_encoding,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
vpx_malloc(sizeof(sem_t) * th_count));
- CHECK_MEM_ERROR(cpi->mb_row_ei,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
- CHECK_MEM_ERROR(cpi->en_thread_data,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data,
vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
vpx_atomic_store_release(&cpi->b_multi_threaded, 1);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index a780048..8941329 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1169,7 +1169,8 @@
#else
unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
#endif
- CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->tok,
+ vpx_calloc(tokens, sizeof(*cpi->tok)));
}
/* Data used for real time vc mode to see if gf needs refreshing */
@@ -1178,37 +1179,39 @@
/* Structures used to monitor GF usage */
vpx_free(cpi->gf_active_flags);
CHECK_MEM_ERROR(
- cpi->gf_active_flags,
+ &cpi->common.error, cpi->gf_active_flags,
vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
vpx_free(cpi->mb_activity_map);
CHECK_MEM_ERROR(
- cpi->mb_activity_map,
+ &cpi->common.error, cpi->mb_activity_map,
vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols));
/* allocate memory for storing last frame's MVs for MV prediction. */
vpx_free(cpi->lfmv);
- CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
- sizeof(*cpi->lfmv)));
+ CHECK_MEM_ERROR(
+ &cpi->common.error, cpi->lfmv,
+ vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv)));
vpx_free(cpi->lf_ref_frame_sign_bias);
- CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias,
vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
sizeof(*cpi->lf_ref_frame_sign_bias)));
vpx_free(cpi->lf_ref_frame);
- CHECK_MEM_ERROR(cpi->lf_ref_frame,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame,
vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
sizeof(*cpi->lf_ref_frame)));
/* Create the encoder segmentation map and set all entries to 0 */
vpx_free(cpi->segmentation_map);
CHECK_MEM_ERROR(
- cpi->segmentation_map,
+ &cpi->common.error, cpi->segmentation_map,
vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map)));
cpi->cyclic_refresh_mode_index = 0;
vpx_free(cpi->active_map);
- CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
- sizeof(*cpi->active_map)));
+ CHECK_MEM_ERROR(
+ &cpi->common.error, cpi->active_map,
+ vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map)));
memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols));
#if CONFIG_MULTITHREAD
@@ -1226,7 +1229,7 @@
int i;
vpx_free(cpi->mt_current_mb_col);
- CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->mt_current_mb_col,
vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
for (i = 0; i < cm->mb_rows; ++i)
vpx_atomic_init(&cpi->mt_current_mb_col[i], 0);
@@ -1235,7 +1238,8 @@
#endif
vpx_free(cpi->tplist);
- CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist,
+ vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1773,8 +1777,9 @@
cpi->common.error.setjmp = 1;
- CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site),
- (MAX_MVSEARCH_STEPS * 8) + 1));
+ CHECK_MEM_ERROR(
+ &cpi->common.error, cpi->mb.ss,
+ vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
vp8_create_common(&cpi->common);
@@ -1879,18 +1884,19 @@
}
if (cpi->cyclic_refresh_mode_enabled) {
- CHECK_MEM_ERROR(cpi->cyclic_refresh_map,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map,
vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
} else {
cpi->cyclic_refresh_map = (signed char *)NULL;
}
- CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
- sizeof(cpi->skin_map[0])));
+ CHECK_MEM_ERROR(
+ &cpi->common.error, cpi->skin_map,
+ vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0])));
- CHECK_MEM_ERROR(cpi->consec_zero_last,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last,
vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
- CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+ CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias,
vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
/*Initialize the feed-forward activity masking.*/
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 46a1791..bde5c2f 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -731,26 +731,6 @@
void vp8_set_speed_features(VP8_COMP *cpi);
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr) \
- do { \
- assert(cpi->common.error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval " at %s:%d", __FILE__, \
- __LINE__); \
- } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr) \
- do { \
- assert(cpi->common.error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval); \
- } while (0)
-#endif
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index a9d1f80..0821eef 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -911,12 +911,6 @@
}
}
- if (setjmp(ctx->cpi->common.error.jmp)) {
- ctx->cpi->common.error.setjmp = 0;
- vpx_clear_system_state();
- return VPX_CODEC_CORRUPT_FRAME;
- }
-
/* Initialize the encoder instance on the first frame*/
if (!res && ctx->cpi) {
unsigned int lib_flags;
@@ -927,6 +921,13 @@
unsigned char *cx_data_end;
int comp_data_state = 0;
+ if (setjmp(ctx->cpi->common.error.jmp)) {
+ ctx->cpi->common.error.setjmp = 0;
+ vpx_clear_system_state();
+ return VPX_CODEC_CORRUPT_FRAME;
+ }
+ ctx->cpi->common.error.setjmp = 1;
+
/* Set up internal flags */
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) {
((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
@@ -962,8 +963,6 @@
cx_data_end = ctx->cx_data + cx_data_sz;
lib_flags = 0;
- ctx->cpi->common.error.setjmp = 1;
-
while (cx_data_sz >= ctx->cx_data_sz / 2) {
comp_data_state = vp8_get_compressed_data(
ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp,
@@ -1059,6 +1058,7 @@
}
}
}
+ ctx->cpi->common.error.setjmp = 0;
}
return res;
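The reordering above narrows the setjmp() scope in the VP8 encode path: the jump buffer is armed only right before the code that can raise vpx_internal_error() (which longjmp()s back), and error.setjmp is cleared on both the error and the success paths so a stale buffer can never be taken on a later call. Condensed, the resulting control flow is (a sketch, not a verbatim excerpt):

  if (setjmp(ctx->cpi->common.error.jmp)) {
    ctx->cpi->common.error.setjmp = 0; /* disarm before bailing out */
    vpx_clear_system_state();
    return VPX_CODEC_CORRUPT_FRAME;
  }
  ctx->cpi->common.error.setjmp = 1; /* armed: vpx_internal_error() may longjmp */
  /* ... per-frame setup and the vp8_get_compressed_data() loop ... */
  ctx->cpi->common.error.setjmp = 0; /* disarm again on the success path */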
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 55a77ba..fdc0b35 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -310,6 +310,7 @@
VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
VP8_COMMON *const pc = &pbi->common;
if (setjmp(pbi->common.error.jmp)) {
+ pbi->common.error.setjmp = 0;
vp8_remove_decoder_instances(fb);
vp8_zero(fb->pbi);
vpx_clear_system_state();
@@ -494,6 +495,7 @@
/* get ready for the next series of fragments */
ctx->fragments.count = 0;
+ pbi->common.error.setjmp = 0;
}
return res;
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 8d2bed3..d63bad9 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -46,27 +46,6 @@
return num_values > 0 ? get_msb(num_values) + 1 : 0;
}
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(cm, lval, expr) \
- do { \
- assert(&(cm)->error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval " at %s:%d", __FILE__, \
- __LINE__); \
- } while (0)
-#else
-#define CHECK_MEM_ERROR(cm, lval, expr) \
- do { \
- assert(&(cm)->error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval); \
- } while (0)
-#endif
-
#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index ad44781..1c6ecc0 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -283,7 +283,7 @@
{
int i;
- CHECK_MEM_ERROR(cm, lf_sync->mutex,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->mutex,
vpx_malloc(sizeof(*lf_sync->mutex) * rows));
if (lf_sync->mutex) {
for (i = 0; i < rows; ++i) {
@@ -291,7 +291,7 @@
}
}
- CHECK_MEM_ERROR(cm, lf_sync->cond,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->cond,
vpx_malloc(sizeof(*lf_sync->cond) * rows));
if (lf_sync->cond) {
for (i = 0; i < rows; ++i) {
@@ -299,11 +299,11 @@
}
}
- CHECK_MEM_ERROR(cm, lf_sync->lf_mutex,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex,
vpx_malloc(sizeof(*lf_sync->lf_mutex)));
pthread_mutex_init(lf_sync->lf_mutex, NULL);
- CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex,
vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
if (lf_sync->recon_done_mutex) {
for (i = 0; i < rows; ++i) {
@@ -311,7 +311,7 @@
}
}
- CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond,
vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
if (lf_sync->recon_done_cond) {
for (i = 0; i < rows; ++i) {
@@ -321,15 +321,15 @@
}
#endif // CONFIG_MULTITHREAD
- CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata,
vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
lf_sync->num_workers = num_workers;
lf_sync->num_active_workers = lf_sync->num_workers;
- CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
- CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done,
vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
mi_cols_aligned_to_sb(cm->mi_rows) >>
MI_BLOCK_SIZE_LOG2));
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6eae41f..6ec1d9f 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1469,7 +1469,7 @@
vpx_free(cm->cur_frame->mvs);
cm->cur_frame->mi_rows = cm->mi_rows;
cm->cur_frame->mi_cols = cm->mi_cols;
- CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+ CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs,
(MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
sizeof(*cm->cur_frame->mvs)));
}
@@ -1776,7 +1776,8 @@
if (jobq_size > row_mt_worker_data->jobq_size) {
vpx_free(row_mt_worker_data->jobq_buf);
- CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size));
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf,
+ vpx_calloc(1, jobq_size));
vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
jobq_size);
row_mt_worker_data->jobq_size = jobq_size;
@@ -1923,7 +1924,7 @@
const int is_last_row = sb_rows - 1 == cur_sb_row;
int mi_col_start, mi_col_end;
if (!tile_data_recon)
- CHECK_MEM_ERROR(cm, tile_data_recon,
+ CHECK_MEM_ERROR(&cm->error, tile_data_recon,
vpx_memalign(32, sizeof(TileWorkerData)));
tile_data_recon->xd = pbi->mb;
@@ -2025,7 +2026,7 @@
if (cm->lf.filter_level && !cm->skip_loop_filter &&
pbi->lf_worker.data1 == NULL) {
- CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+ CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
pbi->lf_worker.hook = vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
@@ -2192,8 +2193,6 @@
volatile int mi_row = 0;
volatile int n = tile_data->buf_start;
- tile_data->error_info.setjmp = 1;
-
if (setjmp(tile_data->error_info.jmp)) {
tile_data->error_info.setjmp = 0;
tile_data->xd.corrupted = 1;
@@ -2206,6 +2205,7 @@
}
return 0;
}
+ tile_data->error_info.setjmp = 1;
tile_data->xd.corrupted = 0;
@@ -2285,7 +2285,7 @@
if (pbi->num_tile_workers == 0) {
const int num_threads = pbi->max_threads;
- CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ CHECK_MEM_ERROR(&cm->error, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
for (n = 0; n < num_threads; ++n) {
VPxWorker *const worker = &pbi->tile_workers[n];
@@ -2824,7 +2824,7 @@
const int num_jobs = sb_rows << cm->log2_tile_cols;
if (pbi->row_mt_worker_data == NULL) {
- CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
+ CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data,
vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
#if CONFIG_MULTITHREAD
pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL);
@@ -3006,7 +3006,8 @@
// platforms without DECLARE_ALIGNED().
assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
vpx_free(pbi->tile_worker_data);
- CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size));
+ CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data,
+ vpx_memalign(32, twd_size));
pbi->total_tiles = tile_rows * tile_cols;
}
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 92cd91f..5a7e9f9 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -66,7 +66,7 @@
{
int i;
CHECK_MEM_ERROR(
- cm, row_mt_worker_data->recon_sync_mutex,
+ &cm->error, row_mt_worker_data->recon_sync_mutex,
vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs));
if (row_mt_worker_data->recon_sync_mutex) {
for (i = 0; i < num_jobs; ++i) {
@@ -75,7 +75,7 @@
}
CHECK_MEM_ERROR(
- cm, row_mt_worker_data->recon_sync_cond,
+ &cm->error, row_mt_worker_data->recon_sync_cond,
vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs));
if (row_mt_worker_data->recon_sync_cond) {
for (i = 0; i < num_jobs; ++i) {
@@ -86,24 +86,24 @@
#endif
row_mt_worker_data->num_sbs = num_sbs;
for (plane = 0; plane < 3; ++plane) {
- CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane],
vpx_memalign(32, dqcoeff_size));
memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
- CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane],
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane],
vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
sizeof(*row_mt_worker_data->eob[plane])));
}
- CHECK_MEM_ERROR(cm, row_mt_worker_data->partition,
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition,
vpx_calloc(num_sbs * PARTITIONS_PER_SB,
sizeof(*row_mt_worker_data->partition)));
- CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map,
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map,
vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
// allocate memory for thread_data
if (row_mt_worker_data->thread_data == NULL) {
const size_t thread_size =
max_threads * sizeof(*row_mt_worker_data->thread_data);
- CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data,
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data,
vpx_memalign(32, thread_size));
}
}
@@ -181,9 +181,10 @@
cm->error.setjmp = 1;
- CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(&cm->error, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
CHECK_MEM_ERROR(
- cm, cm->frame_contexts,
+ &cm->error, cm->frame_contexts,
(FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
pbi->need_resync = 1;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 17c123a..ca56d14 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -967,13 +967,13 @@
int i;
const size_t worker_data_size =
cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
- CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data,
+ CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data,
vpx_memalign(16, worker_data_size));
memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
for (i = 1; i < cpi->num_workers; ++i) {
cpi->vp9_bitstream_worker_data[i].dest_size =
cpi->oxcf.width * cpi->oxcf.height;
- CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest,
+ CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest,
vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size));
}
}
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index b74b902..42073f7 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -25,16 +25,17 @@
int i, k;
ctx->num_4x4_blk = num_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t)));
+ CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk,
+ vpx_calloc(num_blk, sizeof(uint8_t)));
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
- CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
- CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
- CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
- CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k],
vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
@@ -100,10 +101,10 @@
int nodes;
vpx_free(td->leaf_tree);
- CHECK_MEM_ERROR(cm, td->leaf_tree,
+ CHECK_MEM_ERROR(&cm->error, td->leaf_tree,
vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
vpx_free(td->pc_tree);
- CHECK_MEM_ERROR(cm, td->pc_tree,
+ CHECK_MEM_ERROR(&cm->error, td->pc_tree,
vpx_calloc(tree_nodes, sizeof(*td->pc_tree)));
this_pc = &td->pc_tree[0];
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 77d7239..baea8eb 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -634,11 +634,11 @@
denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
denoiser->num_layers = num_layers;
- CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+ CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y,
vpx_calloc(denoiser->num_ref_frames * num_layers,
sizeof(denoiser->running_avg_y[0])));
CHECK_MEM_ERROR(
- cm, denoiser->mc_running_avg_y,
+ &cm->error, denoiser->mc_running_avg_y,
vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
for (layer = 0; layer < num_layers; ++layer) {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3a04239..a979ae1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1545,7 +1545,7 @@
}
if (low_res && threshold_4x4avg < INT64_MAX)
- CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2)));
+ CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2)));
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
// for splits.
for (i = 0; i < 4; i++) {
@@ -5783,7 +5783,7 @@
if (cm->last_width != cm->width || cm->last_height != cm->height) {
if (cpi->source_diff_var) vpx_free(cpi->source_diff_var);
- CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+ CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
}
@@ -5823,7 +5823,7 @@
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
CHECK_MEM_ERROR(
- cm, cpi->tile_data,
+ &cm->error, cpi->tile_data,
vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
cpi->allocated_tiles = tile_cols * tile_rows;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f76eec2..9d5c003 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -681,9 +681,10 @@
return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
}
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[8], int delta_lf[8],
- int skip[8], int ref_frame[8]) {
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[8], int delta_lf[8], int skip[8],
+ int ref_frame[8]) {
VP9_COMMON *cm = &cpi->common;
vpx_roi_map_t *roi = &cpi->roi;
const int range = 63;
@@ -694,13 +695,13 @@
// Check number of rows and columns match
if (frame_rows != (int)rows || frame_cols != (int)cols) {
- return -1;
+ return VPX_CODEC_INVALID_PARAM;
}
if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
!check_seg_range(ref_frame, ref_frame_range) ||
!check_seg_range(skip, skip_range))
- return -1;
+ return VPX_CODEC_INVALID_PARAM;
// Also disable segmentation if no deltas are specified.
if (!map ||
@@ -714,14 +715,15 @@
ref_frame[6] == -1 && ref_frame[7] == -1))) {
vp9_disable_segmentation(&cm->seg);
cpi->roi.enabled = 0;
- return 0;
+ return VPX_CODEC_OK;
}
if (roi->roi_map) {
vpx_free(roi->roi_map);
roi->roi_map = NULL;
}
- CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
+ roi->roi_map = vpx_malloc(rows * cols);
+ if (!roi->roi_map) return VPX_CODEC_MEM_ERROR;
// Copy to ROI structure in the compressor.
memcpy(roi->roi_map, map, rows * cols);
@@ -733,7 +735,7 @@
roi->rows = rows;
roi->cols = cols;
- return 0;
+ return VPX_CODEC_OK;
}
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
@@ -1374,7 +1376,7 @@
VP9_COMMON *cm = &cpi->common;
int mi_size = cm->mi_cols * cm->mi_rows;
- CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+ CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base,
vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
}
@@ -1393,14 +1395,14 @@
{
unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
- CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+ CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0],
vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
}
sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
vpx_free(cpi->tplist[0][0]);
CHECK_MEM_ERROR(
- cm, cpi->tplist[0][0],
+ &cm->error, cpi->tplist[0][0],
vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
vp9_setup_pc_tree(&cpi->common, &cpi->td);
@@ -1996,48 +1998,48 @@
// Create the encoder segmentation map and set all entries to 0
vpx_free(cpi->segmentation_map);
- CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// Create a map used for cyclic background refresh.
if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh);
- CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+ CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh,
vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
// Create a map used to mark inactive areas.
vpx_free(cpi->active_map.map);
- CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ CHECK_MEM_ERROR(&cm->error, cpi->active_map.map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// And a place holder structure is the coding context
// for use if we want to save and restore it
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
- CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+ CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
}
static void alloc_copy_partition_data(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (cpi->prev_partition == NULL) {
- CHECK_MEM_ERROR(cm, cpi->prev_partition,
+ CHECK_MEM_ERROR(&cm->error, cpi->prev_partition,
(BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
sizeof(*cpi->prev_partition)));
}
if (cpi->prev_segment_id == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->prev_segment_id,
+ &cm->error, cpi->prev_segment_id,
(int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->prev_segment_id)));
}
if (cpi->prev_variance_low == NULL) {
- CHECK_MEM_ERROR(cm, cpi->prev_variance_low,
+ CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low,
(uint8_t *)vpx_calloc(
(cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25,
sizeof(*cpi->prev_variance_low)));
}
if (cpi->copied_frame_cnt == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->copied_frame_cnt,
+ &cm->error, cpi->copied_frame_cnt,
(uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->copied_frame_cnt)));
}
@@ -2370,9 +2372,10 @@
cm->free_mi = vp9_enc_free_mi;
cm->setup_mi = vp9_enc_setup_mi;
- CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(&cm->error, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
CHECK_MEM_ERROR(
- cm, cm->frame_contexts,
+ &cm->error, cm->frame_contexts,
(FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
cpi->compute_frame_low_motion_onepass = 1;
@@ -2399,38 +2402,38 @@
realloc_segmentation_maps(cpi);
CHECK_MEM_ERROR(
- cm, cpi->skin_map,
+ &cm->error, cpi->skin_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
#if !CONFIG_REALTIME_ONLY
- CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+ CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
#endif
CHECK_MEM_ERROR(
- cm, cpi->consec_zero_mv,
+ &cm->error, cpi->consec_zero_mv,
vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
i++) {
CHECK_MEM_ERROR(
- cm, cpi->mbgraph_stats[i].mb_stats,
+ &cm->error, cpi->mbgraph_stats[i].mb_stats,
vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
}
@@ -2474,7 +2477,7 @@
}
if (cpi->b_calculate_consistency) {
- CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+ CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars,
vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
sizeof(*cpi->ssim_vars) * 4));
cpi->worst_consistency = 100.0;
@@ -2559,7 +2562,7 @@
vpx_free(lc->rc_twopass_stats_in.buf);
lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
- CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+ CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf,
vpx_malloc(lc->rc_twopass_stats_in.sz));
lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
lc->twopass.stats_in = lc->twopass.stats_in_start;
@@ -2614,7 +2617,7 @@
const int h = num_8x8_blocks_high_lookup[bsize];
const int num_cols = (cm->mi_cols + w - 1) / w;
const int num_rows = (cm->mi_rows + h - 1) / h;
- CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors,
+ CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors,
vpx_calloc(num_rows * num_cols,
sizeof(*cpi->mi_ssim_rdmult_scaling_factors)));
}
@@ -2625,11 +2628,10 @@
#endif // CONFIG_NON_GREEDY_MV
for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) {
cpi->tpl_stats[i].tpl_stats_ptr = NULL;
- cpi->tpl_frame_stats[i].block_stats_list = NULL;
}
// Allocate memory to store variances for a frame.
- CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+ CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
@@ -3752,7 +3754,7 @@
case 6: l = 150; break;
}
if (!cpi->common.postproc_state.limits) {
- CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits,
+ CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits,
vpx_calloc(cpi->un_scaled_source->y_width,
sizeof(*cpi->common.postproc_state.limits)));
}
@@ -4096,7 +4098,7 @@
svc->spatial_layer_id == svc->number_spatial_layers - 2) {
if (svc->prev_partition_svc == NULL) {
CHECK_MEM_ERROR(
- cm, svc->prev_partition_svc,
+ &cm->error, svc->prev_partition_svc,
(BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
sizeof(*svc->prev_partition_svc)));
}
@@ -4448,10 +4450,13 @@
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
- const FRAME_UPDATE_TYPE update_type =
- cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
- const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
- RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type];
+ RATE_QSTEP_MODEL *rq_model;
+ {
+ const FRAME_UPDATE_TYPE update_type =
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+ const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
+ rq_model = &cpi->rq_model[frame_type];
+ }
init_rq_history(rq_history);
#endif // CONFIG_RATE_CTRL
@@ -5295,7 +5300,7 @@
cpi->mb_wiener_variance = NULL;
CHECK_MEM_ERROR(
- cm, cpi->mb_wiener_variance,
+ &cm->error, cpi->mb_wiener_variance,
vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance)));
cpi->mb_wiener_var_rows = cm->mb_rows;
cpi->mb_wiener_var_cols = cm->mb_cols;
@@ -6542,7 +6547,7 @@
pthread_mutex_init(&cpi->kmeans_mutex, NULL);
#endif
CHECK_MEM_ERROR(
- cm, cpi->kmeans_data_arr,
+ &cm->error, cpi->kmeans_data_arr,
vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr)));
cpi->kmeans_data_stride = mi_cols;
cpi->kmeans_data_arr_alloc = 1;
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index cca1617..2e0c4db 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -18,6 +18,7 @@
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vpx_ext_ratectrl.h"
#include "vpx/vp8cx.h"
+#include "vpx/vpx_tpl.h"
#if CONFIG_INTERNAL_STATS
#include "vpx_dsp/ssim.h"
#endif
@@ -745,7 +746,7 @@
BLOCK_SIZE tpl_bsize;
TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];
// Used to store TPL stats before propagation
- VpxTplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE];
+ VpxTplGopStats tpl_gop_stats;
YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
EncFrameBuf enc_frame_buf[REF_FRAMES];
#if CONFIG_MULTITHREAD
@@ -1060,7 +1061,7 @@
VP9_COMMON *const cm = &cpi->common;
const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
- CHECK_MEM_ERROR(cm, cpi->partition_info,
+ CHECK_MEM_ERROR(&cm->error, cpi->partition_info,
(PARTITION_INFO *)vpx_calloc(unit_width * unit_height,
sizeof(PARTITION_INFO)));
memset(cpi->partition_info, 0,
@@ -1088,7 +1089,7 @@
VP9_COMMON *const cm = &cpi->common;
const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
- CHECK_MEM_ERROR(cm, cpi->motion_vector_info,
+ CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info,
(MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
sizeof(MOTION_VECTOR_INFO)));
memset(cpi->motion_vector_info, 0,
@@ -1107,7 +1108,7 @@
static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
CHECK_MEM_ERROR(
- cm, cpi->tpl_stats_info,
+ &cm->error, cpi->tpl_stats_info,
(TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats)));
memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats));
}
@@ -1126,7 +1127,7 @@
VP9_COMMON *const cm = &cpi->common;
const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width);
const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height);
- CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info,
+ CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info,
(MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
sizeof(MOTION_VECTOR_INFO)));
}
@@ -1457,9 +1458,10 @@
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[8], int delta_lf[8],
- int skip[8], int ref_frame[8]);
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[8], int delta_lf[8], int skip[8],
+ int ref_frame[8]);
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
@@ -1474,7 +1476,7 @@
if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
new_fb_ptr->mi_cols < cm->mi_cols) {
vpx_free(new_fb_ptr->mvs);
- CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
+ CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs,
(MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
sizeof(*new_fb_ptr->mvs)));
new_fb_ptr->mi_rows = cm->mi_rows;
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index 453fe2e..fadd233 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -94,10 +94,10 @@
vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
vp9_encode_free_mt_data(cpi);
- CHECK_MEM_ERROR(cm, cpi->workers,
+ CHECK_MEM_ERROR(&cm->error, cpi->workers,
vpx_malloc(num_workers * sizeof(*cpi->workers)));
- CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data,
vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
for (i = 0; i < num_workers; i++) {
@@ -111,7 +111,7 @@
thread_data->cpi = cpi;
// Allocate thread data.
- CHECK_MEM_ERROR(cm, thread_data->td,
+ CHECK_MEM_ERROR(&cm->error, thread_data->td,
vpx_memalign(32, sizeof(*thread_data->td)));
vp9_zero(*thread_data->td);
@@ -121,7 +121,7 @@
vp9_setup_pc_tree(cm, thread_data->td);
// Allocate frame counters in thread data.
- CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ CHECK_MEM_ERROR(&cm->error, thread_data->td->counts,
vpx_calloc(1, sizeof(*thread_data->td->counts)));
// Create threads
@@ -292,7 +292,7 @@
{
int i;
- CHECK_MEM_ERROR(cm, row_mt_sync->mutex,
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex,
vpx_malloc(sizeof(*row_mt_sync->mutex) * rows));
if (row_mt_sync->mutex) {
for (i = 0; i < rows; ++i) {
@@ -300,7 +300,7 @@
}
}
- CHECK_MEM_ERROR(cm, row_mt_sync->cond,
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond,
vpx_malloc(sizeof(*row_mt_sync->cond) * rows));
if (row_mt_sync->cond) {
for (i = 0; i < rows; ++i) {
@@ -310,7 +310,7 @@
}
#endif // CONFIG_MULTITHREAD
- CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col,
vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
// Set up nsync.
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 71d8775..8fdd976 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1422,7 +1422,7 @@
if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
CHECK_MEM_ERROR(
- cm, cpi->twopass.fp_mb_float_stats,
+ &cm->error, cpi->twopass.fp_mb_float_stats,
vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
{
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 9487fc5..fafc673 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -288,7 +288,7 @@
int *arf_not_zz;
CHECK_MEM_ERROR(
- cm, arf_not_zz,
+ &cm->error, arf_not_zz,
vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
// We are not interested in results beyond the alt ref itself.
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 64e9ef0..0ea0f85 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -297,7 +297,7 @@
besterr =
vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
} else {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
}
@@ -312,7 +312,7 @@
uint32_t besterr;
(void)xd;
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
@@ -635,7 +635,7 @@
vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
0, kernel, MV_PRECISION_Q3, 0, 0);
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
} else {
@@ -654,7 +654,7 @@
vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
0, kernel, MV_PRECISION_Q3, 0, 0);
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
} else {
diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c
index 45659f2..0843cd97 100644
--- a/vp9/encoder/vp9_multi_thread.c
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -59,7 +59,7 @@
int i;
CHECK_MEM_ERROR(
- cm, this_tile->row_base_thresh_freq_fact,
+ &cm->error, this_tile->row_base_thresh_freq_fact,
(int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
sizeof(*(this_tile->row_base_thresh_freq_fact))));
for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
@@ -85,7 +85,7 @@
multi_thread_ctxt->allocated_tile_rows = tile_rows;
multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
- CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue,
+ CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue,
(JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
#if CONFIG_MULTITHREAD
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f051c62..464705a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1937,10 +1937,10 @@
// Prediction buffer from second frame.
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+ DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]);
uint8_t *second_pred;
#else
- DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
#endif // CONFIG_VP9_HIGHBITDEPTH
// Check number of iterations do not exceed the max
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 60720e3..48c21c5 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -676,7 +676,7 @@
if (cpi->content_state_sb_fd == NULL &&
(!cpi->use_svc ||
svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
- CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd,
+ CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd,
(uint8_t *)vpx_calloc(
(cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(uint8_t)));
@@ -832,13 +832,13 @@
}
if (cpi->count_arf_frame_usage == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->count_arf_frame_usage,
+ &cm->error, cpi->count_arf_frame_usage,
(uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->count_arf_frame_usage)));
}
if (cpi->count_lastgolden_frame_usage == NULL)
CHECK_MEM_ERROR(
- cm, cpi->count_lastgolden_frame_usage,
+ &cm->error, cpi->count_lastgolden_frame_usage,
(uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->count_lastgolden_frame_usage)));
}
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index f08d668..e472127 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -163,17 +163,17 @@
lc->actual_num_seg1_blocks = 0;
lc->actual_num_seg2_blocks = 0;
lc->counter_encode_maxq_scene_change = 0;
- CHECK_MEM_ERROR(cm, lc->map,
+ CHECK_MEM_ERROR(&cm->error, lc->map,
vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
memset(lc->map, 0, mi_rows * mi_cols);
last_coded_q_map_size =
mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
- CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+ CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
vpx_malloc(last_coded_q_map_size));
assert(MAXQ <= 255);
memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
- CHECK_MEM_ERROR(cm, lc->consec_zero_mv,
+ CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
vpx_malloc(consec_zero_mv_size));
memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
}
diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c
index ed771dc..9f4bafd 100644
--- a/vp9/encoder/vp9_tpl_model.c
+++ b/vp9/encoder/vp9_tpl_model.c
@@ -154,17 +154,43 @@
int frame_idx;
for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- VpxTplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx];
memset(tpl_frame->tpl_stats_ptr, 0,
tpl_frame->height * tpl_frame->width *
sizeof(*tpl_frame->tpl_stats_ptr));
- memset(tpl_frame_stats->block_stats_list, 0,
- tpl_frame->height * tpl_frame->width *
- sizeof(*tpl_frame_stats->block_stats_list));
tpl_frame->is_valid = 0;
}
}
+static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) {
+ int frame_idx;
+ for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+ vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list);
+ }
+ vpx_free(tpl_gop_stats->frame_stats_list);
+}
+
+static void init_tpl_stats_before_propagation(
+ struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats,
+ TplDepFrame *tpl_stats, int tpl_gop_frames) {
+ int frame_idx;
+ free_tpl_frame_stats_list(tpl_gop_stats);
+ CHECK_MEM_ERROR(
+ error_info, tpl_gop_stats->frame_stats_list,
+ vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list)));
+ tpl_gop_stats->size = tpl_gop_frames;
+ for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) {
+ const int mi_rows = tpl_stats[frame_idx].height;
+ const int mi_cols = tpl_stats[frame_idx].width;
+ CHECK_MEM_ERROR(
+ error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list,
+ vpx_calloc(
+ mi_rows * mi_cols,
+ sizeof(
+ *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list)));
+ tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols;
+ }
+}
+
#if CONFIG_NON_GREEDY_MV
static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
MotionField *motion_field,
@@ -1106,7 +1132,7 @@
int frame_idx, BLOCK_SIZE bsize) {
TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
VpxTplFrameStats *tpl_frame_stats_before_propagation =
- &cpi->tpl_frame_stats[frame_idx];
+ &cpi->tpl_gop_stats.frame_stats_list[frame_idx];
YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
@@ -1320,7 +1346,7 @@
vpx_free(cpi->select_mv_arr);
CHECK_MEM_ERROR(
- cm, cpi->select_mv_arr,
+ &cm->error, cpi->select_mv_arr,
vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
#endif
@@ -1335,26 +1361,20 @@
for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
CHECK_MEM_ERROR(
- cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+ &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
vpx_calloc(mi_rows * mi_cols * 4,
sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
CHECK_MEM_ERROR(
- cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+ &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
vpx_calloc(mi_rows * mi_cols * 4,
sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
}
#endif
vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
- CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
+ CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
vpx_calloc(mi_rows * mi_cols,
sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
- vpx_free(cpi->tpl_frame_stats[frame].block_stats_list);
- CHECK_MEM_ERROR(
- cm, cpi->tpl_frame_stats[frame].block_stats_list,
- vpx_calloc(mi_rows * mi_cols,
- sizeof(*cpi->tpl_frame_stats[frame].block_stats_list)));
- cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols;
cpi->tpl_stats[frame].is_valid = 0;
cpi->tpl_stats[frame].width = mi_cols;
cpi->tpl_stats[frame].height = mi_rows;
@@ -1385,8 +1405,8 @@
#endif
vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
cpi->tpl_stats[frame].is_valid = 0;
- vpx_free(cpi->tpl_frame_stats[frame].block_stats_list);
}
+ free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
}
#if CONFIG_RATE_CTRL
@@ -1442,6 +1462,9 @@
init_tpl_stats(cpi);
+ init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
+ cpi->tpl_stats, tpl_group_frames);
+
// Backward propagation from tpl_group_frames to 1.
for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f067efd..409069b 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -29,6 +29,8 @@
#include "vp9/vp9_cx_iface.h"
#include "vp9/vp9_iface_common.h"
+#include "vpx/vpx_tpl.h"
+
typedef struct vp9_extracfg {
int cpu_used; // available cpu percentage in 1/16
unsigned int enable_auto_alt_ref;
@@ -815,6 +817,7 @@
assert(codec_err != VPX_CODEC_OK);
return codec_err;
}
+ ctx->cpi->common.error.setjmp = 1;
ctx->cfg = *cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
@@ -1633,13 +1636,9 @@
if (data) {
vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
- if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
- roi->delta_q, roi->delta_lf, roi->skip,
- roi->ref_frame)) {
- return VPX_CODEC_OK;
- }
- return VPX_CODEC_INVALID_PARAM;
+ return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+ roi->delta_q, roi->delta_lf, roi->skip,
+ roi->ref_frame);
}
return VPX_CODEC_INVALID_PARAM;
}
@@ -1793,16 +1792,16 @@
static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx,
va_list args) {
VP9_COMP *const cpi = ctx->cpi;
- VpxTplFrameStats *data = va_arg(args, VpxTplFrameStats *);
+ VpxTplGopStats *data = va_arg(args, VpxTplGopStats *);
+ VpxTplFrameStats *frame_stats_list = cpi->tpl_gop_stats.frame_stats_list;
int i;
if (data == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
- for (i = 0; i < MAX_ARF_GOP_SIZE; i++) {
- data[i].frame_width = cpi->tpl_frame_stats[i].frame_width;
- data[i].frame_height = cpi->tpl_frame_stats[i].frame_height;
- data[i].num_blocks = cpi->tpl_frame_stats[i].num_blocks;
- data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list;
+ data->size = cpi->tpl_gop_stats.size;
+
+ for (i = 0; i < data->size; i++) {
+ data->frame_stats_list[i] = frame_stats_list[i];
}
return VPX_CODEC_OK;
diff --git a/vpx/exports_com b/vpx/exports_com
index 2ab0509..f0b46aa 100644
--- a/vpx/exports_com
+++ b/vpx/exports_com
@@ -14,3 +14,6 @@
text vpx_img_free
text vpx_img_set_rect
text vpx_img_wrap
+text vpx_free_tpl_gop_stats
+text vpx_read_tpl_gop_stats
+text vpx_write_tpl_gop_stats
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 670fe38..aae3218 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -48,6 +48,8 @@
#include "../vpx_encoder.h"
#include <stdarg.h>
+#include "vpx_config.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -427,6 +429,27 @@
jmp_buf jmp;
};
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(error, lval, expr) \
+ do { \
+ assert((error)->setjmp); \
+ (lval) = (expr); \
+ if (!(lval)) \
+ vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \
+ "Failed to allocate " #lval " at %s:%d", __FILE__, \
+ __LINE__); \
+ } while (0)
+#else
+#define CHECK_MEM_ERROR(error, lval, expr) \
+ do { \
+ assert((error)->setjmp); \
+ (lval) = (expr); \
+ if (!(lval)) \
+ vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \
+ "Failed to allocate " #lval); \
+ } while (0)
+#endif
+
#define CLANG_ANALYZER_NORETURN
#if defined(__has_feature)
#if __has_feature(attribute_analyzer_noreturn)
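
Note: CHECK_MEM_ERROR now takes a struct vpx_internal_error_info * rather than a
VP9_COMMON *, which is why every call site above switches from cm to &cm->error.
A minimal sketch of an allocation site under the new signature (the helper
function and its parameters below are illustrative, not part of the patch):

    /* Assumes error_info->setjmp has been set and a longjmp target is active,
     * as the macro's assert() requires. */
    static void alloc_example(struct vpx_internal_error_info *error_info,
                              int16_t **buf, int n) {
      CHECK_MEM_ERROR(error_info, *buf, vpx_calloc(n, sizeof(**buf)));
      /* On failure, vpx_internal_error() reports VPX_CODEC_MEM_ERROR and does
       * not return here; on success, *buf points to n zeroed elements. */
    }
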
diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c
new file mode 100644
index 0000000..9cdb4a0
--- /dev/null
+++ b/vpx/src/vpx_tpl.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_tpl.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define CHECK_FPRINTF_ERROR(expr) \
+ do { \
+ if (expr < 0) { \
+ return VPX_CODEC_ERROR; \
+ } \
+ } while (0)
+
+#define CHECK_FSCANF_ERROR(expr, expected_value) \
+ do { \
+ if (expr != expected_value) { \
+ return VPX_CODEC_ERROR; \
+ } \
+ } while (0)
+
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+ const VpxTplGopStats *tpl_gop_stats) {
+ int i;
+ if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+ CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size));
+
+ for (i = 0; i < tpl_gop_stats->size; i++) {
+ VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i];
+ const int num_blocks = frame_stats.num_blocks;
+ int block;
+ CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width,
+ frame_stats.frame_height, num_blocks));
+ for (block = 0; block < num_blocks; block++) {
+ VpxTplBlockStats block_stats = frame_stats.block_stats_list[block];
+ CHECK_FPRINTF_ERROR(
+ fprintf(tpl_file,
+ "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
+ " %" PRId64 " %d\n",
+ block_stats.inter_cost, block_stats.intra_cost,
+ block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
+ block_stats.recrf_rate, block_stats.ref_frame_index));
+ }
+ }
+
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+ VpxTplGopStats *tpl_gop_stats) {
+ int i, frame_list_size;
+ if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+ CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1);
+ tpl_gop_stats->size = frame_list_size;
+ tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc(
+ frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0]));
+ if (tpl_gop_stats->frame_stats_list == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+ for (i = 0; i < frame_list_size; i++) {
+ VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+ int num_blocks, width, height, block;
+ CHECK_FSCANF_ERROR(
+ fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3);
+ frame_stats->num_blocks = num_blocks;
+ frame_stats->frame_width = width;
+ frame_stats->frame_height = height;
+ frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc(
+ num_blocks, sizeof(frame_stats->block_stats_list[0]));
+ if (frame_stats->block_stats_list == NULL) {
+ vpx_free_tpl_gop_stats(tpl_gop_stats);
+ return VPX_CODEC_MEM_ERROR;
+ }
+ for (block = 0; block < num_blocks; block++) {
+ VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block];
+ CHECK_FSCANF_ERROR(
+ fscanf(tpl_file,
+ "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64
+ " %" SCNd64 " %d\n",
+ &block_stats->inter_cost, &block_stats->intra_cost,
+ &block_stats->mv_c, &block_stats->mv_r,
+ &block_stats->recrf_dist, &block_stats->recrf_rate,
+ &block_stats->ref_frame_index),
+ 7);
+ }
+ }
+
+ return VPX_CODEC_OK;
+}
+
+void vpx_free_tpl_gop_stats(VpxTplGopStats *data) {
+ int frame;
+ if (data == NULL) return;
+ for (frame = 0; frame < data->size; frame++) {
+ vpx_free(data->frame_stats_list[frame].block_stats_list);
+ }
+ vpx_free(data->frame_stats_list);
+}
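
Note: the new vpx/src/vpx_tpl.c stores one VpxTplGopStats per GOP as plain
text: a GOP-size line, then for each frame a width/height/num_blocks line
followed by num_blocks rows of block stats. A minimal round-trip sketch against
this API (the file name and the reduced error handling are illustrative):

    #include <stdio.h>
    #include "vpx/vpx_codec.h"
    #include "vpx/vpx_tpl.h"

    static vpx_codec_err_t dump_and_reload(const VpxTplGopStats *stats) {
      VpxTplGopStats reloaded;
      FILE *f = fopen("tpl_stats.txt", "w");
      if (f == NULL) return VPX_CODEC_ERROR;
      if (vpx_write_tpl_gop_stats(f, stats) != VPX_CODEC_OK) {
        fclose(f);
        return VPX_CODEC_ERROR;
      }
      fclose(f);

      f = fopen("tpl_stats.txt", "r");
      if (f == NULL) return VPX_CODEC_ERROR;
      if (vpx_read_tpl_gop_stats(f, &reloaded) != VPX_CODEC_OK) {
        fclose(f);
        return VPX_CODEC_ERROR;
      }
      fclose(f);
      /* vpx_read_tpl_gop_stats() allocates frame_stats_list and each
       * block_stats_list; release them when done. */
      vpx_free_tpl_gop_stats(&reloaded);
      return VPX_CODEC_OK;
    }
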
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index de86579..25c815e 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -27,6 +27,7 @@
API_DOC_SRCS-yes += vpx_ext_ratectrl.h
API_DOC_SRCS-yes += vpx_frame_buffer.h
API_DOC_SRCS-yes += vpx_image.h
+API_DOC_SRCS-yes += vpx_tpl.h
API_SRCS-yes += src/vpx_decoder.c
API_SRCS-yes += vpx_decoder.h
@@ -36,9 +37,11 @@
API_SRCS-yes += internal/vpx_ratectrl_rtc.h
API_SRCS-yes += src/vpx_codec.c
API_SRCS-yes += src/vpx_image.c
+API_SRCS-yes += src/vpx_tpl.c
API_SRCS-yes += vpx_codec.h
API_SRCS-yes += vpx_codec.mk
API_SRCS-yes += vpx_frame_buffer.h
API_SRCS-yes += vpx_image.h
API_SRCS-yes += vpx_integer.h
API_SRCS-yes += vpx_ext_ratectrl.h
+API_SRCS-yes += vpx_tpl.h
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 2de8089..c45d1a2 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -31,6 +31,7 @@
#include "./vpx_codec.h"
#include "./vpx_ext_ratectrl.h"
+#include "./vpx_tpl.h"
/*! Temporal Scalability: Maximum length of the sequence defining frame
* layer membership
@@ -57,9 +58,9 @@
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_ENCODER_ABI_VERSION \
- (16 + VPX_CODEC_ABI_VERSION + \
- VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION \
+ (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
+ VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
@@ -252,25 +253,6 @@
VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
};
-/*!\brief Temporal dependency model stats for each block before propagation */
-typedef struct VpxTplBlockStats {
- int64_t intra_cost; /**< Intra cost */
- int64_t inter_cost; /**< Inter cost */
- int16_t mv_r; /**< Motion vector row */
- int16_t mv_c; /**< Motion vector col */
- int64_t recrf_rate; /**< Rate from reconstructed ref frame */
- int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
- int ref_frame_index; /**< Ref frame index */
-} VpxTplBlockStats;
-
-/*!\brief Temporal dependency model stats for each frame before propagation */
-typedef struct VpxTplFrameStats {
- int frame_width; /**< Frame width */
- int frame_height; /**< Frame height */
- int num_blocks; /**< Number of blocks. Size of block_stats_list */
- VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
-} VpxTplFrameStats;
-
/*!\brief Encoded Frame Flags
*
* This type indicates a bitfield to be passed to vpx_codec_encode(), defining
diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h
new file mode 100644
index 0000000..50aec49
--- /dev/null
+++ b/vpx/vpx_tpl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\file
+ * \brief Describes the TPL stats descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_VPX_TPL_H_
+#define VPX_VPX_VPX_TPL_H_
+
+#include <stdio.h>
+
+#include "./vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/
+
+/*!\brief Temporal dependency model stats for each block before propagation */
+typedef struct VpxTplBlockStats {
+ int64_t intra_cost; /**< Intra cost */
+ int64_t inter_cost; /**< Inter cost */
+ int16_t mv_r; /**< Motion vector row */
+ int16_t mv_c; /**< Motion vector col */
+ int64_t recrf_rate; /**< Rate from reconstructed ref frame */
+ int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
+ int ref_frame_index; /**< Ref frame index */
+} VpxTplBlockStats;
+
+/*!\brief Temporal dependency model stats for each frame before propagation */
+typedef struct VpxTplFrameStats {
+ int frame_width; /**< Frame width */
+ int frame_height; /**< Frame height */
+ int num_blocks; /**< Number of blocks. Size of block_stats_list */
+ VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
+} VpxTplFrameStats;
+
+/*!\brief Temporal dependency model stats for each GOP before propagation */
+typedef struct VpxTplGopStats {
+ int size; /**< GOP size, also the size of frame_stats_list. */
+ VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */
+} VpxTplGopStats;
+
+/*!\brief Write VpxTplGopStats to file
+ *
+ * Accepts an opened file handle and writes \p tpl_gop_stats.
+ *
+ * \param[in] tpl_file A FILE pointer that's already been opened.
+ * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the
+ * whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully written.
+ */
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+ const VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Read VpxTplGopStats from file
+ *
+ * Accepts an opened file handle and reads TPL stats and stores them into
+ * \p tpl_gop_stats. Allocates memory for TPL stats.
+ *
+ * \param[in] tpl_file A FILE pointer that's already been opened.
+ * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the
+ * whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully read from file.
+ */
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+ VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Free the memory allocated for VpxTplGopStats
+ *
+ * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the
+ * whole GOP.
+ */
+void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_VPX_TPL_H_
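
Note: as the updated ctrl_get_tpl_stats() above shows, the encoder fills a
caller-provided VpxTplGopStats: the caller must allocate frame_stats_list with
enough entries for the GOP (MAX_ARF_GOP_SIZE is sufficient), and the copied
frame entries are shallow, so block_stats_list still points at encoder-owned
memory. A sketch of the caller side follows; the control id VP9E_GET_TPL_STATS
is assumed from vp8cx.h and is not defined by this patch.

    #include <stdio.h>
    #include <stdlib.h>
    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    /* Assumes encoder is an initialized VP9 encoder and max_gop_frames is at
     * least the encoder's largest GOP size (MAX_ARF_GOP_SIZE). */
    static void print_tpl_block_counts(vpx_codec_ctx_t *encoder,
                                       int max_gop_frames) {
      VpxTplGopStats gop_stats;
      int i;
      gop_stats.frame_stats_list = (VpxTplFrameStats *)calloc(
          max_gop_frames, sizeof(*gop_stats.frame_stats_list));
      if (gop_stats.frame_stats_list == NULL) return;
      if (vpx_codec_control(encoder, VP9E_GET_TPL_STATS, &gop_stats) ==
          VPX_CODEC_OK) {
        for (i = 0; i < gop_stats.size; ++i)
          printf("frame %d: %d blocks\n", i,
                 gop_stats.frame_stats_list[i].num_blocks);
      }
      /* Do not call vpx_free_tpl_gop_stats() here: block_stats_list aliases
       * encoder memory and frame_stats_list was allocated by the caller. */
      free(gop_stats.frame_stats_list);
    }
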
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 1a20da7..586bfb8 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -263,6 +263,16 @@
vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
+static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+}
+
static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2, uint8x8_t *const s3) {
@@ -287,6 +297,16 @@
vst1_u8(s, s3);
}
+static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+}
+
static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3) {
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index b312cc7..505d067 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -17,6 +17,7 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
// Note:
@@ -56,17 +57,18 @@
#if defined(__ARM_FEATURE_MATMUL_INT8)
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
uint8x16_t s0, s1, s2, s3;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
+ assert(h % 4 == 3);
(void)x_step_q4;
(void)y0_q4;
@@ -75,22 +77,19 @@
src -= 3;
if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int32x4_t t0, t1, t2, t3;
- int16x8_t t01, t23;
- uint8x8_t d01, d23;
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+ do {
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filters, permute_tbl);
- t1 = convolve8_4_usdot(s1, filters, permute_tbl);
- t2 = convolve8_4_usdot(s2, filters, permute_tbl);
- t3 = convolve8_4_usdot(s3, filters, permute_tbl);
- t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
- t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
- d01 = vqrshrun_n_s16(t01, 7);
- d23 = vqrshrun_n_s16(t23, 7);
+ d0 = convolve8_4_usdot(s0, filters, perm_tbl);
+ d1 = convolve8_4_usdot(s1, filters, perm_tbl);
+ d2 = convolve8_4_usdot(s2, filters, perm_tbl);
+ d3 = convolve8_4_usdot(s3, filters, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -98,9 +97,22 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h > 3);
+
+ /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+ * further details on possible values of block height. */
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ d0 = convolve8_4_usdot(s0, filters, perm_tbl);
+ d1 = convolve8_4_usdot(s1, filters, perm_tbl);
+ d2 = convolve8_4_usdot(s2, filters, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
} else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
const uint8_t *s;
uint8_t *d;
int width;
@@ -113,10 +125,10 @@
do {
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filters, permute_tbl);
- d1 = convolve8_8_usdot(s1, filters, permute_tbl);
- d2 = convolve8_8_usdot(s2, filters, permute_tbl);
- d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+ d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filters, perm_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -127,7 +139,98 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h > 3);
+
+ /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+ * further details on possible values of block height. */
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, perm_tbl);
+ t1 = convolve8_4_usdot(s1, filters, perm_tbl);
+ t2 = convolve8_4_usdot(s2, filters, perm_tbl);
+ t3 = convolve8_4_usdot(s3, filters, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
}
}
@@ -139,8 +242,8 @@
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
uint8x16_t s0, s1, s2, s3;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
(void)x_step_q4;
@@ -150,24 +253,19 @@
src -= 3;
if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- int32x4_t t0, t1, t2, t3;
- int16x8_t t01, t23;
+ int16x4_t t0, t1, t2, t3;
uint8x8_t d01, d23, dd01, dd23;
- dd01 = vdup_n_u8(0);
- dd23 = vdup_n_u8(0);
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filters, permute_tbl);
- t1 = convolve8_4_usdot(s1, filters, permute_tbl);
- t2 = convolve8_4_usdot(s2, filters, permute_tbl);
- t3 = convolve8_4_usdot(s3, filters, permute_tbl);
- t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
- t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
- d01 = vqrshrun_n_s16(t01, 7);
- d23 = vqrshrun_n_s16(t23, 7);
+ t0 = convolve8_4_usdot(s0, filters, perm_tbl);
+ t1 = convolve8_4_usdot(s1, filters, perm_tbl);
+ t2 = convolve8_4_usdot(s2, filters, perm_tbl);
+ t3 = convolve8_4_usdot(s3, filters, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -181,9 +279,9 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
const uint8_t *s;
uint8_t *d;
int width;
@@ -196,10 +294,10 @@
do {
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filters, permute_tbl);
- d1 = convolve8_8_usdot(s1, filters, permute_tbl);
- d2 = convolve8_8_usdot(s2, filters, permute_tbl);
- d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+ d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filters, perm_tbl);
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -213,11 +311,11 @@
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
+ } while (width != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
}
}
@@ -275,8 +373,8 @@
uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
uint8x16x2_t samples_LUT;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
(void)x0_q4;
@@ -288,7 +386,7 @@
if (w == 4) {
const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int32x4_t d0, d1, d2, d3;
+ int16x4_t d0, d1, d2, d3;
uint8x8_t d01, d23;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -325,8 +423,8 @@
d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -341,7 +439,7 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -426,11 +524,11 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src += 8;
dst += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
@@ -444,8 +542,8 @@
uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
uint8x16x2_t samples_LUT;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
(void)x0_q4;
@@ -457,7 +555,7 @@
if (w == 4) {
const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int32x4_t d0, d1, d2, d3;
+ int16x4_t d0, d1, d2, d3;
uint8x8_t d01, d23, dd01, dd23;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -494,8 +592,8 @@
d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -516,7 +614,7 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -608,30 +706,31 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src += 8;
dst += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
#else // !defined(__ARM_FEATURE_MATMUL_INT8)
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
const uint8x16_t range_limit = vdupq_n_u8(128);
uint8x16_t s0, s1, s2, s3;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
+ assert(h % 4 == 3);
(void)x_step_q4;
(void)y0_q4;
@@ -640,22 +739,19 @@
src -= 3;
if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int32x4_t t0, t1, t2, t3;
- int16x8_t t01, t23;
- uint8x8_t d01, d23;
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+ do {
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
- t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
- t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
- d01 = vqrshrun_n_s16(t01, 7);
- d23 = vqrshrun_n_s16(t23, 7);
+ d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+ d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+ d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+ d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -663,9 +759,22 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h > 3);
+
+ /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+ * further details on possible values of block height. */
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+ d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+ d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
} else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
const uint8_t *s;
uint8_t *d;
int width;
@@ -678,25 +787,115 @@
do {
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 =
- convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
- d1 =
- convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
- d2 =
- convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
- d3 =
- convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+ d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
+ } while (width != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h > 3);
+
+ /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+ * further details on possible values of block height. */
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
}
}
@@ -711,8 +910,8 @@
const uint8x16_t range_limit = vdupq_n_u8(128);
uint8x16_t s0, s1, s2, s3;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
(void)x_step_q4;
@@ -722,24 +921,19 @@
src -= 3;
if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- int32x4_t t0, t1, t2, t3;
- int16x8_t t01, t23;
+ int16x4_t t0, t1, t2, t3;
uint8x8_t d01, d23, dd01, dd23;
- dd01 = vdup_n_u8(0);
- dd23 = vdup_n_u8(0);
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
- t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
- t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
- d01 = vqrshrun_n_s16(t01, 7);
- d23 = vqrshrun_n_s16(t23, 7);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -753,9 +947,9 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
const uint8_t *s;
uint8_t *d;
int width;
@@ -768,14 +962,10 @@
do {
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 =
- convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
- d1 =
- convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
- d2 =
- convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
- d3 =
- convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+ d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -789,11 +979,11 @@
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
+ } while (width != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
}
}
@@ -854,8 +1044,8 @@
int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
int8x16x2_t samples_LUT;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
(void)x0_q4;
@@ -867,7 +1057,7 @@
if (w == 4) {
const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int32x4_t d0, d1, d2, d3;
+ int16x4_t d0, d1, d2, d3;
uint8x8_t d01, d23;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@@ -919,8 +1109,8 @@
d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -935,7 +1125,7 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -1035,11 +1225,11 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src += 8;
dst += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
@@ -1057,8 +1247,8 @@
int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
int8x16x2_t samples_LUT;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
(void)x0_q4;
@@ -1070,7 +1260,7 @@
if (w == 4) {
const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int32x4_t d0, d1, d2, d3;
+ int16x4_t d0, d1, d2, d3;
uint8x8_t d01, d23, dd01, dd23;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@@ -1122,8 +1312,8 @@
d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -1144,7 +1334,7 @@
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -1251,11 +1441,11 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src += 8;
dst += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
@@ -1273,8 +1463,8 @@
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
uint8x8_t t0, t1, t2, t3;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
(void)x_step_q4;
@@ -1286,25 +1476,22 @@
if (h == 4) {
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t tt0, tt1, tt2, tt3;
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
+
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s0 = vget_low_s16(tt0);
- s1 = vget_low_s16(tt1);
- s2 = vget_low_s16(tt2);
- s3 = vget_low_s16(tt3);
- s4 = vget_high_s16(tt0);
- s5 = vget_high_s16(tt1);
- s6 = vget_high_s16(tt2);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(dst + 1 * dst_stride);
__builtin_prefetch(dst + 2 * dst_stride);
@@ -1314,32 +1501,22 @@
do {
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s7 = vget_low_s16(tt0);
- s8 = vget_low_s16(tt1);
- s9 = vget_low_s16(tt2);
- s10 = vget_low_s16(tt3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
transpose_u8_4x4(&d01, &d23);
- vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
- vreinterpret_u32_u8(d01), 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
- vreinterpret_u32_u8(d23), 0);
- vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
- vreinterpret_u32_u8(d01), 1);
- vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
- vreinterpret_u32_u8(d23), 1);
+ store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+ store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
s0 = s4;
s1 = s5;
@@ -1355,7 +1532,7 @@
} else {
int width;
const uint8_t *s;
- uint8x8_t t4, t5, t6, t7;
+ uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
if (w == 4) {
@@ -1395,32 +1572,24 @@
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
- dst += dst_stride;
+ transpose_u8_8x4(&d04, &d15, &d26, &d37);
+
+ store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+ store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+ store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+ store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+
+ dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
} else {
uint8_t *d;
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
int16x8_t s11, s12, s13, s14;
do {
@@ -1466,17 +1635,18 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+ transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
@@ -1505,8 +1675,8 @@
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
uint8x8_t t0, t1, t2, t3;
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
(void)x_step_q4;
@@ -1516,10 +1686,8 @@
src -= 3;
if (h == 4) {
- uint8x8_t d01, d23;
+ uint8x8_t d01, d23, dd01, dd23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t tt0, tt1, tt2, tt3;
- uint32x4_t d0123 = vdupq_n_u32(0);
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -1527,17 +1695,14 @@
__builtin_prefetch(src + 3 * src_stride);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s0 = vget_low_s16(tt0);
- s1 = vget_low_s16(tt1);
- s2 = vget_low_s16(tt2);
- s3 = vget_low_s16(tt3);
- s4 = vget_high_s16(tt0);
- s5 = vget_high_s16(tt1);
- s6 = vget_high_s16(tt2);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(dst + 1 * dst_stride);
__builtin_prefetch(dst + 2 * dst_stride);
@@ -1547,35 +1712,28 @@
do {
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s7 = vget_low_s16(tt0);
- s8 = vget_low_s16(tt1);
- s9 = vget_low_s16(tt2);
- s10 = vget_low_s16(tt3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
transpose_u8_4x4(&d01, &d23);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
- d0123 = vreinterpretq_u32_u8(
- vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+ dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+ dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
- vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
- vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
- vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
- vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+ store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
s0 = s4;
s1 = s5;
@@ -1595,8 +1753,8 @@
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
if (w == 4) {
- uint32x4_t d0415 = vdupq_n_u32(0);
- uint32x4_t d2637 = vdupq_n_u32(0);
+ uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
+
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1633,48 +1791,35 @@
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&d04, &d15, &d26, &d37);
- d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
- d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
- d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
- d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
- d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
- d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
- d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
- d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
- d0415 = vreinterpretq_u32_u8(
- vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
- d2637 = vreinterpretq_u32_u8(
- vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));
+ dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+ dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+ dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+ dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
- vst1q_lane_u32((uint32_t *)dst, d0415, 0);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d0415, 2);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d2637, 0);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d2637, 2);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d0415, 1);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d0415, 3);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d2637, 1);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d2637, 3);
- dst += dst_stride;
+ d04 = vrhadd_u8(d04, dd04);
+ d15 = vrhadd_u8(d15, dd15);
+ d26 = vrhadd_u8(d26, dd26);
+ d37 = vrhadd_u8(d37, dd37);
+
+ store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+ store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+ store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+ store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+
+ dst += 8 * dst_stride;
h -= 8;
- } while (h > 0);
+ } while (h != 0);
} else {
uint8_t *d;
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
int16x8_t s11, s12, s13, s14;
- uint8x16_t d01, d23, d45, d67;
do {
__builtin_prefetch(src + 0 * src_stride);
@@ -1719,33 +1864,27 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
- vld1_u8(d + 1 * dst_stride));
- d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
- vld1_u8(d + 3 * dst_stride));
- d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
- vld1_u8(d + 5 * dst_stride));
- d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
- vld1_u8(d + 7 * dst_stride));
- d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
- d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
- d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
- d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
+ d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+ d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+ d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+ d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+ d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride));
+ d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride));
+ d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride));
+ d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride));
- store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
- vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
- vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
@@ -1761,7 +1900,7 @@
src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
- } while (h > 0);
+ } while (h != 0);
}
}
}
@@ -1773,8 +1912,8 @@
int h) {
const int16x8_t filters = vld1q_s16(filter[y0_q4]);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
(void)x0_q4;
@@ -1784,33 +1923,26 @@
src -= 3 * src_stride;
if (w == 4) {
- uint8x8_t d01, d23;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ src += 7 * src_stride;
do {
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(dst + 1 * dst_stride);
@@ -1820,21 +1952,16 @@
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
+
d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
s0 = s4;
s1 = s5;
@@ -1843,13 +1970,15 @@
s4 = s8;
s5 = s9;
s6 = s10;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
int height;
const uint8_t *s;
uint8_t *d;
- uint8x8_t t0, t1, t2, t3;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
do {
@@ -1860,33 +1989,26 @@
__builtin_prefetch(src + 4 * src_stride);
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s = src + 7 * src_stride;
d = dst;
height = h;
do {
- s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
@@ -1896,19 +2018,13 @@
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- vst1_u8(d, t0);
- d += dst_stride;
- vst1_u8(d, t1);
- d += dst_stride;
- vst1_u8(d, t2);
- d += dst_stride;
- vst1_u8(d, t3);
- d += dst_stride;
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -1917,6 +2033,8 @@
s4 = s8;
s5 = s9;
s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src += 8;
@@ -1933,8 +2051,8 @@
int h) {
const int16x8_t filters = vld1q_s16(filter[y0_q4]);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
(void)x0_q4;
@@ -1944,34 +2062,26 @@
src -= 3 * src_stride;
if (w == 4) {
- uint8x8_t d01, d23;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- uint32x4_t d0123 = vdupq_n_u32(0);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ src += 7 * src_stride;
do {
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(dst + 1 * dst_stride);
@@ -1981,29 +2091,22 @@
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
+
d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
- d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
- d0123 = vreinterpretq_u32_u8(
- vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
- vst1q_lane_u32((uint32_t *)dst, d0123, 0);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d0123, 1);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d0123, 2);
- dst += dst_stride;
- vst1q_lane_u32((uint32_t *)dst, d0123, 3);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
s0 = s4;
s1 = s5;
@@ -2012,14 +2115,15 @@
s4 = s8;
s5 = s9;
s6 = s10;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
int height;
const uint8_t *s;
uint8_t *d;
- uint8x8_t t0, t1, t2, t3;
- uint8x16_t d01, d23, dd01, dd23;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
do {
@@ -2030,33 +2134,26 @@
__builtin_prefetch(src + 4 * src_stride);
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s = src + 7 * src_stride;
d = dst;
height = h;
do {
- s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
@@ -2066,28 +2163,18 @@
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vcombine_u8(t0, t1);
- d23 = vcombine_u8(t2, t3);
- dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
- vld1_u8(d + 1 * dst_stride));
- dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
- vld1_u8(d + 3 * dst_stride));
- dd01 = vrhaddq_u8(dd01, d01);
- dd23 = vrhaddq_u8(dd23, d23);
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- vst1_u8(d, vget_low_u8(dd01));
- d += dst_stride;
- vst1_u8(d, vget_high_u8(dd01));
- d += dst_stride;
- vst1_u8(d, vget_low_u8(dd23));
- d += dst_stride;
- vst1_u8(d, vget_high_u8(dd23));
- d += dst_stride;
+ d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+ d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+ d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+ d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -2097,6 +2184,8 @@
s5 = s9;
s6 = s10;
height -= 4;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
} while (height != 0);
src += 8;
dst += 8;
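
Note on the helpers used above: the rewritten 4-pixel-wide paths replace the per-lane vst1_lane_u32()/vld1q_lane_u32() sequences with the load_u8()/store_u8() style helpers from vpx_dsp/arm/mem_neon.h, which move two 4-byte rows per uint8x8_t. As a rough, stand-alone sketch of what such a helper does (hypothetical names, not the project's implementation):

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch only: pack two 4-pixel rows, stride apart, into one 8-lane vector. */
    static inline uint8x8_t load_u8_4x2_sketch(const uint8_t *buf,
                                               ptrdiff_t stride) {
      uint32_t lo, hi;
      memcpy(&lo, buf, 4);          /* row 0 */
      memcpy(&hi, buf + stride, 4); /* row 1 */
      return vreinterpret_u8_u32(vset_lane_u32(hi, vdup_n_u32(lo), 1));
    }

    /* Sketch only: write the two packed rows back out. */
    static inline void store_u8_4x2_sketch(uint8_t *buf, ptrdiff_t stride,
                                           uint8x8_t a) {
      const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
      const uint32_t lo = vget_lane_u32(a_u32, 0);
      const uint32_t hi = vget_lane_u32(a_u32, 1);
      memcpy(buf, &lo, 4);
      memcpy(buf + stride, &hi, 4);
    }

Working on row pairs this way is what lets the averaging paths above use a single vrhadd_u8() per pair of 4-pixel rows instead of widening to 128-bit registers and calling vrhaddq_u8().
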
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index 07cf824..2f78583 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -15,10 +15,20 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#if VPX_ARCH_AARCH64 && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h);
+#endif
#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
const int8x16_t samples_hi,
const int32x4_t correction,
const int8x8_t filters) {
@@ -29,11 +39,11 @@
sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
-static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
const int8x8_t filters,
const int32x4_t correction,
const uint8x16_t range_limit,
@@ -54,8 +64,8 @@
sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
@@ -78,7 +88,7 @@
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
@@ -111,14 +121,14 @@
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
const uint8x16_t samples_hi,
const int8x8_t filters) {
/* Sample permutation is performed by the caller. */
@@ -127,11 +137,11 @@
sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
-static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
const int8x8_t filters,
const uint8x16x2_t permute_tbl) {
uint8x16_t permuted_samples[2];
@@ -147,8 +157,8 @@
sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
@@ -169,7 +179,7 @@
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
@@ -196,7 +206,7 @@
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
@@ -238,7 +248,7 @@
sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
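
The recurring s/7/FILTER_BITS/ substitution in this header (and in the .c hunks above) ties the final rounding shift to the filter precision constant pulled in via vpx_dsp/vpx_filter.h rather than a magic number; the shift amount itself is unchanged. Per 16-bit lane, vqrshrun_n_s16(sum, FILTER_BITS) behaves like this scalar sketch (illustration only):

    #include <stdint.h>

    /* Rounding right shift by FILTER_BITS (7) with unsigned saturation to 8 bits. */
    static uint8_t round_shift_saturate_sketch(int16_t sum) {
      const int filter_bits = 7; /* FILTER_BITS */
      const int rounded = (sum + (1 << (filter_bits - 1))) >> filter_bits;
      if (rounded < 0) return 0;
      if (rounded > 255) return 255;
      return (uint8_t)rounded;
    }
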
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 830f317..f7db3e6 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -14,6 +14,57 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
+#if VPX_ARCH_AARCH64 && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+ * maximum buffer size to 64 * (64 + 7). */
+ uint8_t temp[64 * 71];
+
+ /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+ const int intermediate_height = h + 7;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ /* Filter starting 3 lines back. */
+ vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ uint8_t temp[64 * 71];
+ const int intermediate_height = h + 7;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height);
+
+ vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+#else // !(VPX_ARCH_AARCH64 &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8)))
+
void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
@@ -63,3 +114,7 @@
vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h);
}
+
+#endif // #if VPX_ARCH_AARCH64 &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8))
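
The 64 * 71 temp buffer in the wrappers above follows directly from the stated constraints (w <= 64, h <= 64, 8 taps, i.e. 3 rows above plus 4 rows below each output row). Illustrative arithmetic only:

    #include <assert.h>

    int main(void) {
      const int max_w = 64, max_h = 64, taps = 8;
      const int intermediate_height = max_h + taps - 1; /* 64 + 7 = 71 rows */
      assert(max_w * intermediate_height == 64 * 71);    /* 4544 bytes */
      return 0;
    }
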
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 619d7aa..2a4c81d 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -40,7 +40,7 @@
unsigned int vpx_sad##m##x##n##_avg_c( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \
vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
return sad(src_ptr, src_stride, comp_pred, m, m, n); \
} \
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index ce1e838..a6793ef 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -156,7 +156,7 @@
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[x_offset]); \
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 67d3fb0..04969f3 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -424,6 +424,7 @@
DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index cae4ca81..f20f4e0 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1321,7 +1321,7 @@
specialize qw/vpx_get4x4sse_cs neon msa vsx/;
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
- specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/;
+ specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/;
#
# Subpixel Variance
diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c
new file mode 100644
index 0000000..f435799
--- /dev/null
+++ b/vpx_dsp/x86/avg_pred_avx2.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int row = 0;
+ // comp_pred and pred must be 32 byte aligned.
+ assert(((intptr_t)comp_pred % 32) == 0);
+ assert(((intptr_t)pred % 32) == 0);
+
+ if (width == 8) {
+ assert(height % 4 == 0);
+ do {
+ const __m256i p = _mm256_load_si256((const __m256i *)pred);
+ const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i r_1 =
+ _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+
+ const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
+ const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));
+
+ const __m256i ref_0123 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
+ const __m256i avg = _mm256_avg_epu8(p, ref_0123);
+
+ _mm256_store_si256((__m256i *)comp_pred, avg);
+
+ row += 4;
+ pred += 32;
+ comp_pred += 32;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 16) {
+ assert(height % 4 == 0);
+ do {
+ const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+ const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+ const __m256i tmp0 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
+ const __m256i ref_0 = _mm256_inserti128_si256(
+ tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+ const __m256i tmp1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+ const __m256i ref_1 = _mm256_inserti128_si256(
+ tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_store_si256((__m256i *)comp_pred, average_0);
+ _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 4;
+ pred += 64;
+ comp_pred += 64;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 32) {
+ assert(height % 2 == 0);
+ do {
+ const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+ const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_store_si256((__m256i *)comp_pred, average_0);
+ _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 2;
+ pred += 64;
+ comp_pred += 64;
+ ref += 2 * ref_stride;
+ } while (row < height);
+ } else if (width % 64 == 0) {
+ do {
+ int x;
+ for (x = 0; x < width; x += 64) {
+ const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
+ const __m256i pred_1 =
+ _mm256_load_si256((const __m256i *)(pred + x + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
+ _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
+ }
+ row++;
+ pred += width;
+ comp_pred += width;
+ ref += ref_stride;
+ } while (row < height);
+ } else {
+ vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+ }
+}
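
For context, the new AVX2 kernel vectorizes the same rounding average computed by the C fallback vpx_comp_avg_pred_c (and by the SSE2 version it defers to for other widths); _mm256_avg_epu8 computes (a + b + 1) >> 1 per byte. A scalar sketch of the behaviour being matched (not the project's exact C source):

    #include <stdint.h>

    /* Sketch: comp_pred[j] = (pred[j] + ref[j] + 1) >> 1, row by row.
     * pred and comp_pred are width-contiguous; ref advances by ref_stride. */
    static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                     int width, int height, const uint8_t *ref,
                                     int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j) {
          comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
        }
        comp_pred += width;
        pred += width;
        ref += ref_stride;
      }
    }

The width == 8 branch above amortizes that loop by packing four contiguous 8-pixel rows of pred (32 bytes) and four strided rows of ref into one __m256i each, so a single _mm256_avg_epu8 covers four rows.
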
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 2498bba..526c283 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -46,7 +46,7 @@
};
#define CALC_CONVOLVE8_HORZ_ROW \
- srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \
+ srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \
s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \
s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \
s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \
@@ -60,16 +60,6 @@
_mm256_extractf128_si256(s1[0], 1)); \
output_ptr += output_pitch;
-// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
- // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0
- __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-
- // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
- a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
- return a;
-}
-
static INLINE void vpx_filter_block1d16_h8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
@@ -93,12 +83,7 @@
__m256i srcReg;
// load the 2 strides of source
- srcReg =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg = _mm256_inserti128_si256(
- srcReg,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
- 1);
+ srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3);
// filter the source buffer
s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -109,12 +94,7 @@
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
- srcReg =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg = _mm256_inserti128_si256(
- srcReg,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
- 1);
+ srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5);
// filter the source buffer
s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -129,60 +109,37 @@
src_ptr += src_stride;
- // average if necessary
- outReg1 = _mm256_castsi256_si128(outReg32b1);
- outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- outReg2 = _mm_avg_epu8(
- outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ const __m256i outReg = mm256_loadu2_si128(
+ (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch));
+ outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg);
}
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
-
+ mm256_store2_si128((__m128i *)output_ptr,
+ (__m128i *)(output_ptr + output_pitch), &outReg32b1);
output_ptr += dst_stride;
}
// if the number of strides is odd.
// process only 16 bytes
if (i > 0) {
- __m128i srcReg;
-
- // load the first 16 bytes of the last row
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+ const __m256i srcReg =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1);
// filter the source buffer
- s[0] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
- s[1] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
- s[2] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
- s[3] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
- outReg1 = convolve8_8_avx2(s, f);
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
- // reading the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+ // The low and high 128-bits of each lane contain the first and second
+ // convolve result respectively
+ outReg32b1 = convolve8_16_avx2(s, f);
+ outReg1 = _mm256_castsi256_si128(outReg32b1);
+ outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
- // filter the source buffer
- s[0] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
- s[1] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
- s[2] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
- s[3] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
- outReg2 = convolve8_8_avx2(s, f);
-
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
+ // shrink to 8 bit each 16 bits
outReg1 = _mm_packus_epi16(outReg1, outReg2);
// average if necessary
@@ -266,7 +223,6 @@
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
const int avg) {
- __m128i outReg1, outReg2;
__m256i srcRegHead1;
unsigned int i;
ptrdiff_t src_stride, dst_stride;
@@ -345,19 +301,14 @@
src_ptr += src_stride;
// average if necessary
- outReg1 = _mm256_castsi256_si128(s1[0]);
- outReg2 = _mm256_extractf128_si256(s1[0], 1);
if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- outReg2 = _mm_avg_epu8(
- outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ const __m256i outReg = mm256_loadu2_si128(
+ (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch));
+ s1[0] = _mm256_avg_epu8(s1[0], outReg);
}
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+ mm256_store2_si128((__m128i *)output_ptr,
+ (__m128i *)(output_ptr + out_pitch), s1);
output_ptr += dst_stride;
@@ -1094,7 +1045,7 @@
// load the 2 strides of source
// r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
// r06 r05 r04 r03 r02 r01 r00
- srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);
+ srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);
// filter the source buffer
// r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
@@ -1188,8 +1139,7 @@
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
__m256i f[4], ss[4];
- __m256i r[8];
- __m128i r1[10];
+ __m256i r[9], rr[2];
__m128i s[11];
unsigned int y = output_height;
@@ -1210,48 +1160,35 @@
s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
- // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00
- r1[0] = _mm_unpacklo_epi32(s[0], s[1]);
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1);
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1);
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1);
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1);
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1);
- // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10
- r1[1] = _mm_unpacklo_epi32(s[1], s[2]);
+ // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12
+ // r11 r10 r03 r02 r01 r00
+ rr[0] = _mm256_unpacklo_epi32(r[0], r[1]);
- // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20
- r1[2] = _mm_unpacklo_epi32(s[2], s[3]);
-
- // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30
- r1[3] = _mm_unpacklo_epi32(s[3], s[4]);
-
- // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40
- r1[4] = _mm_unpacklo_epi32(s[4], s[5]);
-
- // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50
- r1[5] = _mm_unpacklo_epi32(s[5], s[6]);
-
- // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02
- // r01 r00
- r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1);
-
- // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12
- // r11 r10
- r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1);
-
- // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22
- // r21 r20
- r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1);
-
- // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32
- // r31 r30
- r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1);
+ // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+ // r21 r20 r13 r12 r11 r10
+ rr[1] = _mm256_unpacklo_epi32(r[1], r[2]);
// r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
// r00|
- ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+ ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+ // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12
+ // r11 r10 r03 r02 r01 r00
+ rr[0] = _mm256_unpacklo_epi32(r[2], r[3]);
+
+ // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+ // r21 r20 r13 r12 r11 r10
+ rr[1] = _mm256_unpacklo_epi32(r[3], r[4]);
// r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
// r20|
- ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
-
+ ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]);
// Process 4 rows at a time
while (y >= 4) {
s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
@@ -1259,41 +1196,17 @@
s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
- // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
- r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1);
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1);
+ rr[0] = _mm256_unpacklo_epi32(r[4], r[5]);
+ rr[1] = _mm256_unpacklo_epi32(r[5], r[6]);
+ ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]);
- // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
- r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
-
- // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80
- r1[8] = _mm_unpacklo_epi32(s[8], s[9]);
-
- // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90
- r1[9] = _mm_unpacklo_epi32(s[9], s[10]);
-
- // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43
- // r42 r41 r40
- r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1);
-
- // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53
- // r52 r51 r50
- r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1);
-
- // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63
- // r62 r61 r60
- r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1);
-
- // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81
- // r80|r73 r72 r71 r70
- r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1);
-
- // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50
- // r40|
- ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
-
- // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73
- // r63....r70 r60|
- ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1);
+ r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1);
+ rr[0] = _mm256_unpacklo_epi32(r[6], r[7]);
+ rr[1] = _mm256_unpacklo_epi32(r[7], r[8]);
+ ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]);
ss[0] = convolve8_16_avx2(ss, f);
@@ -1315,17 +1228,17 @@
ss[1] = ss[3];
s[6] = s[10];
+ s[5] = s[9];
- r1[4] = r1[8];
- r1[5] = r1[9];
-
+ r[4] = r[8];
y -= 4;
}
// Process 2 rows
if (y == 2) {
- __m128i ss1[4], f1[4];
+ __m128i ss1[4], f1[4], r1[4];
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
@@ -1334,11 +1247,14 @@
f1[2] = _mm256_castsi256_si128(f[2]);
f1[3] = _mm256_castsi256_si128(f[3]);
+ r1[0] = _mm_unpacklo_epi32(s[4], s[5]);
+ r1[1] = _mm_unpacklo_epi32(s[5], s[6]);
+
// R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
- r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+ r1[2] = _mm_unpacklo_epi32(s[6], s[7]);
// R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
- r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+ r1[3] = _mm_unpacklo_epi32(s[7], s[8]);
// r23 r13....r20 r10|r13 r03....r10 r00
ss1[0] = _mm256_castsi256_si128(ss[0]);
@@ -1347,10 +1263,10 @@
ss1[1] = _mm256_castsi256_si128(ss[1]);
// r63 r53....r60 r50|r53 r43....r50 r40
- ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]);
+ ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]);
// r83 r73....r80 r70|r73 r63....r70 r60
- ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]);
+ ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]);
ss1[0] = convolve8_8_ssse3(ss1, f1);