blob: b282528b088925c471bed9c9e6721cb361fa58a5 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)
#include <algorithm>
#include <cassert>
#include "src/dsp/dsp.h"
//------------------------------------------------------------------------------
namespace {
// Portable quantizer for a block of W x H residuals.
// Every residual is quantized through WP2Quantize16b() using its magnitude,
// the matching entry of iq[] and the shared 'bias'; negative residuals get the
// negated result. 'res' and 'coeffs' are walked with a stride of W samples,
// whereas 'iq' moves forward by WP2QStride entries for each row.
void Quantize_C(const uint32_t iq[], uint32_t bias, const int32_t res[],
                int16_t coeffs[], uint32_t W, uint32_t H) {
  const uint32_t* iq_row = iq;
  const int32_t* res_row = res;
  int16_t* dst_row = coeffs;
  for (uint32_t row = 0; row < H; ++row) {
    for (uint32_t col = 0; col < W; ++col) {
      const int32_t r = res_row[col];
      if (r < 0) {
        // Quantize the magnitude, then restore the sign.
        dst_row[col] = -WP2Quantize16b(-r, iq_row[col], bias);
      } else {
        dst_row[col] = WP2Quantize16b(r, iq_row[col], bias);
      }
    }
    iq_row += WP2QStride;
    res_row += W;
    dst_row += W;
  }
}
// Portable dequantizer for a block of W x H coefficients.
// Only the rows needed to cover the first 'len' coefficients are multiplied by
// their dequants[] factor (whole rows of W values are processed at a time);
// everything after those rows, up to out + W * H, is set to zero.
// 'in' and 'out' use a stride of W, 'dequants' a stride of WP2QStride.
void Dequantize_C(const int16_t in[], const int16_t dequants[], int16_t out[],
                  uint32_t W, uint32_t H, uint32_t len) {
  assert(len <= W * H);
  int16_t* const out_end = out + W * H;
  uint32_t processed = 0;
  while (processed < len) {
    for (uint32_t i = 0; i < W; ++i) {
      out[i] = in[i] * dequants[i];
    }
    in += W;
    out += W;
    dequants += WP2QStride;
    processed += W;
  }
  // Remaining rows hold no coded coefficient.
  std::fill(out, out_end, 0);
}
#if defined(WP2_USE_SSE)
//------------------------------------------------------------------------------
// SSE version
// SSE quantizer for a block of W x H residuals.
// For each sample: coeffs = sign(res) * ((|res| * iq + bias) >> WP2QBits),
// packed to 16 bits with signed saturation.
// Rows wider than 4 samples are processed 8 samples at a time (the loop steps
// by 8, so W is presumably a multiple of 8 there -- TODO confirm with callers);
// narrower rows are handled with a single group of 4 samples.
// 'res' and 'coeffs' use a stride of W, 'iq' a stride of WP2QStride.
void Quantize_SSE(const uint32_t iq[], uint32_t bias, const int32_t res[],
                  int16_t coeffs[], uint32_t W, uint32_t H) {
  // The bias is the same for every sample: broadcast it once.
  const __m128i BIAS = _mm_set1_epi32(bias);
  for (uint32_t y = 0; y < H; ++y) {
    if (W > 4) {
      for (uint32_t x = 0; x < W; x += 8) {
        const __m128i IQ0 = _mm_loadu_si128((const __m128i*)&iq[x + 0]);
        const __m128i IQ1 = _mm_loadu_si128((const __m128i*)&iq[x + 4]);
        const __m128i r0 = _mm_loadu_si128((const __m128i*)&res[x + 0]);
        const __m128i r1 = _mm_loadu_si128((const __m128i*)&res[x + 4]);
        const __m128i abs_r0 = _mm_abs_epi32(r0);
        const __m128i abs_r1 = _mm_abs_epi32(r1);
        // |res| * iq + bias
        const __m128i A0 = _mm_mullo_epi32(abs_r0, IQ0);
        const __m128i A1 = _mm_mullo_epi32(abs_r1, IQ1);
        const __m128i B0 = _mm_add_epi32(A0, BIAS);
        const __m128i B1 = _mm_add_epi32(A1, BIAS);
        // >> WP2QBits
        const __m128i C0 = _mm_srli_epi32(B0, WP2QBits);
        const __m128i C1 = _mm_srli_epi32(B1, WP2QBits);
        // coeffs[] = sign ? -C : C;
        const __m128i D0 = _mm_sign_epi32(C0, r0);
        const __m128i D1 = _mm_sign_epi32(C1, r1);
        // Narrow the two groups of 4 x 32b to 8 x 16b.
        const __m128i E = _mm_packs_epi32(D0, D1);
        _mm_storeu_si128((__m128i*)&coeffs[x], E);
      }
    } else {
      const __m128i IQ0 = _mm_loadu_si128((const __m128i*)iq);
      const __m128i r0 = _mm_loadu_si128((const __m128i*)res);
      const __m128i abs_r0 = _mm_abs_epi32(r0);
      // |res| * iq + bias
      const __m128i A0 = _mm_mullo_epi32(abs_r0, IQ0);
      const __m128i B0 = _mm_add_epi32(A0, BIAS);
      // >> WP2QBits
      const __m128i C0 = _mm_srli_epi32(B0, WP2QBits);
      // coeffs[] = sign ? -C : C;
      const __m128i D0 = _mm_sign_epi32(C0, r0);
      const __m128i E = _mm_packs_epi32(D0, D0);
      // Only the lower 4 x 16b are meaningful: store 64 bits.
      _mm_storel_epi64((__m128i*)coeffs, E);
    }
    coeffs += W;
    res += W;
    iq += WP2QStride;
  }
}
void Dequantize_SSE(const int16_t in[], const int16_t dequants[], int16_t out[],
uint32_t W, uint32_t H, uint32_t len) {
assert(len <= W * H);
int16_t* const end = out + W * H;
for (uint32_t y = 0; y < len; y += W) {
if (W > 4) {
for (uint32_t x = 0; x < W; x += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)&in[x]);
const __m128i B = _mm_loadu_si128((const __m128i*)&dequants[x]);
const __m128i C = _mm_mullo_epi16(A, B); // non-saturated!
_mm_storeu_si128((__m128i*)&out[x], C);
}
} else {
const __m128i A = _mm_loadl_epi64((const __m128i*)in);
const __m128i B = _mm_loadl_epi64((const __m128i*)dequants);
const __m128i C = _mm_mullo_epi16(A, B);
_mm_storeu_si128((__m128i*)out, C);
}
dequants += WP2QStride;
out += W;
in += W;
}
std::fill(out, end, 0);
}
// Points the public quantization entry points at the SSE implementations.
WP2_TSAN_IGNORE_FUNCTION void QuantizeInitSSE() {
  WP2Dequantize = Dequantize_SSE;
  WP2Quantize = Quantize_SSE;
}
#endif // WP2_USE_SSE
//------------------------------------------------------------------------------
} // namespace
// Entry point used to quantize a W x H block of residuals 'res' into 'coeffs'.
// Null until WP2QuantizeInit() has installed an implementation.
void (*WP2Quantize)(const uint32_t iq[], uint32_t bias, const int32_t res[],
int16_t coeffs[], uint32_t W, uint32_t H) = nullptr;
// Entry point used to dequantize the first 'len' coefficients of a W x H block
// from 'in' into 'out'. Null until WP2QuantizeInit() has installed an
// implementation.
void (*WP2Dequantize)(const int16_t in[], const int16_t dequants[],
int16_t out[],
uint32_t W, uint32_t H, uint32_t len) = nullptr;
// Value of WP2GetCPUInfo seen by the last completed WP2QuantizeInit() call.
// It starts out as the variable's own address (cast to WP2CPUInfo), a non-null
// sentinel that is not the address of a function, so that the very first call
// to WP2QuantizeInit() performs the setup even if WP2GetCPUInfo is null.
static volatile WP2CPUInfo quantize_last_cpuinfo_used =
(WP2CPUInfo)&quantize_last_cpuinfo_used;
// Sets up WP2Quantize / WP2Dequantize: the plain C++ versions are installed
// first, then replaced by the SSE ones when compiled in and reported as
// available by WP2GetCPUInfo. The work is skipped when WP2GetCPUInfo has not
// changed since the previous call.
WP2_TSAN_IGNORE_FUNCTION void WP2QuantizeInit() {
  // Already initialized for this CPU-info callback: nothing to do.
  if (quantize_last_cpuinfo_used == WP2GetCPUInfo) return;

  // Portable defaults, possibly overridden below.
  WP2Quantize = Quantize_C;
  WP2Dequantize = Dequantize_C;

#if defined(WP2_USE_SSE)
  if (WP2GetCPUInfo != nullptr && WP2GetCPUInfo(kSSE)) QuantizeInitSSE();
#endif

  // Remember which callback the current setup corresponds to.
  quantize_last_cpuinfo_used = WP2GetCPUInfo;
}