Removing divisions from od_dir_find8()
Instead of dividing the squared partial sums by the number n of pixels in the
line, we multiply by 840/n, where 840=3*5*7*8. This not only avoids the
divisions, but it also makes the optimization exact as there is no more
rounding.
ntt-short1 resuts:
MEDIUM (%) HIGH (%)
PSNR -0.012070 -0.059644
PSNRHVS -0.016845 -0.020871
SSIM -0.026984 -0.031257
FASTSSIM -0.026078 0.414901
Change-Id: Ie553d5e3a545dee860a00879d724ecfc00f0a974
diff --git a/vp10/common/od_dering.c b/vp10/common/od_dering.c
index 66eecea..af89b80 100644
--- a/vp10/common/od_dering.c
+++ b/vp10/common/od_dering.c
@@ -61,15 +61,21 @@
static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
int coeff_shift) {
int i;
- int cost[8] = {0};
+ int32_t cost[8] = {0};
int partial[8][15] = {{0}};
- int best_cost = 0;
+ int32_t best_cost = 0;
int best_dir = 0;
+ /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+ The output is then 840 times larger, but we don't care for finding
+ the max. */
+ static const int div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
for (i = 0; i < 8; i++) {
int j;
for (j = 0; j < 8; j++) {
int x;
- x = img[i*stride + j] >> coeff_shift;
+ /* We subtract 128 here to reduce the maximum range of the squared
+ partial sums. */
+ x = (img[i*stride + j] >> coeff_shift) - 128;
partial[0][i + j] += x;
partial[1][i + j/2] += x;
partial[2][i] += x;
@@ -81,25 +87,28 @@
}
}
for (i = 0; i < 8; i++) {
- cost[2] += partial[2][i]*partial[2][i] >> 3;
- cost[6] += partial[6][i]*partial[6][i] >> 3;
+ cost[2] += partial[2][i]*partial[2][i];
+ cost[6] += partial[6][i]*partial[6][i];
}
+ cost[2] *= div_table[8];
+ cost[6] *= div_table[8];
for (i = 0; i < 7; i++) {
- cost[0] += OD_DIVU_SMALL(partial[0][i]*partial[0][i], i + 1)
- + OD_DIVU_SMALL(partial[0][14 - i]*partial[0][14 - i], i + 1);
- cost[4] += OD_DIVU_SMALL(partial[4][i]*partial[4][i], i + 1)
- + OD_DIVU_SMALL(partial[4][14 - i]*partial[4][14 - i], i + 1);
+ cost[0] += (partial[0][i]*partial[0][i]
+ + partial[0][14 - i]*partial[0][14 - i])*div_table[i + 1];
+ cost[4] += (partial[4][i]*partial[4][i]
+ + partial[4][14 - i]*partial[4][14 - i])*div_table[i + 1];
}
- cost[0] += partial[0][7]*partial[0][8 - 1] >> 3;
- cost[4] += partial[4][7]*partial[4][8 - 1] >> 3;
+ cost[0] += partial[0][7]*partial[0][7]*div_table[8];
+ cost[4] += partial[4][7]*partial[4][7]*div_table[8];
for (i = 1; i < 8; i += 2) {
int j;
for (j = 0; j < 4 + 1; j++) {
- cost[i] += partial[i][3 + j]*partial[i][3 + j] >> 3;
+ cost[i] += partial[i][3 + j]*partial[i][3 + j];
}
+ cost[i] *= div_table[8];
for (j = 0; j < 4 - 1; j++) {
- cost[i] += OD_DIVU_SMALL(partial[i][j]*partial[i][j], 2*j + 2)
- + OD_DIVU_SMALL(partial[i][10 - j]*partial[i][10 - j], 2*j + 2);
+ cost[i] += (partial[i][j]*partial[i][j]
+ + partial[i][10 - j]*partial[i][10 - j])*div_table[2*j + 2];
}
}
for (i = 0; i < 8; i++) {
@@ -111,6 +120,9 @@
/* Difference between the optimal variance and the variance along the
orthogonal direction. Again, the sum(x^2) terms cancel out. */
*var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
return best_dir;
}