blob: c6bdac10f23df341d855d7da0e8633036a185eb2 [file] [log] [blame]
/*
* jdsample-neon.c - upsampling (Arm NEON)
*
* Copyright 2019 The Chromium Authors. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include <arm_neon.h>
/*
* The diagram below shows a grid-window of samples (luma or chroma) produced
* by h2v2 downsampling.
*
* s0 s1
* +---------+---------+
* | p0 p1 | p2 p3 |
* r0 | | |
* | p4 p5 | p6 p7 |
* +---------+---------+
* | p8 p9 | p10 p11|
* r1 | | |
* | p12 p13| p14 p15|
* +---------+---------+
* | p16 p17| p18 p19|
* r2 | | |
* | p20 p21| p22 p23|
* +---------+---------+
*
* Every sample contains four of the original pixel channel values. The pixels'
* channel values are centred at positions p0, p1, p2,..., p23 above. For a
* given grid-window position, r1 is always used to denote the row of samples
* containing the pixel channel values we are computing. For the top row of
* pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
* the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
* (p12-p15), the nearest neighbouring samples are in the row below - denoted
* by r2.
*
* To compute the pixel channel values of the original image, we proportionally
* blend the sample containing the pixel centre with the nearest neighbouring
* samples in each row, column and diagonal.
*
* There are three cases to consider:
*
* 1) The first pixel in this row of the original image.
* Pixel channel value p8 only contains components from sample column s0.
* Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
* 2) The last pixel in this row of the original image.
* Pixel channel value p11 only contains components from sample column s1.
* Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
* 3) General case (all other pixels in the row).
* Apart from the first and last pixels, every other pixel channel value in
* the row contains components from samples in adjacent columns.
*
* For example, the pixel centred at p9 would be computed as follows:
* (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
*
* This can be broken down into two steps:
* 1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
* s0colsum = 3/4 * s0r1 + 1/4 * s0r0
* s1colsum = 3/4 * s1r1 + 1/4 * s1r0
* 2) Blend the already-blended columns in the ratio 3:1:
* p9 = 3/4 * s0colsum + 1/4 * s1colsum
*
* The bottom row of pixel channel values in row r1 can be computed in the same
* way for each of the three cases, only using samples in row r2 instead of row
* r0 - as r2 is the nearest neighbouring row.
*/
/*
 * Fancy (triangular-filter) h2v2 chroma upsampling using Arm NEON.
 *
 * max_v_samp_factor   - number of output rows to produce per call
 * downsampled_width   - width of each input (downsampled) row, in samples
 * input_data          - input sample rows; rows [inrow-1 .. inrow+1] are read,
 *                       so the caller must provide valid neighbour rows
 *                       (jdsample.c duplicates edge rows at image boundaries)
 * output_data_ptr     - receives two output rows per input row, each
 *                       2 * downsampled_width samples wide
 *
 * Each input row r1 produces two output rows: one blended with the row
 * above (r0) and one blended with the row below (r2), per the diagram and
 * ratios described in the comment block preceding this function.
 */
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;

  /* Setup constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);   /* dither bias for odd pixels */
  const uint8x8_t three_u8 = vdup_n_u8(3);       /* 3:1 vertical blend weight */
  const uint16x8_t three_u16 = vdupq_n_u16(3);   /* 3:1 horizontal blend weight */

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    /* r0 = row above, r1 = current row, r2 = row below. */
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
    /* respectively. */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* Case 1: first pixel channel value in this row of original image. */
    /* Scalar: only one sample column (s0) contributes; blend r1:r0 = 3:1 */
    /* and round with bias 8 (the "even pixel" bias). */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* General case as described above. */
    /* Step 1: Blend samples vertically in columns s0 and s1. */
    /* Leave the divide by 4 to the end when it can be done for both */
    /* dimensions at once, right-shifting by 4. */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0r0 = vld1q_u8(inptr0);
    uint8x16_t s0r1 = vld1q_u8(inptr1);
    uint8x16_t s0r2 = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
    /* denote low half and high half respectively. */
    /* colsum = 3 * r1 + r0 (or r2), widened to 16-bit via vmlal. */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
                                      vget_low_u8(s0r1), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
                                      vget_high_u8(s0r1), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
                                      vget_low_u8(s0r1), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
                                      vget_high_u8(s0r1), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    /* s1 is the same data shifted right by one sample column. */
    uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
    uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
    uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
                                      vget_low_u8(s1r1), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
                                      vget_high_u8(s1r1), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
                                      vget_low_u8(s1r1), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
                                      vget_high_u8(s1r1), three_u8);
    /* Step 2: Blend the already-blended columns. */
    /* p1 = 3 * s0colsum + s1colsum (pixel nearer s0); */
    /* p2 = 3 * s1colsum + s0colsum (pixel nearer s1). */
    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values. */
    /* Odd pixels: +7 then truncating shift; even pixels use vrshrn below, */
    /* which rounds (effectively +8 before the shift). */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
    /* val[0]/val[1] interleave on store (vst2q), producing the final */
    /* odd/even pixel sequence. */
    uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                                vshrn_n_u16(output0_p1_h, 4)),
                                    vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                                vrshrn_n_u16(output0_p2_h, 4))
                                  };
    uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                                vshrn_n_u16(output1_p1_h, 4)),
                                    vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                                vrshrn_n_u16(output1_p2_h, 4))
                                  };
    /* Store pixel channel values to memory. */
    /* The minimum size of the output buffer for each row is 64 bytes => no */
    /* need to worry about buffer overflow here. See "Creation of 2-D sample */
    /* arrays" in jmemmgr.c for details. */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one */
    /* byte. We have to re-align on a 32-byte boundary at some point before */
    /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
    /* within the bounds of the sample buffers without having to resort to a */
    /* slow scalar tail case for the last (downsampled_width % 16) samples. */
    /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
    /* Note: the first loop iteration deliberately overlaps the stores above */
    /* by one output pixel; both writes produce identical values there. */
    for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */
      /* Load and compute s0colsum0 and s0colsum1. */
      s0r0 = vld1q_u8(inptr0 + colctr - 1);
      s0r1 = vld1q_u8(inptr1 + colctr - 1);
      s0r2 = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
                             vget_low_u8(s0r1), three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
                             vget_high_u8(s0r1), three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
                             vget_low_u8(s0r1), three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
                             vget_high_u8(s0r1), three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1r0 = vld1q_u8(inptr0 + colctr);
      s1r1 = vld1q_u8(inptr1 + colctr);
      s1r2 = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
                             vget_low_u8(s1r1), three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
                             vget_high_u8(s1r1), three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
                             vget_low_u8(s1r1), three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
                             vget_high_u8(s1r1), three_u8);
      /* Step 2: Blend the already-blended columns. */
      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel channel values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Case 2: last pixel channel value in this row of the original image. */
    /* Scalar: only column s1 contributes; bias 7 (odd-pixel rounding). */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}