/* simd/arm/common/jdsample-neon.c - chromium/deps/libjpeg_turbo - Git at Google */

 /*
  * jdsample-neon.c - upsampling (Arm NEON)
  *
  * Copyright 2019 The Chromium Authors. All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
  * arising from the use of this software.
  *
  * Permission is granted to anyone to use this software for any purpose,
  * including commercial applications, and to alter it and redistribute it
  * freely, subject to the following restrictions:
  *
  * 1. The origin of this software must not be misrepresented; you must not
  *    claim that you wrote the original software. If you use this software
  *    in a product, an acknowledgment in the product documentation would be
  *    appreciated but is not required.
  * 2. Altered source versions must be plainly marked as such, and must not be
  *    misrepresented as being the original software.
  * 3. This notice may not be removed or altered from any source distribution.
  */

 #define JPEG_INTERNALS
 #include "../../../jinclude.h"
 #include "../../../jpeglib.h"
 #include "../../../jsimd.h"
 #include "../../../jdct.h"
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"

 #include <arm_neon.h>

 /*
  * The diagram below shows a grid-window of samples (luma or chroma) produced
  * by h2v2 downsampling.
  *
  *                  s0        s1
  *             +---------+---------+
  *             | p0   p1 | p2   p3 |
  *     r0      |         |         |
  *             | p4   p5 | p6   p7 |
  *             +---------+---------+
  *             | p8   p9 | p10  p11|
  *     r1      |         |         |
  *             | p12  p13| p14  p15|
  *             +---------+---------+
  *             | p16  p17| p18  p19|
  *     r2      |         |         |
  *             | p20  p21| p22  p23|
  *             +---------+---------+
  *
  * Every sample contains four of the original pixel channel values. The pixels'
  * channel values are centred at positions p0, p1, p2,..., p23 above. For a
  * given grid-window position, r1 is always used to denote the row of samples
  * containing the pixel channel values we are computing. For the top row of
  * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
  * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
  * (p12-p15), the nearest neighbouring samples are in the row below - denoted
  * by r2.
  *
  * To compute the pixel channel values of the original image, we proportionally
  * blend the sample containing the pixel centre with the nearest neighbouring
  * samples in each row, column and diagonal.
  *
  * There are three cases to consider:
  *
  * 1) The first pixel in this row of the original image.
  *    Pixel channel value p8 only contains components from sample column s0.
  *    Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
  * 2) The last pixel in this row of the original image.
  *    Pixel channel value p11 only contains components from sample column s1.
  *    Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
  * 3) General case (all other pixels in the row).
  *    Apart from the first and last pixels, every other pixel channel value in
  *    the row contains components from samples in adjacent columns.
  *
  *    For example, the pixel centred at p9 would be computed as follows:
  *        (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
  *
  *    This can be broken down into two steps:
  *    1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
  *        s0colsum = 3/4 * s0r1 + 1/4 * s0r0
  *        s1colsum = 3/4 * s1r1 + 1/4 * s1r0
  *    2) Blend the already-blended columns in the ratio 3:1:
  *        p9 = 3/4 * s0colsum + 1/4 * s1colsum
  *
  * The bottom row of pixel channel values in row r1 can be computed in the same
  * way for each of the three cases, only using samples in row r2 instead of row
  * r0 - as r2 is the nearest neighbouring row.
  */

 void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                     JDIMENSION downsampled_width,
                                     JSAMPARRAY input_data,
                                     JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
   int inrow, outrow;
   /* Setup constants. */
   const uint16x8_t seven_u16 = vdupq_n_u16(7);
   const uint8x8_t three_u8 = vdup_n_u8(3);
   const uint16x8_t three_u16 = vdupq_n_u16(3);

   inrow = outrow = 0;
   while (outrow < max_v_samp_factor) {
     inptr0 = input_data[inrow - 1];
     inptr1 = input_data[inrow];
     inptr2 = input_data[inrow + 1];
     /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
     /* respectively. */
     outptr0 = output_data[outrow++];
     outptr1 = output_data[outrow++];

     /* Case 1: first pixel channel value in this row of original image. */
     int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
     *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
     int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
     *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

     /* General case as described above. */
     /* Step 1: Blend samples vertically in columns s0 and s1. */
     /* Leave the divide by 4 to the end when it can be done for both */
     /* dimensions at once, right-shifting by 4. */

     /* Load and compute s0colsum0 and s0colsum1. */
     uint8x16_t s0r0 = vld1q_u8(inptr0);
     uint8x16_t s0r1 = vld1q_u8(inptr1);
     uint8x16_t s0r2 = vld1q_u8(inptr2);
     /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
     /* denote low half and high half respectively. */
     uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
                                       vget_low_u8(s0r1), three_u8);
     uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
                                       vget_high_u8(s0r1), three_u8);
     uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
                                       vget_low_u8(s0r1), three_u8);
     uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
                                       vget_high_u8(s0r1), three_u8);
     /* Load and compute s1colsum0 and s1colsum1. */
     uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
     uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
     uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
     uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
                                       vget_low_u8(s1r1), three_u8);
     uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
                                       vget_high_u8(s1r1), three_u8);
     uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
                                       vget_low_u8(s1r1), three_u8);
     uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
                                       vget_high_u8(s1r1), three_u8);
     /* Step 2: Blend the already-blended columns. */
     uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
     uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
     uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
     uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
     uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
     uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
     uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
     uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
     /* Add ordered dithering bias to odd pixel values. */
     output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
     output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
     output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
     output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
     /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
     uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                                 vshrn_n_u16(output0_p1_h, 4)),
                                     vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                                 vrshrn_n_u16(output0_p2_h, 4))
                                   };
     uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                                 vshrn_n_u16(output1_p1_h, 4)),
                                     vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                                 vrshrn_n_u16(output1_p2_h, 4))
                                   };
     /* Store pixel channel values to memory. */
     /* The minimum size of the output buffer for each row is 64 bytes => no */
     /* need to worry about buffer overflow here. See "Creation of 2-D sample */
     /* arrays" in jmemmgr.c for details. */
     vst2q_u8(outptr0 + 1, output_pixels0);
     vst2q_u8(outptr1 + 1, output_pixels1);

     /* The first pixel of the image shifted our loads and stores by one */
     /* byte. We have to re-align on a 32-byte boundary at some point before */
     /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
     /* within the bounds of the sample buffers without having to resort to a */
     /* slow scalar tail case for the last (downsampled_width % 16) samples. */
     /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/
     for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
       /* Step 1: Blend samples vertically in columns s0 and s1. */
       /* Load and compute s0colsum0 and s0colsum1. */
       s0r0 = vld1q_u8(inptr0 + colctr - 1);
       s0r1 = vld1q_u8(inptr1 + colctr - 1);
       s0r2 = vld1q_u8(inptr2 + colctr - 1);
       s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
                              vget_low_u8(s0r1), three_u8);
       s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
                              vget_high_u8(s0r1), three_u8);
       s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
                              vget_low_u8(s0r1), three_u8);
       s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
                              vget_high_u8(s0r1), three_u8);
       /* Load and compute s1colsum0 and s1colsum1. */
       s1r0 = vld1q_u8(inptr0 + colctr);
       s1r1 = vld1q_u8(inptr1 + colctr);
       s1r2 = vld1q_u8(inptr2 + colctr);
       s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
                              vget_low_u8(s1r1), three_u8);
       s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
                              vget_high_u8(s1r1), three_u8);
       s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
                              vget_low_u8(s1r1), three_u8);
       s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
                              vget_high_u8(s1r1), three_u8);
       /* Step 2: Blend the already-blended columns. */
       output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
       output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
       output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
       output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
       output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
       output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
       output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
       output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
       /* Add ordered dithering bias to odd pixel values. */
       output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
       output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
       output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
       output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
       /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
       output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                           vshrn_n_u16(output0_p1_h, 4));
       output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                           vrshrn_n_u16(output0_p2_h, 4));
       output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                           vshrn_n_u16(output1_p1_h, 4));
       output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                           vrshrn_n_u16(output1_p2_h, 4));
       /* Store pixel channel values to memory. */
       vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
       vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
     }

     /* Case 2: last pixel channel value in this row of the original image. */
     int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                     GETJSAMPLE(inptr0[downsampled_width - 1]);
     outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
     int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                     GETJSAMPLE(inptr2[downsampled_width - 1]);
     outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
     inrow++;
   }
 }
	/*
	* jdsample-neon.c - upsampling (Arm NEON)
	*
	* Copyright 2019 The Chromium Authors. All Rights Reserved.
	*
	* This software is provided 'as-is', without any express or implied
	* warranty. In no event will the authors be held liable for any damages
	* arising from the use of this software.
	*
	* Permission is granted to anyone to use this software for any purpose,
	* including commercial applications, and to alter it and redistribute it
	* freely, subject to the following restrictions:
	*
	* 1. The origin of this software must not be misrepresented; you must not
	* claim that you wrote the original software. If you use this software
	* in a product, an acknowledgment in the product documentation would be
	* appreciated but is not required.
	* 2. Altered source versions must be plainly marked as such, and must not be
	* misrepresented as being the original software.
	* 3. This notice may not be removed or altered from any source distribution.
	*/

	#define JPEG_INTERNALS
	#include "../../../jinclude.h"
	#include "../../../jpeglib.h"
	#include "../../../jsimd.h"
	#include "../../../jdct.h"
	#include "../../../jsimddct.h"
	#include "../../jsimd.h"

	#include <arm_neon.h>

	/*
	* The diagram below shows a grid-window of samples (luma or chroma) produced
	* by h2v2 downsampling.
	*
	*                  s0        s1
	*             +---------+---------+
	*             | p0   p1 | p2   p3 |
	*     r0      |         |         |
	*             | p4   p5 | p6   p7 |
	*             +---------+---------+
	*             | p8   p9 | p10  p11|
	*     r1      |         |         |
	*             | p12  p13| p14  p15|
	*             +---------+---------+
	*             | p16  p17| p18  p19|
	*     r2      |         |         |
	*             | p20  p21| p22  p23|
	*             +---------+---------+
	*
	* Every sample contains four of the original pixel channel values. The pixels'
	* channel values are centred at positions p0, p1, p2,..., p23 above. For a
	* given grid-window position, r1 is always used to denote the row of samples
	* containing the pixel channel values we are computing. For the top row of
	* pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
	* the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
	* (p12-p15), the nearest neighbouring samples are in the row below - denoted
	* by r2.
	*
	* To compute the pixel channel values of the original image, we proportionally
	* blend the sample containing the pixel centre with the nearest neighbouring
	* samples in each row, column and diagonal.
	*
	* There are three cases to consider:
	*
	* 1) The first pixel in this row of the original image.
	* Pixel channel value p8 only contains components from sample column s0.
	* Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
	* 2) The last pixel in this row of the original image.
	* Pixel channel value p11 only contains components from sample column s1.
	* Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
	* 3) General case (all other pixels in the row).
	* Apart from the first and last pixels, every other pixel channel value in
	* the row contains components from samples in adjacent columns.
	*
	* For example, the pixel centred at p9 would be computed as follows:
	* (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
	*
	* This can be broken down into two steps:
	* 1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
	* s0colsum = 3/4 * s0r1 + 1/4 * s0r0
	* s1colsum = 3/4 * s1r1 + 1/4 * s1r0
	* 2) Blend the already-blended columns in the ratio 3:1:
	* p9 = 3/4 * s0colsum + 1/4 * s1colsum
	*
	* The bottom row of pixel channel values in row r1 can be computed in the same
	* way for each of the three cases, only using samples in row r2 instead of row
	* r0 - as r2 is the nearest neighbouring row.
	*/

	/*
	 * Fancy (triangle-filter) h2v2 upsampling of one row group using Arm NEON.
	 *
	 * max_v_samp_factor  number of output rows to produce; two are written per
	 *                    pass of the outer loop.
	 * downsampled_width  number of samples in each input row; each output row
	 *                    receives twice as many.
	 * input_data         input rows.  For input row i the rows at i - 1 and
	 *                    i + 1 are read as well, so index -1 is accessed on the
	 *                    first pass.
	 * output_data_ptr    points at the array of output rows to fill.
	 */
	void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
	                                    JDIMENSION downsampled_width,
	                                    JSAMPARRAY input_data,
	                                    JSAMPARRAY *output_data_ptr)
	{
	  JSAMPARRAY output_data = *output_data_ptr;
	  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
	  int inrow, outrow;
	  /* Setup constants. */
	  const uint16x8_t seven_u16 = vdupq_n_u16(7);
	  const uint8x8_t three_u8 = vdup_n_u8(3);
	  const uint16x8_t three_u16 = vdupq_n_u16(3);

	  inrow = outrow = 0;
	  while (outrow < max_v_samp_factor) {
	    inptr0 = input_data[inrow - 1];
	    inptr1 = input_data[inrow];
	    inptr2 = input_data[inrow + 1];
	    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
	    /* respectively. */
	    outptr0 = output_data[outrow++];
	    outptr1 = output_data[outrow++];

	    /* Case 1: first pixel channel value in this row of original image. */
	    /* Only a vertical 3:1 blend is applied; the result is stored through */
	    /* the output row pointers, which must not themselves be reassigned. */
	    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
	    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
	    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
	    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

	    /* General case as described above. */
	    /* Step 1: Blend samples vertically in columns s0 and s1. */
	    /* Leave the divide by 4 to the end when it can be done for both */
	    /* dimensions at once, right-shifting by 4. */

	    /* Load and compute s0colsum0 and s0colsum1. */
	    uint8x16_t s0r0 = vld1q_u8(inptr0);
	    uint8x16_t s0r1 = vld1q_u8(inptr1);
	    uint8x16_t s0r2 = vld1q_u8(inptr2);
	    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
	    /* denote low half and high half respectively. */
	    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
	                                      vget_low_u8(s0r1), three_u8);
	    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
	                                      vget_high_u8(s0r1), three_u8);
	    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
	                                      vget_low_u8(s0r1), three_u8);
	    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
	                                      vget_high_u8(s0r1), three_u8);
	    /* Load and compute s1colsum0 and s1colsum1. */
	    uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
	    uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
	    uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
	    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
	                                      vget_low_u8(s1r1), three_u8);
	    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
	                                      vget_high_u8(s1r1), three_u8);
	    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
	                                      vget_low_u8(s1r1), three_u8);
	    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
	                                      vget_high_u8(s1r1), three_u8);
	    /* Step 2: Blend the already-blended columns. */
	    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
	    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
	    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
	    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
	    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
	    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
	    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
	    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
	    /* Add ordered dithering bias to odd pixel values. */
	    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
	    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
	    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
	    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
	    /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
	    uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
	                                                vshrn_n_u16(output0_p1_h, 4)),
	                                    vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
	                                                vrshrn_n_u16(output0_p2_h, 4))
	                                  };
	    uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
	                                                vshrn_n_u16(output1_p1_h, 4)),
	                                    vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
	                                                vrshrn_n_u16(output1_p2_h, 4))
	                                  };
	    /* Store pixel channel values to memory. */
	    /* The minimum size of the output buffer for each row is 64 bytes => no */
	    /* need to worry about buffer overflow here. See "Creation of 2-D sample */
	    /* arrays" in jmemmgr.c for details. */
	    vst2q_u8(outptr0 + 1, output_pixels0);
	    vst2q_u8(outptr1 + 1, output_pixels1);

	    /* The first pixel of the image shifted our loads and stores by one */
	    /* byte. We have to re-align on a 32-byte boundary at some point before */
	    /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
	    /* within the bounds of the sample buffers without having to resort to a */
	    /* slow scalar tail case for the last (downsampled_width % 16) samples. */
	    /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/
	    for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
	      /* Step 1: Blend samples vertically in columns s0 and s1. */
	      /* Load and compute s0colsum0 and s0colsum1. */
	      s0r0 = vld1q_u8(inptr0 + colctr - 1);
	      s0r1 = vld1q_u8(inptr1 + colctr - 1);
	      s0r2 = vld1q_u8(inptr2 + colctr - 1);
	      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
	                             vget_low_u8(s0r1), three_u8);
	      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
	                             vget_high_u8(s0r1), three_u8);
	      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
	                             vget_low_u8(s0r1), three_u8);
	      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
	                             vget_high_u8(s0r1), three_u8);
	      /* Load and compute s1colsum0 and s1colsum1. */
	      s1r0 = vld1q_u8(inptr0 + colctr);
	      s1r1 = vld1q_u8(inptr1 + colctr);
	      s1r2 = vld1q_u8(inptr2 + colctr);
	      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
	                             vget_low_u8(s1r1), three_u8);
	      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
	                             vget_high_u8(s1r1), three_u8);
	      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
	                             vget_low_u8(s1r1), three_u8);
	      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
	                             vget_high_u8(s1r1), three_u8);
	      /* Step 2: Blend the already-blended columns. */
	      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
	      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
	      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
	      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
	      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
	      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
	      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
	      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
	      /* Add ordered dithering bias to odd pixel values. */
	      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
	      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
	      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
	      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
	      /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
	      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
	                                          vshrn_n_u16(output0_p1_h, 4));
	      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
	                                          vrshrn_n_u16(output0_p2_h, 4));
	      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
	                                          vshrn_n_u16(output1_p1_h, 4));
	      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
	                                          vrshrn_n_u16(output1_p2_h, 4));
	      /* Store pixel channel values to memory. */
	      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
	      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
	    }

	    /* Case 2: last pixel channel value in this row of the original image. */
	    /* Written after the vector stores, so it replaces whatever they left */
	    /* at this position. */
	    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
	                    GETJSAMPLE(inptr0[downsampled_width - 1]);
	    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
	    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
	                    GETJSAMPLE(inptr2[downsampled_width - 1]);
	    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
	    inrow++;
	  }
	}