Add Arm NEON implementation of h1v2_fancy_upsample

Adds an Arm NEON intrinsics implementation of h1v2_fancy_upsample.
There was no previous NEON assembly implementation for either AArch32
or AArch64.

Bug: 922430
Change-Id: Ifd10626083a48bd59a048a8de3ec8132b91d4bd5
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c
index 28012c1..264876e 100644
--- a/simd/arm/arm/jsimd.c
+++ b/simd/arm/arm/jsimd.c
@@ -380,6 +380,17 @@
 GLOBAL(int)
 jsimd_can_h1v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -405,6 +416,9 @@
 jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
diff --git a/simd/arm/arm64/jsimd.c b/simd/arm/arm64/jsimd.c
index 7a472fc..a7a2547 100644
--- a/simd/arm/arm64/jsimd.c
+++ b/simd/arm/arm64/jsimd.c
@@ -444,6 +444,17 @@
 GLOBAL(int)
 jsimd_can_h1v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -466,6 +477,9 @@
 jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c
index c6bdac1..d5a953c 100644
--- a/simd/arm/common/jdsample-neon.c
+++ b/simd/arm/common/jdsample-neon.c
@@ -251,3 +251,99 @@
     inrow++;
   }
 }
+
+
+/*
+ * The diagram below shows a grid-window of samples (luma or chroma) produced
+ * by h2v1 downsampling; which has been subsequently rotated 90 degrees. (The
+ * usual use of h1v2 upsampling is upsampling rotated or transposed h2v1
+ * downsampled images.)
+ *
+ *                  s0        s1
+ *             +---------+---------+
+ *             |    p0   |    p1   |
+ *     r0      |         |         |
+ *             |    p2   |    p3   |
+ *             +---------+---------+
+ *             |    p4   |    p5   |
+ *     r1      |         |         |
+ *             |    p6   |    p7   |
+ *             +---------+---------+
+ *             |    p8   |    p9   |
+ *     r2      |         |         |
+ *             |    p10  |    p11  |
+ *             +---------+---------+
+ *
+ * Every sample contains two of the original pixel channel values. The pixels'
+ * channel values are centred at positions p0, p1, p2,..., p11 above. For a
+ * given grid-window position, r1 is always used to denote the row of samples
+ * containing the pixel channel values we are computing. For the top row of
+ * pixel channel values in r1 (p4 and p5), the nearest neighbouring samples are
+ * in the row above - denoted by r0. Likewise, for the bottom row of pixels in
+ * r1 (p6 and p7), the nearest neighbouring samples are in the row below -
+ * denoted by r2.
+ *
+ * To compute the pixel channel values of the original image, we proportionally
+ * blend the adjacent samples in each column.
+ *
+ * For example, the pixel channel value centred at p4 would be computed as
+ * follows:
+ *     3/4 * s0r1 + 1/4 * s0r0
+ * while the pixel channel value centred at p6 would be:
+ *     3/4 * s0r1 + 1/4 * s0r2
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  /* Setup constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
+    /* respectively. */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32 */
+    /* bytes => no need to worry about buffer overflow when reading/writing */
+    /* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
+    for (unsigned colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t r0 = vld1q_u8(inptr0 + colctr);
+      uint8x16_t r1 = vld1q_u8(inptr1 + colctr);
+      uint8x16_t r2 = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(r0)),
+                                      vget_low_u8(r1), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(r0)),
+                                      vget_high_u8(r1), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(r2)),
+                                      vget_low_u8(r1), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(r2)),
+                                      vget_high_u8(r1), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel channel values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 7c1e9e7..d20084b 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -611,6 +611,9 @@
 EXTERN(void) jsimd_h2v2_fancy_upsample_neon
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,