ByteToFloatRow_NEON to convert and scale bytes to floats

Each byte is converted to float (0.0 to 255.0) and then multiplied
by a scale parameter.

Bug: None
Test: arm 64 build passes.
Change-Id: I04736798540b8d985f60abdf0388e24a209d075b
Reviewed-on: https://chromium-review.googlesource.com/930226
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Ian Field <ianfield@google.com>
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 324bb1e..91137ba 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -498,6 +498,10 @@
                    int width,
                    int height);
 
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ac91369..b5a42d0 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -308,6 +308,7 @@
 #define HAS_ARGBTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_HALFFLOATROW_NEON
 #define HAS_I400TOARGBROW_NEON
@@ -3352,6 +3353,15 @@
                           uint16_t* dst_ptr,
                           float param,
                           int width);
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src,
+                             float* dst,
+                             float scale,
+                             int width);
 
 void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                              uint8_t* dst_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 77d7163..5eae3f7 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -3123,6 +3123,27 @@
   return 0;
 }
 
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+  void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
+                         int width) = ByteToFloatRow_C;
+  if (!src_y || !dst_y || width <= 0) {
+    return -1;
+  }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ByteToFloatRow = ByteToFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ByteToFloatRow = ByteToFloatRow_NEON;
+    }
+  }
+#endif
+
+  ByteToFloatRow(src_y, dst_y, scale, width);
+  return 0;
+}
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
 int ARGBLumaColorTable(const uint8_t* src_argb,
diff --git a/source/row_any.cc b/source/row_any.cc
index 39bc6b0..cc5914d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -807,37 +807,52 @@
 #undef ANY11C
 
 // Any 1 to 1 with parameter and shorts to byte.  BPP measures in shorts.
-#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)             \
-  void NAMEANY(const uint16_t* src_ptr, uint16_t* dst_ptr, T param, \
-               int width) {                                         \
-    SIMD_ALIGNED(uint16_t temp[32 * 2]);                            \
-    memset(temp, 0, 64); /* for msan */                             \
-    int r = width & MASK;                                           \
-    int n = width & ~MASK;                                          \
-    if (n > 0) {                                                    \
-      ANY_SIMD(src_ptr, dst_ptr, param, n);                         \
-    }                                                               \
-    memcpy(temp, src_ptr + n, r * SBPP);                            \
-    ANY_SIMD(temp, temp + 16, param, MASK + 1);                     \
-    memcpy(dst_ptr + n, temp + 16, r * BPP);                        \
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK)             \
+  void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+    SIMD_ALIGNED(ST temp[32]);                                          \
+    SIMD_ALIGNED(T out[32]);                                            \
+    memset(temp, 0, SBPP * 32); /* for msan */                          \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                             \
+    }                                                                   \
+    memcpy(temp, src_ptr + n, r * SBPP);                                \
+    ANY_SIMD(temp, out, param, MASK + 1);                               \
+    memcpy(dst_ptr + n, out, r * BPP);                                  \
   }
 
 #ifdef HAS_HALFFLOATROW_SSE2
-ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7)
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
 #endif
 #ifdef HAS_HALFFLOATROW_AVX2
-ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15)
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
 #endif
 #ifdef HAS_HALFFLOATROW_F16C
-ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15)
-ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+         HalfFloat1Row_F16C,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         15)
 #endif
 #ifdef HAS_HALFFLOATROW_NEON
-ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
-ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+         HalfFloat1Row_NEON,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         7)
 #endif
 #ifdef HAS_HALFFLOATROW_MSA
-ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
 #endif
 #undef ANY11P16
 
diff --git a/source/row_common.cc b/source/row_common.cc
index 049130a..da97821 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2774,6 +2774,14 @@
   }
 }
 
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * scale;
+    dst[i] = value;
+  }
+}
+
 void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              int width,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index b77d720..8b6c195 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2659,6 +2659,32 @@
       : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "vdup.32    q0, %3                         \n"
+
+      "1:                                        \n"
+      "vld1.8     {d2}, [%0]!                    \n"  // load 8 bytes
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u8   q1, d2                         \n"  // 8 shorts
+      "vmovl.u16  q2, d2                         \n"  // 8 ints
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, d0[0]                  \n"  // scale
+      "vmul.f32   q3, q3, d0[0]                  \n"
+      "vst1.8     {q2, q3}, [%1]!                \n"  // store 8 floats
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "r"(scale)   // %3
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index f6cf132..24b4520 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2700,6 +2700,30 @@
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.8b}, [%0], #8              \n"  // load 8 bytes
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v1.8h, v1.8b                   \n"  // 8 shorts
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 ints
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
+      "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+      "st1        {v2.16b, v3.16b}, [%1], #32    \n"  // store 8 floats
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
 float ScaleMaxSamples_NEON(const float* src,
                            float* dst,
                            float scale,
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 9f95941..7560895 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2168,6 +2168,52 @@
   EXPECT_LE(diff, 1);
 }
 
+float TestByteToFloat(int benchmark_width,
+                      int benchmark_height,
+                      int benchmark_iterations,
+                      int disable_cpu_flags,
+                      int benchmark_cpu_info,
+                      float scale) {
+  int i, j;
+  const int y_plane_size = benchmark_width * benchmark_height;
+
+  align_buffer_page_end(orig_y, y_plane_size * (1 + 4 + 4));
+  float* dst_opt = reinterpret_cast<float*>(orig_y + y_plane_size);
+  float* dst_c = reinterpret_cast<float*>(orig_y + y_plane_size * 5);
+
+  MemRandomize(orig_y, y_plane_size);
+  memset(dst_c, 0, y_plane_size * 4);
+  memset(dst_opt, 1, y_plane_size * 4);
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags);
+  ByteToFloat(orig_y, dst_c, scale, y_plane_size);
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info);
+  for (j = 0; j < benchmark_iterations; j++) {
+    ByteToFloat(orig_y, dst_opt, scale, y_plane_size);
+  }
+
+  float max_diff = 0;
+  for (i = 0; i < y_plane_size; ++i) {
+    float abs_diff = fabs(dst_c[i] - dst_opt[i]);
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestByteToFloat) {
+  float diff = TestByteToFloat(benchmark_width_, benchmark_height_,
+                               benchmark_iterations_, disable_cpu_flags_,
+                               benchmark_cpu_info_, 1.0f);
+  EXPECT_EQ(0.f, diff);
+}
+
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
   SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
   SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);