// sharpyuv/sharpyuv.c - webm/libwebp - Git at Google

 // Copyright 2022 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Sharp RGB to YUV conversion.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #include "sharpyuv/sharpyuv.h"

 #include <assert.h>
 #include <limits.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>

 #include "src/webp/types.h"
 #include "src/dsp/cpu.h"
 #include "sharpyuv/sharpyuv_dsp.h"
 #include "sharpyuv/sharpyuv_gamma.h"

 //------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion

 // Upper bound on the number of refinement passes run by DoSharpArgbToYuv()
 // (the loop may exit earlier).
 static const int kNumIterations = 4;

 #define YUV_FIX 16  // fixed-point precision for RGB->YUV
 // Rounding term added before shifting a YUV_FIX fixed-point value down.
 static const int kYuvHalf = 1 << (YUV_FIX - 1);

 // Max bit depth so that intermediate calculations fit in 16 bits.
 static const int kMaxBitDepth = 14;

 // Returns the number of bits of precision to add to (positive value) or
 // remove from (negative value) input samples of depth 'rgb_bit_depth'.
 static int GetPrecisionShift(int rgb_bit_depth) {
   // Two extra bits are used when the result still fits in kMaxBitDepth.
   if (rgb_bit_depth + 2 <= kMaxBitDepth) {
     return 2;
   }
   // Otherwise use whatever shift lands exactly on kMaxBitDepth.
   return kMaxBitDepth - rgb_bit_depth;
 }

 // Sample types used by the working buffers of the conversion below.
 typedef int16_t fixed_t;      // signed type with extra precision for UV
 typedef uint16_t fixed_y_t;   // unsigned type with extra precision for W

 //------------------------------------------------------------------------------

 // Saturates 'v' to the [0, 255] range.
 // The argument is a plain 'int' because callers pass the 'int' result of
 // RGBToYUVComponent(): narrowing it to a 16-bit type first could wrap a large
 // out-of-range value before it gets clamped.
 static uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
 }

 // Saturates 'v' to the [0, max] range.
 // 'v' is taken as a plain 'int' (what callers actually pass) so that values
 // outside the 16-bit range are clamped rather than wrapped by a narrowing
 // conversion.
 static uint16_t clip(int v, int max) {
   if (v < 0) return 0;
   if (v > max) return (uint16_t)max;
   return (uint16_t)v;
 }

 static fixed_y_t clip_bit_depth(int y, int bit_depth) {
   const int max = (1 << bit_depth) - 1;
   return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
 }

 //------------------------------------------------------------------------------

 // Returns (13933 * r + 46871 * g + 4732 * b + kYuvHalf) >> YUV_FIX, i.e. a
 // weighted sum of the three components in YUV_FIX fixed-point, rounded.
 static int RGBToGray(int64_t r, int64_t g, int64_t b) {
   int64_t sum = 13933 * r;
   sum += 46871 * g;
   sum += 4732 * b;
   sum += kYuvHalf;   // rounding
   return (int)(sum >> YUV_FIX);
 }

 // Converts the four samples with SharpYuvGammaToLinear(), averages them (with
 // rounding) and converts the average back with SharpYuvLinearToGamma().
 // The bit depth passed to both helpers is rgb_bit_depth plus the precision
 // shift.
 static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                           int rgb_bit_depth) {
   const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
   const uint32_t lin_a = SharpYuvGammaToLinear(a, bit_depth);
   const uint32_t lin_b = SharpYuvGammaToLinear(b, bit_depth);
   const uint32_t lin_c = SharpYuvGammaToLinear(c, bit_depth);
   const uint32_t lin_d = SharpYuvGammaToLinear(d, bit_depth);
   const uint32_t average = (lin_a + lin_b + lin_c + lin_d + 2) >> 2;
   return SharpYuvLinearToGamma(average, bit_depth);
 }

 // For each of the 'w' columns, reads one sample from each of the three
 // consecutive planes stored in 'src' (each 'w' samples wide), converts them
 // with SharpYuvGammaToLinear(), reduces them with RGBToGray() and stores the
 // SharpYuvLinearToGamma() of the result in 'dst'.
 static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
                                 int rgb_bit_depth) {
   const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
   int x = 0;
   while (x < w) {
     const uint32_t lin_r = SharpYuvGammaToLinear(src[x + 0 * w], bit_depth);
     const uint32_t lin_g = SharpYuvGammaToLinear(src[x + 1 * w], bit_depth);
     const uint32_t lin_b = SharpYuvGammaToLinear(src[x + 2 * w], bit_depth);
     const uint32_t gray = RGBToGray(lin_r, lin_g, lin_b);
     dst[x] = (fixed_y_t)SharpYuvLinearToGamma(gray, bit_depth);
     ++x;
   }
 }

 // Produces 'uv_w' output triplets from two input rows. Each input row holds
 // three planes of 2 * uv_w samples. For every 2x2 group of samples, each
 // plane is reduced with ScaleDown(), and the three results minus their
 // RGBToGray() value are stored in the three 'uv_w'-wide planes of 'dst'.
 static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
                          fixed_t* dst, int uv_w, int rgb_bit_depth) {
   const int plane = 2 * uv_w;   // distance between two input planes
   int i;
   for (i = 0; i < uv_w; ++i) {
     const int x = 2 * i;        // left column of the 2x2 group
     const int r = ScaleDown(src1[x], src1[x + 1],
                             src2[x], src2[x + 1], rgb_bit_depth);
     const int g = ScaleDown(src1[plane + x], src1[plane + x + 1],
                             src2[plane + x], src2[plane + x + 1],
                             rgb_bit_depth);
     const int b = ScaleDown(src1[2 * plane + x], src1[2 * plane + x + 1],
                             src2[2 * plane + x], src2[2 * plane + x + 1],
                             rgb_bit_depth);
     const int W = RGBToGray(r, g, b);
     dst[i + 0 * uv_w] = (fixed_t)(r - W);
     dst[i + 1 * uv_w] = (fixed_t)(g - W);
     dst[i + 2 * uv_w] = (fixed_t)(b - W);
   }
 }

 // Stores in y[0..w-1] the RGBToGray() value of each column of 'rgb', which
 // holds three consecutive planes of 'w' samples.
 static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
   int x = 0;
   assert(w > 0);
   while (x < w) {
     const int gray = RGBToGray(rgb[x], rgb[w + x], rgb[2 * w + x]);
     y[x] = (fixed_y_t)gray;
     ++x;
   }
 }

 //------------------------------------------------------------------------------

 // Returns W0 + ((3 * A + B + 2) >> 2), clamped to 'bit_depth' bits.
 static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
   const int blended = (3 * A + B + 2) >> 2;
   return clip_bit_depth(W0 + blended, bit_depth);
 }

 //------------------------------------------------------------------------------

 // Scales 'v' by 2^shift: up when 'shift' is positive, down (right shift) when
 // it is negative.
 static WEBP_INLINE int Shift(int v, int shift) {
   // The upscale is written as a multiplication by a power of two: it gives the
   // same result as 'v << shift' for non-negative 'v', and stays well-defined
   // for negative 'v' (left-shifting a negative value is undefined in C).
   return (shift >= 0) ? (v * (1 << shift)) : (v >> -shift);
 }

 // Reads 'pic_width' pixels from the r/g/b pointers and stores them, shifted by
 // GetPrecisionShift(rgb_bit_depth), in the three consecutive planes of 'dst'.
 // Each plane is 'w' samples wide, where 'w' is 'pic_width' rounded up to an
 // even value; when 'pic_width' is odd the last pixel is duplicated.
 // Samples are read as uint8_t when rgb_bit_depth is 8, as uint16_t otherwise.
 static void ImportOneRow(const uint8_t* const r_ptr,
                          const uint8_t* const g_ptr,
                          const uint8_t* const b_ptr,
                          int rgb_step,
                          int rgb_bit_depth,
                          int pic_width,
                          fixed_y_t* const dst) {
   // Convert the rgb_step from a number of bytes to a number of uint8_t or
   // uint16_t values depending the bit depth.
   const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
   // The shift only depends on the bit depth: compute it once per row.
   const int shift = GetPrecisionShift(rgb_bit_depth);
   const int w = (pic_width + 1) & ~1;
   int i;
   if (rgb_bit_depth == 8) {
     for (i = 0; i < pic_width; ++i) {
       const int off = i * step;
       dst[i + 0 * w] = (fixed_y_t)Shift(r_ptr[off], shift);
       dst[i + 1 * w] = (fixed_y_t)Shift(g_ptr[off], shift);
       dst[i + 2 * w] = (fixed_y_t)Shift(b_ptr[off], shift);
     }
   } else {
     // The input is read-only: keep the 'const' qualifier when reinterpreting
     // the buffers as 16-bit samples.
     const uint16_t* const r16 = (const uint16_t*)r_ptr;
     const uint16_t* const g16 = (const uint16_t*)g_ptr;
     const uint16_t* const b16 = (const uint16_t*)b_ptr;
     for (i = 0; i < pic_width; ++i) {
       const int off = i * step;
       dst[i + 0 * w] = (fixed_y_t)Shift(r16[off], shift);
       dst[i + 1 * w] = (fixed_y_t)Shift(g16[off], shift);
       dst[i + 2 * w] = (fixed_y_t)Shift(b16[off], shift);
     }
   }
   if (pic_width & 1) {  // replicate rightmost pixel
     dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
     dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
     dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
   }
 }

 // Fills two output rows ('out1' and 'out2', three planes of 'w' samples each)
 // from two rows of 'best_y' (at offsets 0 and 'w') and three rows of
 // subsampled data: 'cur_uv' is combined with 'prev_uv' for 'out1' and with
 // 'next_uv' for 'out2'. The first and last columns go through Filter2(), the
 // ones in between through SharpYuvFilterRow().
 static void InterpolateTwoRows(const fixed_y_t* const best_y,
                                const fixed_t* prev_uv,
                                const fixed_t* cur_uv,
                                const fixed_t* next_uv,
                                int w,
                                fixed_y_t* out1,
                                fixed_y_t* out2,
                                int rgb_bit_depth) {
   const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
   const int uv_w = w >> 1;
   const int len = (w - 1) >> 1;   // length to filter
   const int last = w - 1;         // last column of the output rows
   const int uv_last = uv_w - 1;   // last column of the subsampled rows
   const fixed_y_t* const y_top = best_y;
   const fixed_y_t* const y_bottom = best_y + w;
   int channel;
   // process each R/G/B segments in turn
   for (channel = 0; channel < 3; ++channel) {
     // special boundary case for i==0
     out1[0] = Filter2(cur_uv[0], prev_uv[0], y_top[0], bit_depth);
     out2[0] = Filter2(cur_uv[0], next_uv[0], y_bottom[0], bit_depth);

     SharpYuvFilterRow(cur_uv, prev_uv, len, y_top + 1, out1 + 1, bit_depth);
     SharpYuvFilterRow(cur_uv, next_uv, len, y_bottom + 1, out2 + 1,
                       bit_depth);

     // special boundary case for i == w - 1 when w is even
     if ((w & 1) == 0) {
       out1[last] = Filter2(cur_uv[uv_last], prev_uv[uv_last], y_top[last],
                            bit_depth);
       out2[last] = Filter2(cur_uv[uv_last], next_uv[uv_last], y_bottom[last],
                            bit_depth);
     }
     // move on to the next plane
     out1 += w;
     out2 += w;
     prev_uv += uv_w;
     cur_uv  += uv_w;
     next_uv += uv_w;
   }
 }

 // Returns (coeffs[0] * r + coeffs[1] * g + coeffs[2] * b + coeffs[3]),
 // rounded and shifted right by (YUV_FIX + sfix) bits.
 static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
                                          const int coeffs[4], int sfix) {
   const int total_shift = YUV_FIX + sfix;
   const int rounder = 1 << (total_shift - 1);
   int acc = coeffs[0] * r;
   acc += coeffs[1] * g;
   acc += coeffs[2] * b;
   acc += coeffs[3];
   acc += rounder;
   return acc >> total_shift;
 }

 // Writes the final Y, U and V planes from the working buffers.
 // 'best_y' holds one W value per pixel (rows of 'w' samples) and 'best_uv'
 // holds, for every pair of rows, three planes of 'uv_w' samples. Output
 // samples are stored as uint8_t when yuv_bit_depth <= 8 and as uint16_t
 // otherwise; strides are applied to the byte pointers. Always returns 1.
 static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
                             uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
                             int u_stride, uint8_t* v_ptr, int v_stride,
                             int rgb_bit_depth,
                             int yuv_bit_depth, int width, int height,
                             const SharpYuvConversionMatrix* yuv_matrix) {
   const int w = (width + 1) & ~1;
   const int h = (height + 1) & ~1;
   const int uv_w = w >> 1;
   const int uv_h = h >> 1;
   const int sfix = GetPrecisionShift(rgb_bit_depth);
   const int yuv_max = (1 << yuv_bit_depth) - 1;
   const int is_8b = (yuv_bit_depth <= 8);
   const fixed_t* uv_row;
   int x, row;

   // Luma plane: one output sample per input pixel.
   uv_row = best_uv;
   for (row = 0; row < height; ++row) {
     for (x = 0; x < width; ++x) {
       const int uv_x = x >> 1;
       const int W = best_y[x];
       const int r = uv_row[uv_x + 0 * uv_w] + W;
       const int g = uv_row[uv_x + 1 * uv_w] + W;
       const int b = uv_row[uv_x + 2 * uv_w] + W;
       const int luma =
           RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
       if (is_8b) {
         y_ptr[x] = clip_8b(luma);
       } else {
         ((uint16_t*)y_ptr)[x] = clip(luma, yuv_max);
       }
     }
     best_y += w;
     // The subsampled row only advances after every odd luma row.
     if (row & 1) uv_row += 3 * uv_w;
     y_ptr += y_stride;
   }

   // Chroma planes: one output sample per subsampled position.
   uv_row = best_uv;
   for (row = 0; row < uv_h; ++row) {
     for (x = 0; x < uv_w; ++x) {
       // Note r, g and b values here are off by W, but a constant offset on all
       // 3 components doesn't change the value of u and v with a YCbCr matrix.
       const int r = uv_row[x + 0 * uv_w];
       const int g = uv_row[x + 1 * uv_w];
       const int b = uv_row[x + 2 * uv_w];
       const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
       const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
       if (is_8b) {
         u_ptr[x] = clip_8b(u);
         v_ptr[x] = clip_8b(v);
       } else {
         ((uint16_t*)u_ptr)[x] = clip(u, yuv_max);
         ((uint16_t*)v_ptr)[x] = clip(v, yuv_max);
       }
     }
     uv_row += 3 * uv_w;
     u_ptr += u_stride;
     v_ptr += v_stride;
   }
   return 1;
 }

 //------------------------------------------------------------------------------
 // Main function

 // Allocates 'nmemb' elements of 'size' bytes each.
 // Returns NULL if the total byte count overflows 64 bits, does not fit in
 // size_t, or if malloc() fails. The returned memory must be released with
 // free().
 static void* SafeMalloc(uint64_t nmemb, size_t size) {
   uint64_t total_size;
   // Reject the request if nmemb * size would wrap around in 64 bits: the
   // wrapped value could otherwise pass the size_t check below.
   if (size != 0 && nmemb > (~(uint64_t)0) / size) return NULL;
   total_size = nmemb * (uint64_t)size;
   if (total_size != (size_t)total_size) return NULL;
   return malloc((size_t)total_size);
 }

 // Allocates an array of (W * H) elements of type T with SafeMalloc().
 // The element count is computed in 64 bits so that large dimensions cannot
 // overflow 'int' before SafeMalloc() gets a chance to check the size.
 #define SAFE_ALLOC(W, H, T) \
   ((T*)SafeMalloc((uint64_t)(W) * (uint64_t)(H), sizeof(T)))

 // Runs the whole conversion for a width x height picture:
 //  1. imports the RGB samples two rows at a time into the working buffers,
 //  2. runs up to kNumIterations refinement passes over these buffers,
 //  3. writes the Y/U/V planes with ConvertWRGBToYUV().
 // Returns 0 if one of the working buffers could not be allocated, otherwise
 // the result of ConvertWRGBToYUV().
 static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
                             const uint8_t* b_ptr, int rgb_step, int rgb_stride,
                             int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
                             uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
                             int v_stride, int yuv_bit_depth, int width,
                             int height,
                             const SharpYuvConversionMatrix* yuv_matrix) {
   // we expand the right/bottom border if needed
   const int w = (width + 1) & ~1;
   const int h = (height + 1) & ~1;
   const int uv_w = w >> 1;
   const int uv_h = h >> 1;
   uint64_t prev_diff_y_sum = ~(uint64_t)0;
   int j, iter;

   // TODO(skal): allocate one big memory chunk. But for now, it's easier
   // for valgrind debugging to have several chunks.
   // The first dimension is widened to 64 bits so that element counts such as
   // w * h are never computed (and possibly overflowed) in 'int'.
   fixed_y_t* const tmp_buffer =
       SAFE_ALLOC((uint64_t)w * 3, 2, fixed_y_t);   // scratch
   fixed_y_t* const best_y_base = SAFE_ALLOC((uint64_t)w, h, fixed_y_t);
   fixed_y_t* const target_y_base = SAFE_ALLOC((uint64_t)w, h, fixed_y_t);
   fixed_y_t* const best_rgb_y = SAFE_ALLOC((uint64_t)w, 2, fixed_y_t);
   fixed_t* const best_uv_base = SAFE_ALLOC((uint64_t)uv_w * 3, uv_h, fixed_t);
   fixed_t* const target_uv_base =
       SAFE_ALLOC((uint64_t)uv_w * 3, uv_h, fixed_t);
   fixed_t* const best_rgb_uv = SAFE_ALLOC((uint64_t)uv_w * 3, 1, fixed_t);
   fixed_y_t* best_y = best_y_base;
   fixed_y_t* target_y = target_y_base;
   fixed_t* best_uv = best_uv_base;
   fixed_t* target_uv = target_uv_base;
   const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
   int ok;
   assert(w > 0);
   assert(h > 0);

   if (best_y_base == NULL || best_uv_base == NULL ||
       target_y_base == NULL || target_uv_base == NULL ||
       best_rgb_y == NULL || best_rgb_uv == NULL ||
       tmp_buffer == NULL) {
     ok = 0;
     goto End;
   }

   // Import RGB samples to W/RGB representation.
   for (j = 0; j < height; j += 2) {
     const int is_last_row = (j == height - 1);
     fixed_y_t* const src1 = tmp_buffer + 0 * w;
     fixed_y_t* const src2 = tmp_buffer + 3 * w;

     // prepare two rows of input
     ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
                  src1);
     if (!is_last_row) {
       ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
                    rgb_step, rgb_bit_depth, width, src2);
     } else {
       // odd height: the missing bottom row is a copy of the last one
       memcpy(src2, src1, (size_t)3 * w * sizeof(*src2));
     }
     StoreGray(src1, best_y + 0, w);
     StoreGray(src2, best_y + w, w);

     UpdateW(src1, target_y, w, rgb_bit_depth);
     UpdateW(src2, target_y + w, w, rgb_bit_depth);
     UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth);
     // the first estimate of the subsampled data is the target itself
     memcpy(best_uv, target_uv, (size_t)3 * uv_w * sizeof(*best_uv));
     best_y += 2 * w;
     best_uv += 3 * uv_w;
     target_y += 2 * w;
     target_uv += 3 * uv_w;
     r_ptr += 2 * rgb_stride;
     g_ptr += 2 * rgb_stride;
     b_ptr += 2 * rgb_stride;
   }

   // Iterate and resolve clipping conflicts.
   for (iter = 0; iter < kNumIterations; ++iter) {
     const fixed_t* cur_uv = best_uv_base;
     const fixed_t* prev_uv = best_uv_base;
     uint64_t diff_y_sum = 0;

     best_y = best_y_base;
     best_uv = best_uv_base;
     target_y = target_y_base;
     target_uv = target_uv_base;
     for (j = 0; j < h; j += 2) {
       fixed_y_t* const src1 = tmp_buffer + 0 * w;
       fixed_y_t* const src2 = tmp_buffer + 3 * w;
       {
         // on the last pair of rows, 'next_uv' is the current row itself
         const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
         InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
                            src1, src2, rgb_bit_depth);
         prev_uv = cur_uv;
         cur_uv = next_uv;
       }

       UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth);
       UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth);
       UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth);

       // update two rows of Y and one row of RGB
       diff_y_sum +=
           SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
                           rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
       SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);

       best_y += 2 * w;
       best_uv += 3 * uv_w;
       target_y += 2 * w;
       target_uv += 3 * uv_w;
     }
     // test exit condition: stop once the accumulated difference is small
     // enough, or as soon as it gets larger than on the previous pass.
     if (iter > 0) {
       if (diff_y_sum < diff_y_threshold) break;
       if (diff_y_sum > prev_diff_y_sum) break;
     }
     prev_diff_y_sum = diff_y_sum;
   }

   // final reconstruction
   ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
                         u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
                         width, height, yuv_matrix);

  End:
   free(best_y_base);
   free(best_uv_base);
   free(target_y_base);
   free(target_uv_base);
   free(best_rgb_y);
   free(best_rgb_uv);
   free(tmp_buffer);
   return ok;
 }
 #undef SAFE_ALLOC

 // Hidden exported init function.
 // By default SharpYuvConvert calls it with NULL. If needed, users can declare
 // it as extern and call it with a VP8CPUInfo function.
 // SharpYuvInitDsp() is called whenever 'cpu_info_func' differs from the value
 // recorded by the previous call; SharpYuvInitGammaTables() only on the very
 // first call.
 extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
 void SharpYuvInit(VP8CPUInfo cpu_info_func) {
   // Initially holds its own address, which serves as a "never set" marker.
   static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
       (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
   const VP8CPUInfo never_set = (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
   const int already_initialized = (sharpyuv_last_cpuinfo_used != never_set);

   // NULL means "use the defaults": nothing to redo once initialized.
   if (already_initialized && cpu_info_func == NULL) return;
   // Same function as last time: nothing to redo either.
   if (cpu_info_func == sharpyuv_last_cpuinfo_used) return;

   SharpYuvInitDsp(cpu_info_func);
   if (!already_initialized) {
     SharpYuvInitGammaTables();
   }

   sharpyuv_last_cpuinfo_used = cpu_info_func;
 }

 // Public entry point: validates the arguments, builds a conversion matrix
 // scaled for the requested bit depths and runs DoSharpArgbToYuv().
 //
 // Returns 0 if an argument is rejected:
 //  - width or height is < 1 or equal to INT_MAX,
 //  - any of the sample pointers or 'yuv_matrix' is NULL,
 //  - rgb_bit_depth is not 8, 10, 12 or 16,
 //  - yuv_bit_depth is not 8, 10 or 12,
 //  - a step/stride is odd while the matching buffer holds uint16_t samples.
 // Otherwise returns the result of DoSharpArgbToYuv().
 int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
                     const void* b_ptr, int rgb_step, int rgb_stride,
                     int rgb_bit_depth, void* y_ptr, int y_stride,
                     void* u_ptr, int u_stride, void* v_ptr,
                     int v_stride, int yuv_bit_depth, int width,
                     int height, const SharpYuvConversionMatrix* yuv_matrix) {
   SharpYuvConversionMatrix scaled_matrix;
   int sfix;

   // INT_MAX is rejected because dimensions get rounded up to an even value.
   if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
       r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
       u_ptr == NULL || v_ptr == NULL || yuv_matrix == NULL) {
     return 0;
   }
   if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
       rgb_bit_depth != 16) {
     return 0;
   }
   if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
     return 0;
   }
   if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
     // Step/stride should be even for uint16_t buffers.
     return 0;
   }
   if (yuv_bit_depth > 8 &&
       (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
     // Stride should be even for uint16_t buffers.
     return 0;
   }
   SharpYuvInit(NULL);

   // Quantities derived from the bit depths are only computed from here on,
   // once the depths are known to be valid shift counts.
   sfix = GetPrecisionShift(rgb_bit_depth);

   // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
   // rgb->yuv conversion matrix.
   if (rgb_bit_depth == yuv_bit_depth) {
     memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
   } else {
     const int rgb_max = (1 << rgb_bit_depth) - 1;
     const int rgb_round = 1 << (rgb_bit_depth - 1);
     const int yuv_max = (1 << yuv_bit_depth) - 1;
     int i;
     for (i = 0; i < 3; ++i) {
       scaled_matrix.rgb_to_y[i] =
           (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
       scaled_matrix.rgb_to_u[i] =
           (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
       scaled_matrix.rgb_to_v[i] =
           (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
     }
   }
   // Also incorporate precision change scaling.
   scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
   scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
   scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);

   return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
                           rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
                           v_ptr, v_stride, yuv_bit_depth, width, height,
                           &scaled_matrix);
 }

 //------------------------------------------------------------------------------
	// Copyright 2022 Google Inc. All Rights Reserved.
	//
	// Use of this source code is governed by a BSD-style license
	// that can be found in the COPYING file in the root of the source
	// tree. An additional intellectual property rights grant can be found
	// in the file PATENTS. All contributing project authors may
	// be found in the AUTHORS file in the root of the source tree.
	// -----------------------------------------------------------------------------
	//
	// Sharp RGB to YUV conversion.
	//
	// Author: Skal (pascal.massimino@gmail.com)

	#include "sharpyuv/sharpyuv.h"

	#include <assert.h>
	#include <limits.h>
	#include <math.h>
	#include <stdlib.h>
	#include <string.h>

	#include "src/webp/types.h"
	#include "src/dsp/cpu.h"
	#include "sharpyuv/sharpyuv_dsp.h"
	#include "sharpyuv/sharpyuv_gamma.h"

	//------------------------------------------------------------------------------
	// Sharp RGB->YUV conversion

	// Upper bound on the number of refinement passes run by DoSharpArgbToYuv()
	// (the loop may exit earlier).
	static const int kNumIterations = 4;

	#define YUV_FIX 16 // fixed-point precision for RGB->YUV
	// Rounding term added before shifting a YUV_FIX fixed-point value down.
	static const int kYuvHalf = 1 << (YUV_FIX - 1);

	// Max bit depth so that intermediate calculations fit in 16 bits.
	static const int kMaxBitDepth = 14;

	// Returns the number of bits of precision to add to (positive value) or
	// remove from (negative value) input samples of depth 'rgb_bit_depth'.
	static int GetPrecisionShift(int rgb_bit_depth) {
	  // Two extra bits are used when the result still fits in kMaxBitDepth.
	  if (rgb_bit_depth + 2 <= kMaxBitDepth) {
	    return 2;
	  }
	  // Otherwise use whatever shift lands exactly on kMaxBitDepth.
	  return kMaxBitDepth - rgb_bit_depth;
	}

	// Sample types used by the working buffers of the conversion below.
	typedef int16_t fixed_t; // signed type with extra precision for UV
	typedef uint16_t fixed_y_t; // unsigned type with extra precision for W

	//------------------------------------------------------------------------------

	// Saturates 'v' to the [0, 255] range.
	// The argument is a plain 'int' because callers pass the 'int' result of
	// RGBToYUVComponent(): narrowing it to a 16-bit type first could wrap a large
	// out-of-range value before it gets clamped.
	static uint8_t clip_8b(int v) {
	  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
	}

	// Saturates 'v' to the [0, max] range.
	// 'v' is taken as a plain 'int' (what callers actually pass) so that values
	// outside the 16-bit range are clamped rather than wrapped by a narrowing
	// conversion.
	static uint16_t clip(int v, int max) {
	  if (v < 0) return 0;
	  if (v > max) return (uint16_t)max;
	  return (uint16_t)v;
	}

	static fixed_y_t clip_bit_depth(int y, int bit_depth) {
	const int max = (1 << bit_depth) - 1;
	return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
	}

	//------------------------------------------------------------------------------

	// Returns (13933 * r + 46871 * g + 4732 * b + kYuvHalf) >> YUV_FIX, i.e. a
	// weighted sum of the three components in YUV_FIX fixed-point, rounded.
	static int RGBToGray(int64_t r, int64_t g, int64_t b) {
	  int64_t sum = 13933 * r;
	  sum += 46871 * g;
	  sum += 4732 * b;
	  sum += kYuvHalf;   // rounding
	  return (int)(sum >> YUV_FIX);
	}

	// Converts the four samples with SharpYuvGammaToLinear(), averages them (with
	// rounding) and converts the average back with SharpYuvLinearToGamma().
	// The bit depth passed to both helpers is rgb_bit_depth plus the precision
	// shift.
	static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
	                          int rgb_bit_depth) {
	  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
	  const uint32_t lin_a = SharpYuvGammaToLinear(a, bit_depth);
	  const uint32_t lin_b = SharpYuvGammaToLinear(b, bit_depth);
	  const uint32_t lin_c = SharpYuvGammaToLinear(c, bit_depth);
	  const uint32_t lin_d = SharpYuvGammaToLinear(d, bit_depth);
	  const uint32_t average = (lin_a + lin_b + lin_c + lin_d + 2) >> 2;
	  return SharpYuvLinearToGamma(average, bit_depth);
	}

	// For each of the 'w' columns, reads one sample from each of the three
	// consecutive planes stored in 'src' (each 'w' samples wide), converts them
	// with SharpYuvGammaToLinear(), reduces them with RGBToGray() and stores the
	// SharpYuvLinearToGamma() of the result in 'dst'.
	static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
	                                int rgb_bit_depth) {
	  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
	  int x = 0;
	  while (x < w) {
	    const uint32_t lin_r = SharpYuvGammaToLinear(src[x + 0 * w], bit_depth);
	    const uint32_t lin_g = SharpYuvGammaToLinear(src[x + 1 * w], bit_depth);
	    const uint32_t lin_b = SharpYuvGammaToLinear(src[x + 2 * w], bit_depth);
	    const uint32_t gray = RGBToGray(lin_r, lin_g, lin_b);
	    dst[x] = (fixed_y_t)SharpYuvLinearToGamma(gray, bit_depth);
	    ++x;
	  }
	}

	// Produces 'uv_w' output triplets from two input rows. Each input row holds
	// three planes of 2 * uv_w samples. For every 2x2 group of samples, each
	// plane is reduced with ScaleDown(), and the three results minus their
	// RGBToGray() value are stored in the three 'uv_w'-wide planes of 'dst'.
	static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
	                         fixed_t* dst, int uv_w, int rgb_bit_depth) {
	  const int plane = 2 * uv_w;   // distance between two input planes
	  int i;
	  for (i = 0; i < uv_w; ++i) {
	    const int x = 2 * i;        // left column of the 2x2 group
	    const int r = ScaleDown(src1[x], src1[x + 1],
	                            src2[x], src2[x + 1], rgb_bit_depth);
	    const int g = ScaleDown(src1[plane + x], src1[plane + x + 1],
	                            src2[plane + x], src2[plane + x + 1],
	                            rgb_bit_depth);
	    const int b = ScaleDown(src1[2 * plane + x], src1[2 * plane + x + 1],
	                            src2[2 * plane + x], src2[2 * plane + x + 1],
	                            rgb_bit_depth);
	    const int W = RGBToGray(r, g, b);
	    dst[i + 0 * uv_w] = (fixed_t)(r - W);
	    dst[i + 1 * uv_w] = (fixed_t)(g - W);
	    dst[i + 2 * uv_w] = (fixed_t)(b - W);
	  }
	}

	// Stores in y[0..w-1] the RGBToGray() value of each column of 'rgb', which
	// holds three consecutive planes of 'w' samples.
	static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
	  int x = 0;
	  assert(w > 0);
	  while (x < w) {
	    const int gray = RGBToGray(rgb[x], rgb[w + x], rgb[2 * w + x]);
	    y[x] = (fixed_y_t)gray;
	    ++x;
	  }
	}

	//------------------------------------------------------------------------------

	// Returns W0 + ((3 * A + B + 2) >> 2), clamped to 'bit_depth' bits.
	static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
	  const int blended = (3 * A + B + 2) >> 2;
	  return clip_bit_depth(W0 + blended, bit_depth);
	}

	//------------------------------------------------------------------------------

	// Scales 'v' by 2^shift: up when 'shift' is positive, down (right shift) when
	// it is negative.
	static WEBP_INLINE int Shift(int v, int shift) {
	  // The upscale is written as a multiplication by a power of two: it gives the
	  // same result as 'v << shift' for non-negative 'v', and stays well-defined
	  // for negative 'v' (left-shifting a negative value is undefined in C).
	  return (shift >= 0) ? (v * (1 << shift)) : (v >> -shift);
	}

	// Reads 'pic_width' pixels from the r/g/b pointers and stores them, shifted by
	// GetPrecisionShift(rgb_bit_depth), in the three consecutive planes of 'dst'.
	// Each plane is 'w' samples wide, where 'w' is 'pic_width' rounded up to an
	// even value; when 'pic_width' is odd the last pixel is duplicated.
	// Samples are read as uint8_t when rgb_bit_depth is 8, as uint16_t otherwise.
	static void ImportOneRow(const uint8_t* const r_ptr,
	                         const uint8_t* const g_ptr,
	                         const uint8_t* const b_ptr,
	                         int rgb_step,
	                         int rgb_bit_depth,
	                         int pic_width,
	                         fixed_y_t* const dst) {
	  // Convert the rgb_step from a number of bytes to a number of uint8_t or
	  // uint16_t values depending the bit depth.
	  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
	  // The shift only depends on the bit depth: compute it once per row.
	  const int shift = GetPrecisionShift(rgb_bit_depth);
	  const int w = (pic_width + 1) & ~1;
	  int i;
	  if (rgb_bit_depth == 8) {
	    for (i = 0; i < pic_width; ++i) {
	      const int off = i * step;
	      dst[i + 0 * w] = (fixed_y_t)Shift(r_ptr[off], shift);
	      dst[i + 1 * w] = (fixed_y_t)Shift(g_ptr[off], shift);
	      dst[i + 2 * w] = (fixed_y_t)Shift(b_ptr[off], shift);
	    }
	  } else {
	    // The input is read-only: keep the 'const' qualifier when reinterpreting
	    // the buffers as 16-bit samples.
	    const uint16_t* const r16 = (const uint16_t*)r_ptr;
	    const uint16_t* const g16 = (const uint16_t*)g_ptr;
	    const uint16_t* const b16 = (const uint16_t*)b_ptr;
	    for (i = 0; i < pic_width; ++i) {
	      const int off = i * step;
	      dst[i + 0 * w] = (fixed_y_t)Shift(r16[off], shift);
	      dst[i + 1 * w] = (fixed_y_t)Shift(g16[off], shift);
	      dst[i + 2 * w] = (fixed_y_t)Shift(b16[off], shift);
	    }
	  }
	  if (pic_width & 1) {  // replicate rightmost pixel
	    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
	    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
	    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
	  }
	}

	// Fills two output rows ('out1' and 'out2', three planes of 'w' samples each)
	// from two rows of 'best_y' (at offsets 0 and 'w') and three rows of
	// subsampled data: 'cur_uv' is combined with 'prev_uv' for 'out1' and with
	// 'next_uv' for 'out2'. The first and last columns go through Filter2(), the
	// ones in between through SharpYuvFilterRow().
	static void InterpolateTwoRows(const fixed_y_t* const best_y,
	                               const fixed_t* prev_uv,
	                               const fixed_t* cur_uv,
	                               const fixed_t* next_uv,
	                               int w,
	                               fixed_y_t* out1,
	                               fixed_y_t* out2,
	                               int rgb_bit_depth) {
	  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
	  const int uv_w = w >> 1;
	  const int len = (w - 1) >> 1;   // length to filter
	  const int last = w - 1;         // last column of the output rows
	  const int uv_last = uv_w - 1;   // last column of the subsampled rows
	  const fixed_y_t* const y_top = best_y;
	  const fixed_y_t* const y_bottom = best_y + w;
	  int channel;
	  // process each R/G/B segments in turn
	  for (channel = 0; channel < 3; ++channel) {
	    // special boundary case for i==0
	    out1[0] = Filter2(cur_uv[0], prev_uv[0], y_top[0], bit_depth);
	    out2[0] = Filter2(cur_uv[0], next_uv[0], y_bottom[0], bit_depth);

	    SharpYuvFilterRow(cur_uv, prev_uv, len, y_top + 1, out1 + 1, bit_depth);
	    SharpYuvFilterRow(cur_uv, next_uv, len, y_bottom + 1, out2 + 1,
	                      bit_depth);

	    // special boundary case for i == w - 1 when w is even
	    if ((w & 1) == 0) {
	      out1[last] = Filter2(cur_uv[uv_last], prev_uv[uv_last], y_top[last],
	                           bit_depth);
	      out2[last] = Filter2(cur_uv[uv_last], next_uv[uv_last], y_bottom[last],
	                           bit_depth);
	    }
	    // move on to the next plane
	    out1 += w;
	    out2 += w;
	    prev_uv += uv_w;
	    cur_uv  += uv_w;
	    next_uv += uv_w;
	  }
	}

	// Returns (coeffs[0] * r + coeffs[1] * g + coeffs[2] * b + coeffs[3]),
	// rounded and shifted right by (YUV_FIX + sfix) bits.
	static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
	                                         const int coeffs[4], int sfix) {
	  const int total_shift = YUV_FIX + sfix;
	  const int rounder = 1 << (total_shift - 1);
	  int acc = coeffs[0] * r;
	  acc += coeffs[1] * g;
	  acc += coeffs[2] * b;
	  acc += coeffs[3];
	  acc += rounder;
	  return acc >> total_shift;
	}

	// Writes the final Y, U and V planes from the working buffers.
	// 'best_y' holds one W value per pixel (rows of 'w' samples) and 'best_uv'
	// holds, for every pair of rows, three planes of 'uv_w' samples. Output
	// samples are stored as uint8_t when yuv_bit_depth <= 8 and as uint16_t
	// otherwise; strides are applied to the byte pointers. Always returns 1.
	static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
	                            uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
	                            int u_stride, uint8_t* v_ptr, int v_stride,
	                            int rgb_bit_depth,
	                            int yuv_bit_depth, int width, int height,
	                            const SharpYuvConversionMatrix* yuv_matrix) {
	  const int w = (width + 1) & ~1;
	  const int h = (height + 1) & ~1;
	  const int uv_w = w >> 1;
	  const int uv_h = h >> 1;
	  const int sfix = GetPrecisionShift(rgb_bit_depth);
	  const int yuv_max = (1 << yuv_bit_depth) - 1;
	  const int is_8b = (yuv_bit_depth <= 8);
	  const fixed_t* uv_row;
	  int x, row;

	  // Luma plane: one output sample per input pixel.
	  uv_row = best_uv;
	  for (row = 0; row < height; ++row) {
	    for (x = 0; x < width; ++x) {
	      const int uv_x = x >> 1;
	      const int W = best_y[x];
	      const int r = uv_row[uv_x + 0 * uv_w] + W;
	      const int g = uv_row[uv_x + 1 * uv_w] + W;
	      const int b = uv_row[uv_x + 2 * uv_w] + W;
	      const int luma =
	          RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
	      if (is_8b) {
	        y_ptr[x] = clip_8b(luma);
	      } else {
	        ((uint16_t*)y_ptr)[x] = clip(luma, yuv_max);
	      }
	    }
	    best_y += w;
	    // The subsampled row only advances after every odd luma row.
	    if (row & 1) uv_row += 3 * uv_w;
	    y_ptr += y_stride;
	  }

	  // Chroma planes: one output sample per subsampled position.
	  uv_row = best_uv;
	  for (row = 0; row < uv_h; ++row) {
	    for (x = 0; x < uv_w; ++x) {
	      // Note r, g and b values here are off by W, but a constant offset on all
	      // 3 components doesn't change the value of u and v with a YCbCr matrix.
	      const int r = uv_row[x + 0 * uv_w];
	      const int g = uv_row[x + 1 * uv_w];
	      const int b = uv_row[x + 2 * uv_w];
	      const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
	      const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
	      if (is_8b) {
	        u_ptr[x] = clip_8b(u);
	        v_ptr[x] = clip_8b(v);
	      } else {
	        ((uint16_t*)u_ptr)[x] = clip(u, yuv_max);
	        ((uint16_t*)v_ptr)[x] = clip(v, yuv_max);
	      }
	    }
	    uv_row += 3 * uv_w;
	    u_ptr += u_stride;
	    v_ptr += v_stride;
	  }
	  return 1;
	}

	//------------------------------------------------------------------------------
	// Main function

	// Allocates 'nmemb' elements of 'size' bytes each.
	// Returns NULL if the total byte count overflows 64 bits, does not fit in
	// size_t, or if malloc() fails. The returned memory must be released with
	// free().
	static void* SafeMalloc(uint64_t nmemb, size_t size) {
	  uint64_t total_size;
	  // Reject the request if nmemb * size would wrap around in 64 bits: the
	  // wrapped value could otherwise pass the size_t check below.
	  if (size != 0 && nmemb > (~(uint64_t)0) / size) return NULL;
	  total_size = nmemb * (uint64_t)size;
	  if (total_size != (size_t)total_size) return NULL;
	  return malloc((size_t)total_size);
	}

	// Allocates an array of (W * H) elements of type T with SafeMalloc().
	// The element count is computed in 64 bits so that large dimensions cannot
	// overflow 'int' before SafeMalloc() gets a chance to check the size.
	#define SAFE_ALLOC(W, H, T) \
	  ((T*)SafeMalloc((uint64_t)(W) * (uint64_t)(H), sizeof(T)))

	static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
	const uint8_t* b_ptr, int rgb_step, int rgb_stride,
	int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
	uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
	int v_stride, int yuv_bit_depth, int width,
	int height,
	const SharpYuvConversionMatrix* yuv_matrix) {
	// we expand the right/bottom border if needed
	const int w = (width + 1) & ~1;
	const int h = (height + 1) & ~1;
	const int uv_w = w >> 1;
	const int uv_h = h >> 1;
	uint64_t prev_diff_y_sum = ~0;
	int j, iter;

	// TODO(skal): allocate one big memory chunk. But for now, it's easier
	// for valgrind debugging to have several chunks.
	fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
	fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
	fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
	fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
	fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
	fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
	fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
	fixed_y_t* best_y = best_y_base;
	fixed_y_t* target_y = target_y_base;
	fixed_t* best_uv = best_uv_base;
	fixed_t* target_uv = target_uv_base;
	const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
	int ok;
	assert(w > 0);
	assert(h > 0);

	if (best_y_base == NULL \|\| best_uv_base == NULL \|\|
	target_y_base == NULL \|\| target_uv_base == NULL \|\|
	best_rgb_y == NULL \|\| best_rgb_uv == NULL \|\|
	tmp_buffer == NULL) {
	ok = 0;
	goto End;
	}

	// Import RGB samples to W/RGB representation.
	for (j = 0; j < height; j += 2) {
	const int is_last_row = (j == height - 1);
	fixed_y_t* const src1 = tmp_buffer + 0 * w;
	fixed_y_t* const src2 = tmp_buffer + 3 * w;

	// prepare two rows of input
	ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
	src1);
	if (!is_last_row) {
	ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
	rgb_step, rgb_bit_depth, width, src2);
	} else {
	memcpy(src2, src1, 3 * w * sizeof(*src2));
	}
	StoreGray(src1, best_y + 0, w);
	StoreGray(src2, best_y + w, w);

	UpdateW(src1, target_y, w, rgb_bit_depth);
	UpdateW(src2, target_y + w, w, rgb_bit_depth);
	UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth);
	memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
	best_y += 2 * w;
	best_uv += 3 * uv_w;
	target_y += 2 * w;
	target_uv += 3 * uv_w;
	r_ptr += 2 * rgb_stride;
	g_ptr += 2 * rgb_stride;
	b_ptr += 2 * rgb_stride;
	}

	// Iterate and resolve clipping conflicts.
	for (iter = 0; iter < kNumIterations; ++iter) {
	const fixed_t* cur_uv = best_uv_base;
	const fixed_t* prev_uv = best_uv_base;
	uint64_t diff_y_sum = 0;

	best_y = best_y_base;
	best_uv = best_uv_base;
	target_y = target_y_base;
	target_uv = target_uv_base;
	for (j = 0; j < h; j += 2) {
	fixed_y_t* const src1 = tmp_buffer + 0 * w;
	fixed_y_t* const src2 = tmp_buffer + 3 * w;
	{
	const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
	InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
	src1, src2, rgb_bit_depth);
	prev_uv = cur_uv;
	cur_uv = next_uv;
	}

	UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth);
	UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth);
	UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth);

	// update two rows of Y and one row of RGB
	diff_y_sum +=
	SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
	rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
	SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);

	best_y += 2 * w;
	best_uv += 3 * uv_w;
	target_y += 2 * w;
	target_uv += 3 * uv_w;
	}
	// test exit condition
	if (iter > 0) {
	if (diff_y_sum < diff_y_threshold) break;
	if (diff_y_sum > prev_diff_y_sum) break;
	}
	prev_diff_y_sum = diff_y_sum;
	}

	// final reconstruction
	ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
	u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
	width, height, yuv_matrix);

	End:
	free(best_y_base);
	free(best_uv_base);
	free(target_y_base);
	free(target_uv_base);
	free(best_rgb_y);
	free(best_rgb_uv);
	free(tmp_buffer);
	return ok;
	}
	#undef SAFE_ALLOC

	// Hidden exported init function.
	// By default SharpYuvConvert calls it with NULL. If needed, users can declare
	// it as extern and call it with a VP8CPUInfo function.
	// SharpYuvInitDsp() is called whenever 'cpu_info_func' differs from the value
	// recorded by the previous call; SharpYuvInitGammaTables() only on the very
	// first call.
	extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
	void SharpYuvInit(VP8CPUInfo cpu_info_func) {
	  // Initially holds its own address, which serves as a "never set" marker.
	  static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
	      (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
	  const VP8CPUInfo never_set = (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
	  const int already_initialized = (sharpyuv_last_cpuinfo_used != never_set);

	  // NULL means "use the defaults": nothing to redo once initialized.
	  if (already_initialized && cpu_info_func == NULL) return;
	  // Same function as last time: nothing to redo either.
	  if (cpu_info_func == sharpyuv_last_cpuinfo_used) return;

	  SharpYuvInitDsp(cpu_info_func);
	  if (!already_initialized) {
	    SharpYuvInitGammaTables();
	  }

	  sharpyuv_last_cpuinfo_used = cpu_info_func;
	}

	// Public entry point: validates the arguments, builds a conversion matrix
	// scaled for the requested bit depths and runs DoSharpArgbToYuv().
	//
	// Returns 0 if an argument is rejected:
	//  - width or height is < 1 or equal to INT_MAX,
	//  - any of the sample pointers or 'yuv_matrix' is NULL,
	//  - rgb_bit_depth is not 8, 10, 12 or 16,
	//  - yuv_bit_depth is not 8, 10 or 12,
	//  - a step/stride is odd while the matching buffer holds uint16_t samples.
	// Otherwise returns the result of DoSharpArgbToYuv().
	int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
	                    const void* b_ptr, int rgb_step, int rgb_stride,
	                    int rgb_bit_depth, void* y_ptr, int y_stride,
	                    void* u_ptr, int u_stride, void* v_ptr,
	                    int v_stride, int yuv_bit_depth, int width,
	                    int height, const SharpYuvConversionMatrix* yuv_matrix) {
	  SharpYuvConversionMatrix scaled_matrix;
	  int sfix;

	  // INT_MAX is rejected because dimensions get rounded up to an even value.
	  if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
	      r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
	      u_ptr == NULL || v_ptr == NULL || yuv_matrix == NULL) {
	    return 0;
	  }
	  if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
	      rgb_bit_depth != 16) {
	    return 0;
	  }
	  if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
	    return 0;
	  }
	  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
	    // Step/stride should be even for uint16_t buffers.
	    return 0;
	  }
	  if (yuv_bit_depth > 8 &&
	      (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
	    // Stride should be even for uint16_t buffers.
	    return 0;
	  }
	  SharpYuvInit(NULL);

	  // Quantities derived from the bit depths are only computed from here on,
	  // once the depths are known to be valid shift counts.
	  sfix = GetPrecisionShift(rgb_bit_depth);

	  // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
	  // rgb->yuv conversion matrix.
	  if (rgb_bit_depth == yuv_bit_depth) {
	    memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
	  } else {
	    const int rgb_max = (1 << rgb_bit_depth) - 1;
	    const int rgb_round = 1 << (rgb_bit_depth - 1);
	    const int yuv_max = (1 << yuv_bit_depth) - 1;
	    int i;
	    for (i = 0; i < 3; ++i) {
	      scaled_matrix.rgb_to_y[i] =
	          (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
	      scaled_matrix.rgb_to_u[i] =
	          (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
	      scaled_matrix.rgb_to_v[i] =
	          (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
	    }
	  }
	  // Also incorporate precision change scaling.
	  scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
	  scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
	  scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);

	  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
	                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
	                          v_ptr, v_stride, yuv_bit_depth, width, height,
	                          &scaled_matrix);
	}

	//------------------------------------------------------------------------------